From f9f4fd5e75bea9e90468077acb7bc03f79fc3e0f Mon Sep 17 00:00:00 2001 From: Madeesh Kannan Date: Tue, 10 May 2022 20:28:05 +0200 Subject: [PATCH 01/78] Move compatibility-related code into a separate `compat` module (#652) * Add `compat` module to encapsulate imports of optional 3rd party frameworks/libraries * Replace references to compat code in `.util` with references to `.compat` Remove `cupy_ops.has_cupy`, `backends.has_cupy`, and `api.has_cupy` * Update example notebook * `util.set_active_gpu`: Return `None` if GPU is unavailable * `util`: Import tensorflow and mxnet with shorthand names Fix markdown formatting * `api`: Re-export `has_cupy` from `compat` * `backends`: Preserve `has_cupy` export for bwd-compat, remove superfluous imports * Revert "Update example notebook" This reverts commit 9f068a4fd71e0daf5888d07e172cdea8ffb31a23. * `util`: Revert changes to `set_active_gpu`, raise an error if no GPU is detected Clarify docs --- thinc/api.py | 3 +- thinc/backends/__init__.py | 9 +- thinc/backends/_cupy_allocators.py | 25 ++---- thinc/backends/_custom_kernels.py | 12 +-- thinc/backends/cupy_ops.py | 16 +--- thinc/compat.py | 70 ++++++++++++++++ thinc/layers/tensorflowwrapper.py | 6 +- thinc/shims/mxnet.py | 12 +-- thinc/shims/pytorch.py | 14 +--- thinc/shims/pytorch_grad_scaler.py | 8 +- thinc/shims/tensorflow.py | 17 +--- thinc/tests/backends/test_ops.py | 3 +- thinc/tests/layers/test_layers_api.py | 2 +- thinc/tests/layers/test_lstm.py | 2 +- thinc/tests/layers/test_mnist.py | 3 +- thinc/tests/layers/test_mxnet_wrapper.py | 3 +- thinc/tests/layers/test_pytorch_wrapper.py | 2 +- thinc/tests/layers/test_tensorflow_wrapper.py | 3 +- thinc/tests/model/test_model.py | 7 +- thinc/tests/regression/test_issue564.py | 2 +- thinc/tests/shims/test_pytorch_grad_scaler.py | 13 +-- thinc/types.py | 7 +- thinc/util.py | 84 ++++--------------- website/docs/api-util.md | 11 +-- 24 files changed, 141 insertions(+), 193 deletions(-) create mode 100644 thinc/compat.py diff --git a/thinc/api.py b/thinc/api.py index 8e1ea4bf9..204f39b4e 100644 --- a/thinc/api.py +++ b/thinc/api.py @@ -16,8 +16,9 @@ from .util import DataValidationError, data_validation from .util import to_categorical, get_width, get_array_module, to_numpy from .util import torch2xp, xp2torch, tensorflow2xp, xp2tensorflow, mxnet2xp, xp2mxnet +from .compat import has_cupy from .backends import get_ops, set_current_ops, get_current_ops, use_ops -from .backends import Ops, CupyOps, NumpyOps, has_cupy, set_gpu_allocator +from .backends import Ops, CupyOps, NumpyOps, set_gpu_allocator from .backends import use_pytorch_for_gpu_memory, use_tensorflow_for_gpu_memory from .layers import Dropout, Embed, expand_window, HashEmbed, LayerNorm, Linear diff --git a/thinc/backends/__init__.py b/thinc/backends/__init__.py index ba4daaa8a..d1219796d 100644 --- a/thinc/backends/__init__.py +++ b/thinc/backends/__init__.py @@ -5,13 +5,14 @@ import threading from .ops import Ops -from .cupy_ops import CupyOps, has_cupy +from .cupy_ops import CupyOps from .numpy_ops import NumpyOps from ._cupy_allocators import cupy_tensorflow_allocator, cupy_pytorch_allocator from ._param_server import ParamServer from ..util import assert_tensorflow_installed, assert_pytorch_installed from ..util import is_cupy_array, set_torch_tensor_type_for_ops, require_cpu from ..
import registry +from ..compat import cupy, has_cupy context_ops: ContextVar[Optional[Ops]] = ContextVar("context_ops", default=None) @@ -46,8 +47,6 @@ def use_pytorch_for_gpu_memory() -> None: # pragma: no cover We'd like to support routing Tensorflow memory allocation via PyTorch as well (or vice versa), but do not currently have an implementation for it. """ - import cupy.cuda - assert_pytorch_installed() pools = context_pools.get() if "pytorch" not in pools: @@ -65,8 +64,6 @@ def use_tensorflow_for_gpu_memory() -> None: # pragma: no cover We'd like to support routing PyTorch memory allocation via Tensorflow as well (or vice versa), but do not currently have an implementation for it. """ - import cupy.cuda - assert_tensorflow_installed() pools = context_pools.get() if "tensorflow" not in pools: @@ -94,7 +91,7 @@ def get_ops(name: str, **kwargs) -> Ops: cls: Optional[Callable[..., Ops]] = None if name == "cpu": - _import_extra_cpu_backends() + _import_extra_cpu_backends() cls = ops_by_name.get("numpy") cls = ops_by_name.get("apple", cls) cls = ops_by_name.get("bigendian", cls) diff --git a/thinc/backends/_cupy_allocators.py b/thinc/backends/_cupy_allocators.py index 7bcb8bd5c..a9f000c6c 100644 --- a/thinc/backends/_cupy_allocators.py +++ b/thinc/backends/_cupy_allocators.py @@ -2,22 +2,7 @@ from ..types import ArrayXd from ..util import tensorflow2xp - -try: - import tensorflow -except ImportError: - pass - -try: - import torch -except ImportError: - pass - -try: - from cupy.cuda.memory import MemoryPointer - from cupy.cuda.memory import UnownedMemory -except ImportError: - pass +from ..compat import torch, cupy, tensorflow def cupy_tensorflow_allocator(size_in_bytes: int): @@ -32,9 +17,9 @@ def cupy_tensorflow_allocator(size_in_bytes: int): cupy_array = cast(ArrayXd, tensorflow2xp(tensor)) address = int(cupy_array.data) # cupy has a neat class to help us here. Otherwise it will try to free. - memory = UnownedMemory(address, size_in_bytes, cupy_array) + memory = cupy.cuda.memory.UnownedMemory(address, size_in_bytes, cupy_array) # Now return a new memory pointer. - return MemoryPointer(memory, 0) + return cupy.cuda.memory.MemoryPointer(memory, 0) def cupy_pytorch_allocator(size_in_bytes: int): @@ -53,6 +38,6 @@ def cupy_pytorch_allocator(size_in_bytes: int): # cupy has a neat class to help us here. Otherwise it will try to free. # I think this is a private API? It's not in the types. address = torch_tensor.data_ptr() # type: ignore - memory = UnownedMemory(address, size_in_bytes, torch_tensor) + memory = cupy.cuda.memory.UnownedMemory(address, size_in_bytes, torch_tensor) # Now return a new memory pointer. 
- return MemoryPointer(memory, 0) + return cupy.cuda.memory.MemoryPointer(memory, 0) diff --git a/thinc/backends/_custom_kernels.py b/thinc/backends/_custom_kernels.py index d5456114a..8ea13bb57 100644 --- a/thinc/backends/_custom_kernels.py +++ b/thinc/backends/_custom_kernels.py @@ -2,11 +2,7 @@ import re from pathlib import Path from collections import defaultdict - -try: - import cupy -except ImportError: - cupy = None +from ..compat import cupy, has_cupy_gpu PWD = Path(__file__).parent @@ -55,7 +51,7 @@ cupy.RawModule( code=KERNELS_SRC, options=("--std=c++11",), name_expressions=KERNELS_LIST ) - if cupy is not None + if has_cupy_gpu else None ) @@ -70,7 +66,7 @@ def _get_kernel(name): def compile_mmh(src): - if cupy is None: + if not has_cupy_gpu: return None return cupy.RawKernel(src, "hash_data") @@ -672,7 +668,7 @@ def _check_which_maxout(which, B: int, I: int, P: int): "true", "within_range", ) - if cupy is not None + if has_cupy_gpu else None ) diff --git a/thinc/backends/cupy_ops.py b/thinc/backends/cupy_ops.py index 709775ddd..ec70cf72e 100644 --- a/thinc/backends/cupy_ops.py +++ b/thinc/backends/cupy_ops.py @@ -1,19 +1,4 @@ import numpy - -try: - import cupy - import cupyx - import cupy.cuda - from cupy.cuda.compiler import compile_with_cache # noqa: F401 - - has_cupy = True - - # We no longer have to set up the memory pool, fortunately. -except ImportError: - cupy = None - cupyx = None - has_cupy = False - from .. import registry from .ops import Ops from .numpy_ops import NumpyOps @@ -22,6 +7,7 @@ from ..util import torch2xp, tensorflow2xp, mxnet2xp from ..util import is_cupy_array from ..util import is_torch_gpu_array, is_tensorflow_gpu_array, is_mxnet_gpu_array +from ..compat import cupy, cupyx @registry.ops("CupyOps") diff --git a/thinc/compat.py b/thinc/compat.py new file mode 100644 index 000000000..467698df1 --- /dev/null +++ b/thinc/compat.py @@ -0,0 +1,70 @@ +from packaging.version import Version + +try: # pragma: no cover + import cupy + import cupyx + + has_cupy = True + cupy_version = Version(cupy.__version__) + try: + cupy.cuda.runtime.getDeviceCount() + has_cupy_gpu = True + except cupy.cuda.runtime.CUDARuntimeError: + has_cupy_gpu = False + + if cupy_version.major >= 10: + # fromDlpack was deprecated in v10.0.0. 
+ cupy_from_dlpack = cupy.from_dlpack + else: + cupy_from_dlpack = cupy.fromDlpack +except (ImportError, AttributeError): + cupy = None + cupyx = None + cupy_version = Version("0.0.0") + has_cupy = False + cupy_from_dlpack = None + has_cupy_gpu = False + + +try: # pragma: no cover + import torch.utils.dlpack + import torch + + has_torch = True + has_torch_gpu = torch.cuda.device_count() != 0 + torch_version = Version(str(torch.__version__)) + has_torch_amp = ( + torch_version >= Version("1.9.0") + and not torch.cuda.amp.common.amp_definitely_not_available() + ) +except ImportError: # pragma: no cover + torch = None + has_torch = False + has_torch_gpu = False + has_torch_amp = False + torch_version = Version("0.0.0") + +try: # pragma: no cover + import tensorflow.experimental.dlpack + import tensorflow + + has_tensorflow = True + has_tensorflow_gpu = len(tensorflow.config.get_visible_devices("GPU")) > 0 +except ImportError: # pragma: no cover + tensorflow = None + has_tensorflow = False + has_tensorflow_gpu = False + + +try: # pragma: no cover + import mxnet + + has_mxnet = True +except ImportError: # pragma: no cover + mxnet = None + has_mxnet = False + +try: + import h5py +except ImportError: # pragma: no cover + h5py = None diff --git a/thinc/layers/tensorflowwrapper.py b/thinc/layers/tensorflowwrapper.py index dc1a48752..7e166ea50 100644 --- a/thinc/layers/tensorflowwrapper.py +++ b/thinc/layers/tensorflowwrapper.py @@ -7,11 +7,7 @@ from ..util import xp2tensorflow, tensorflow2xp, assert_tensorflow_installed from ..util import is_tensorflow_array, convert_recursive, is_xp_array from ..types import ArrayXd, ArgsKwargs - -try: - import tensorflow as tf -except ImportError: # pragma: no cover - pass +from ..compat import tensorflow as tf InT = TypeVar("InT") OutT = TypeVar("OutT") diff --git a/thinc/shims/mxnet.py b/thinc/shims/mxnet.py index 0357be685..3962a2ef5 100644 --- a/thinc/shims/mxnet.py +++ b/thinc/shims/mxnet.py @@ -2,18 +2,12 @@ import srsly import copy -try: - import mxnet.autograd - import mxnet.optimizer - import mxnet as mx -except ImportError: # pragma: no cover - pass - from ..util import mxnet2xp, convert_recursive, make_tempfile, xp2mxnet from ..util import get_array_module from ..optimizers import Optimizer from ..types import ArgsKwargs, FloatsXd from .shim import Shim +from ..compat import mxnet as mx class MXNetShim(Shim): @@ -33,7 +27,7 @@ def predict(self, inputs: ArgsKwargs) -> Any: evaluation mode. 
""" mx.autograd.set_training(train_mode=False) - with mxnet.autograd.pause(): + with mx.autograd.pause(): outputs = self._model(*inputs.args, **inputs.kwargs) mx.autograd.set_training(train_mode=True) return outputs @@ -50,7 +44,7 @@ def begin_update(self, inputs: ArgsKwargs): def backprop(grads): mx.autograd.set_recording(False) - mxnet.autograd.backward(*grads.args, **grads.kwargs) + mx.autograd.backward(*grads.args, **grads.kwargs) return convert_recursive( lambda x: hasattr(x, "grad"), lambda x: x.grad, inputs ) diff --git a/thinc/shims/pytorch.py b/thinc/shims/pytorch.py index 03a08da83..01df1d5f5 100644 --- a/thinc/shims/pytorch.py +++ b/thinc/shims/pytorch.py @@ -4,16 +4,8 @@ import itertools import srsly -try: - import torch.autograd - from torch.cuda import amp - import torch.optim - import torch -except ImportError: # pragma: no cover - pass - from ..util import torch2xp, xp2torch, convert_recursive, iterate_recursive -from ..util import has_torch_amp +from ..compat import torch from ..backends import get_current_ops, context_pools, CupyOps from ..backends import set_gpu_allocator from ..optimizers import Optimizer @@ -73,7 +65,7 @@ def predict(self, inputs: ArgsKwargs) -> Any: """ self._model.eval() with torch.no_grad(): - with amp.autocast(self._mixed_precision): + with torch.cuda.amp.autocast(self._mixed_precision): outputs = self._model(*inputs.args, **inputs.kwargs) self._model.train() return outputs @@ -87,7 +79,7 @@ def begin_update(self, inputs: ArgsKwargs): self._model.train() # Note: mixed-precision autocast must not be applied to backprop. - with amp.autocast(self._mixed_precision): + with torch.cuda.amp.autocast(self._mixed_precision): output = self._model(*inputs.args, **inputs.kwargs) def backprop(grads): diff --git a/thinc/shims/pytorch_grad_scaler.py b/thinc/shims/pytorch_grad_scaler.py index 9fc77209e..999ed0047 100644 --- a/thinc/shims/pytorch_grad_scaler.py +++ b/thinc/shims/pytorch_grad_scaler.py @@ -1,11 +1,7 @@ from typing import Dict, Iterable, List, Union, cast -from ..util import has_torch_amp, is_torch_array - -try: - import torch -except ImportError: # pragma: no cover - pass +from ..compat import has_torch_amp, torch +from ..util import is_torch_array class PyTorchGradScaler: diff --git a/thinc/shims/tensorflow.py b/thinc/shims/tensorflow.py index f226c48df..d630d86f9 100644 --- a/thinc/shims/tensorflow.py +++ b/thinc/shims/tensorflow.py @@ -10,21 +10,8 @@ from ..types import ArgsKwargs, ArrayXd from ..util import get_array_module from .shim import Shim - -try: - import cupy -except ImportError: - cupy = None - -try: - import tensorflow as tf -except ImportError: # pragma: no cover - pass - -try: - import h5py -except ImportError: # pragma: no cover - pass +from ..compat import tensorflow as tf +from ..compat import cupy, h5py keras_model_fns = catalogue.create("thinc", "keras", entry_points=True) diff --git a/thinc/tests/backends/test_ops.py b/thinc/tests/backends/test_ops.py index 7ab5496be..cdc319b91 100644 --- a/thinc/tests/backends/test_ops.py +++ b/thinc/tests/backends/test_ops.py @@ -8,7 +8,8 @@ from packaging.version import Version from thinc.api import NumpyOps, CupyOps, Ops, get_ops from thinc.api import get_current_ops, use_ops -from thinc.util import has_torch, torch2xp, xp2torch, torch_version, gpu_is_available +from thinc.util import torch2xp, xp2torch, gpu_is_available +from thinc.compat import has_torch, torch_version from thinc.api import fix_random_seed from thinc.api import LSTM from thinc.types import Floats2d diff --git 
a/thinc/tests/layers/test_layers_api.py b/thinc/tests/layers/test_layers_api.py index e612cf318..5c922f074 100644 --- a/thinc/tests/layers/test_layers_api.py +++ b/thinc/tests/layers/test_layers_api.py @@ -5,7 +5,7 @@ from thinc.backends import NumpyOps from thinc.util import data_validation, get_width from thinc.types import Ragged, Padded, Array2d, Floats2d, FloatsXd, Shape -from thinc.util import has_torch +from thinc.compat import has_torch import numpy import pytest diff --git a/thinc/tests/layers/test_lstm.py b/thinc/tests/layers/test_lstm.py index 850f569b8..208ffb58b 100644 --- a/thinc/tests/layers/test_lstm.py +++ b/thinc/tests/layers/test_lstm.py @@ -2,7 +2,7 @@ import timeit from thinc.api import NumpyOps, LSTM, PyTorchLSTM, with_padded, fix_random_seed from thinc.api import Ops -from thinc.util import has_torch +from thinc.compat import has_torch import pytest diff --git a/thinc/tests/layers/test_mnist.py b/thinc/tests/layers/test_mnist.py index 0ed0dfd64..b67a7fd57 100644 --- a/thinc/tests/layers/test_mnist.py +++ b/thinc/tests/layers/test_mnist.py @@ -1,13 +1,14 @@ import pytest from thinc.api import Relu, Softmax, chain, clone, Adam from thinc.api import PyTorchWrapper, TensorFlowWrapper -from thinc.util import has_torch, has_tensorflow +from thinc.compat import has_torch, has_tensorflow @pytest.fixture(scope="module") def mnist(limit=5000): pytest.importorskip("ml_datasets") import ml_datasets + (train_X, train_Y), (dev_X, dev_Y) = ml_datasets.mnist() return (train_X[:limit], train_Y[:limit]), (dev_X[:limit], dev_Y[:limit]) diff --git a/thinc/tests/layers/test_mxnet_wrapper.py b/thinc/tests/layers/test_mxnet_wrapper.py index 438b28f33..78d107dd3 100644 --- a/thinc/tests/layers/test_mxnet_wrapper.py +++ b/thinc/tests/layers/test_mxnet_wrapper.py @@ -5,7 +5,8 @@ from thinc.api import Adam, ArgsKwargs, Model, Ops, MXNetWrapper from thinc.api import get_current_ops, mxnet2xp, xp2mxnet from thinc.types import Array2d, Array1d, IntsXd -from thinc.util import has_cupy, has_mxnet, to_categorical +from thinc.compat import has_cupy, has_mxnet +from thinc.util import to_categorical from ..util import check_input_converters, make_tempdir diff --git a/thinc/tests/layers/test_pytorch_wrapper.py b/thinc/tests/layers/test_pytorch_wrapper.py index ce3b6ae8d..1b37c29c9 100644 --- a/thinc/tests/layers/test_pytorch_wrapper.py +++ b/thinc/tests/layers/test_pytorch_wrapper.py @@ -3,7 +3,7 @@ from thinc.api import chain, get_current_ops, Relu from thinc.backends import context_pools from thinc.shims.pytorch_grad_scaler import PyTorchGradScaler -from thinc.util import has_torch, has_torch_amp, has_torch_gpu +from thinc.compat import has_torch, has_torch_amp, has_torch_gpu import numpy import pytest diff --git a/thinc/tests/layers/test_tensorflow_wrapper.py b/thinc/tests/layers/test_tensorflow_wrapper.py index 1c10b8242..ff254ffc0 100644 --- a/thinc/tests/layers/test_tensorflow_wrapper.py +++ b/thinc/tests/layers/test_tensorflow_wrapper.py @@ -2,7 +2,8 @@ import pytest from thinc.api import Adam, ArgsKwargs, Linear, Model, TensorFlowWrapper from thinc.api import get_current_ops, keras_subclass, tensorflow2xp, xp2tensorflow -from thinc.util import gpu_is_available, has_tensorflow, to_categorical +from thinc.util import gpu_is_available, to_categorical +from thinc.compat import has_tensorflow from ..util import check_input_converters, make_tempdir diff --git a/thinc/tests/model/test_model.py b/thinc/tests/model/test_model.py index c552e3e4c..3db857987 100644 --- a/thinc/tests/model/test_model.py +++ 
b/thinc/tests/model/test_model.py @@ -4,9 +4,10 @@ import time from thinc.api import Adam, CupyOps, Dropout, Linear, Model, Relu from thinc.api import Shim, Softmax, chain, change_attr_values -from thinc.api import concatenate, has_cupy, set_dropout_rate +from thinc.api import concatenate, set_dropout_rate from thinc.api import use_ops, with_debug, wrap_model_recursive from thinc.util import gpu_is_available +from thinc.compat import has_cupy import numpy from ..util import make_tempdir @@ -349,10 +350,10 @@ def test_all_operators(op): with pytest.raises(TypeError): value = m1 % m2 if op == "**": - value = m1 ** m2 + value = m1**m2 else: with pytest.raises(TypeError): - value = m1 ** m2 + value = m1**m2 if op == "<<": value = m1 << m2 else: diff --git a/thinc/tests/regression/test_issue564.py b/thinc/tests/regression/test_issue564.py index 895d25cfa..bd046c501 100644 --- a/thinc/tests/regression/test_issue564.py +++ b/thinc/tests/regression/test_issue564.py @@ -1,7 +1,7 @@ import pytest from thinc.api import CupyOps -from thinc.util import has_torch, has_torch_gpu +from thinc.compat import has_torch, has_torch_gpu @pytest.mark.skipif(not has_torch, reason="needs PyTorch") diff --git a/thinc/tests/shims/test_pytorch_grad_scaler.py b/thinc/tests/shims/test_pytorch_grad_scaler.py index 26eab9291..0fd709fdc 100644 --- a/thinc/tests/shims/test_pytorch_grad_scaler.py +++ b/thinc/tests/shims/test_pytorch_grad_scaler.py @@ -2,7 +2,7 @@ from hypothesis import given, settings from hypothesis.strategies import lists, one_of, tuples -from thinc.util import has_torch, has_torch_amp, has_torch_gpu +from thinc.compat import has_torch, has_torch_amp, has_torch_gpu, torch from thinc.util import is_torch_array from thinc.api import PyTorchGradScaler @@ -10,15 +10,7 @@ def tensors(): - # This function is not used without Torch + CUDA, - # but we have to do some wrapping to avoid import - # failures. 
- try: - import torch - - return ndarrays().map(lambda a: torch.tensor(a).cuda()) - except ImportError: - pass + return ndarrays().map(lambda a: torch.tensor(a).cuda()) @pytest.mark.skipif(not has_torch, reason="needs PyTorch") @@ -97,6 +89,7 @@ def test_grad_scaler(): ) def test_raises_on_old_pytorch(): import torch + scaler = PyTorchGradScaler(enabled=True) with pytest.raises(ValueError, match=r"not supported.*1.9.0"): scaler.scale([torch.tensor([1.0], device="cpu")]) diff --git a/thinc/types.py b/thinc/types.py index 74498d159..04b81946a 100644 --- a/thinc/types.py +++ b/thinc/types.py @@ -4,12 +4,11 @@ from dataclasses import dataclass import numpy import sys +from .compat import has_cupy, cupy -try: - import cupy - +if has_cupy: get_array_module = cupy.get_array_module -except (ImportError, AttributeError): +else: get_array_module = lambda obj: numpy # Use typing_extensions for Python versions < 3.8 diff --git a/thinc/util.py b/thinc/util.py index dacfb09c2..16f0f0196 100644 --- a/thinc/util.py +++ b/thinc/util.py @@ -13,54 +13,12 @@ import contextlib from contextvars import ContextVar from dataclasses import dataclass +from .compat import has_cupy, has_mxnet, has_torch, has_tensorflow +from .compat import has_cupy_gpu, has_torch_gpu +from .compat import torch, cupy, tensorflow as tf, mxnet as mx, cupy_from_dlpack DATA_VALIDATION: ContextVar[bool] = ContextVar("DATA_VALIDATION", default=False) -try: # pragma: no cover - import cupy - - has_cupy = True -except (ImportError, AttributeError): - cupy = None - has_cupy = False - - -try: # pragma: no cover - import torch - from torch import tensor - import torch.utils.dlpack - - has_torch = True - has_torch_gpu = torch.cuda.device_count() != 0 - torch_version = Version(str(torch.__version__)) - has_torch_amp = ( - torch_version >= Version("1.9.0") - and not torch.cuda.amp.common.amp_definitely_not_available() - ) -except ImportError: # pragma: no cover - has_torch = False - has_torch_gpu = False - has_torch_amp = False - torch_version = Version("0.0.0") - -try: # pragma: no cover - import tensorflow.experimental.dlpack - import tensorflow as tf - - has_tensorflow = True - has_tensorflow_gpu = len(tf.config.get_visible_devices("GPU")) > 0 -except ImportError: # pragma: no cover - has_tensorflow = False - has_tensorflow_gpu = False - - -try: # pragma: no cover - import mxnet as mx - - has_mxnet = True -except ImportError: # pragma: no cover - has_mxnet = False - from .types import ArrayXd, ArgsKwargs, Ragged, Padded, FloatsXd, IntsXd # noqa: E402 from . 
import types # noqa: E402 from typing import TYPE_CHECKING @@ -77,14 +35,7 @@ def get_array_module(arr): # pragma: no cover def gpu_is_available(): - if not has_cupy: - return False - - try: - cupy.cuda.runtime.getDeviceCount() - return True - except cupy.cuda.runtime.CUDARuntimeError: - return False + return has_cupy_gpu def fix_random_seed(seed: int = 0) -> None: # pragma: no cover @@ -95,7 +46,7 @@ def fix_random_seed(seed: int = 0) -> None: # pragma: no cover torch.manual_seed(seed) if has_cupy and gpu_is_available(): cupy.random.seed(seed) - if has_torch and torch.cuda.is_available(): + if has_torch and has_torch_gpu: torch.cuda.manual_seed_all(seed) torch.backends.cudnn.deterministic = True torch.backends.cudnn.benchmark = False @@ -174,17 +125,16 @@ def to_numpy(data): # pragma: no cover def set_active_gpu(gpu_id: int) -> "cupy.cuda.Device": # pragma: no cover """Set the current GPU device for cupy and torch (if available).""" - import cupy.cuda.device + if not gpu_is_available(): + raise ValueError("No GPU devices detected") device = cupy.cuda.device.Device(gpu_id) device.use() - try: - import torch + if has_torch_gpu: torch.cuda.set_device(gpu_id) torch.set_default_tensor_type("torch.cuda.FloatTensor") - except ImportError: - pass + return device @@ -203,7 +153,7 @@ def prefer_gpu(gpu_id: int = 0) -> bool: # pragma: no cover """Use GPU if it's available. Returns True if so, False otherwise.""" from .backends.cupy_ops import CupyOps - if CupyOps.xp is None: + if not gpu_is_available(): return False else: require_gpu(gpu_id=gpu_id) @@ -213,8 +163,8 @@ def prefer_gpu(gpu_id: int = 0) -> bool: # pragma: no cover def require_gpu(gpu_id: int = 0) -> bool: # pragma: no cover from .backends import set_current_ops, CupyOps - if CupyOps.xp is None: - raise ValueError("GPU is not accessible. 
Was the library installed correctly?") + if not gpu_is_available(): + raise ValueError("No GPU devices detected") set_current_ops(CupyOps()) set_active_gpu(gpu_id) @@ -386,7 +336,7 @@ def torch2xp( if isinstance(ops, NumpyOps): return torch_tensor.detach().cpu().numpy() else: - return cupy.fromDlpack(torch.utils.dlpack.to_dlpack(torch_tensor)) + return cupy_from_dlpack(torch.utils.dlpack.to_dlpack(torch_tensor)) else: if isinstance(ops, NumpyOps) or ops is None: return torch_tensor.detach().numpy() @@ -401,7 +351,7 @@ def xp2tensorflow( assert_tensorflow_installed() if hasattr(xp_tensor, "toDlpack"): dlpack_tensor = xp_tensor.toDlpack() # type: ignore - tf_tensor = tensorflow.experimental.dlpack.from_dlpack(dlpack_tensor) + tf_tensor = tf.experimental.dlpack.from_dlpack(dlpack_tensor) else: tf_tensor = tf.convert_to_tensor(xp_tensor) if as_variable: @@ -430,8 +380,8 @@ def tensorflow2xp( if isinstance(ops, NumpyOps): return tf_tensor.numpy() else: - dlpack_tensor = tensorflow.experimental.dlpack.to_dlpack(tf_tensor) - return cupy.fromDlpack(dlpack_tensor) + dlpack_tensor = tf.experimental.dlpack.to_dlpack(tf_tensor) + return cupy_from_dlpack(dlpack_tensor) else: if isinstance(ops, NumpyOps) or ops is None: return tf_tensor.numpy() @@ -465,7 +415,7 @@ def mxnet2xp( if isinstance(ops, NumpyOps): return mx_tensor.detach().asnumpy() else: - return cupy.fromDlpack(mx_tensor.to_dlpack_for_write()) + return cupy_from_dlpack(mx_tensor.to_dlpack_for_write()) else: if isinstance(ops, NumpyOps) or ops is None: return mx_tensor.detach().asnumpy() diff --git a/website/docs/api-util.md b/website/docs/api-util.md index e775fae78..158da5446 100644 --- a/website/docs/api-util.md +++ b/website/docs/api-util.md @@ -20,9 +20,9 @@ fix_random_seed(0) ### require_cpu {#require_cpu tag="function"} -Allocate data and perform operations on CPU. -If data has already been allocated on GPU, it will not be moved. -Ideally, this function should be called right after importing Thinc. +Allocate data and perform operations on CPU. If data has already been allocated +on GPU, it will not be moved. Ideally, this function should be called right +after importing Thinc. ```python ### Example @@ -69,7 +69,8 @@ require_gpu() ### set_active_gpu {#set_active_gpu tag="function"} -Set the current GPU device for `cupy` and `torch` (if available). +Set the current GPU device for `cupy` (and for `torch`, if installed) and return +a `cupy` device. Will raise an error if no GPU is available. ```python ### Example @@ -165,7 +166,7 @@ Convert a `numpy` or `cupy` tensor to a TensorFlow tensor. | --------------- | -------------------------- | ----------------------------------------------------- | | `xp_tensor` | ArrayXd | The tensor to convert. | | `requires_grad` | bool | Whether to backpropagate through the variable. | -| `as_variable` | bool | Convert the result to a `tensorflow.Variable` object. | | +| `as_variable` | bool | Convert the result to a `tensorflow.Variable` object. | | **RETURNS** | tensorflow.Tensor | The converted tensor. | ### tensorflow2xp {#tensorflow2xp tag="function"} From ffd998c7bc5d81926532f20b51f3e0789e061398 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Dani=C3=ABl=20de=20Kok?= Date: Tue, 10 May 2022 20:59:33 +0200 Subject: [PATCH 02/78] NumpyOps: Add a method to get a table of C BLAS functions (#643) * NumpyOps: Add a method to get a table of C BLAS functions This table can be used for downstream `cdef nogil` functions that need to use a BLAS function from the BLAS implementation used by an Ops subclass. 
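For illustration, a minimal Cython sketch of such a downstream consumer, assuming the `cblas.pxd` declarations introduced below; `scale_add` is a hypothetical helper, not part of this patch:

```
from thinc.backends.cblas cimport CBlas

cdef void scale_add(CBlas cblas, int n, float alpha,
                    const float* x, float* y) nogil:
    # y += alpha * x, dispatched through whichever saxpy implementation
    # the Ops subclass stored in the function table.
    cblas.saxpy()(n, alpha, x, 1, y, 1)
```

A caller would obtain the table once via `NumpyOps().cblas()` and pass it into the `nogil` function.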
* Bump blis requirement to >=0.9.0,<0.10.0 * NumpyOps: do not construct CBlas on every NumpyOps.cblas() call * api-backends: Fix superfluous wording --- pyproject.toml | 2 +- requirements.txt | 2 +- setup.cfg | 4 ++-- setup.py | 1 + thinc/backends/cblas.pxd | 24 ++++++++++++++++++++++++ thinc/backends/cblas.pyx | 32 ++++++++++++++++++++++++++++++++ thinc/backends/numpy_ops.pyx | 7 +++++++ thinc/backends/ops.py | 6 ++++++ website/docs/api-backends.md | 20 ++++++++++++++++++++ 9 files changed, 94 insertions(+), 4 deletions(-) create mode 100644 thinc/backends/cblas.pxd create mode 100644 thinc/backends/cblas.pyx diff --git a/pyproject.toml b/pyproject.toml index ff13274ee..3b973098d 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -5,7 +5,7 @@ requires = [ "murmurhash>=1.0.2,<1.1.0", "cymem>=2.0.2,<2.1.0", "preshed>=3.0.2,<3.1.0", - "blis>=0.4.0,<0.8.0", + "blis>=0.9.0,<0.10.0", "numpy>=1.15.0", ] build-backend = "setuptools.build_meta" diff --git a/requirements.txt b/requirements.txt index 4a31fc017..9e57d60f4 100644 --- a/requirements.txt +++ b/requirements.txt @@ -2,7 +2,7 @@ murmurhash>=1.0.2,<1.1.0 cymem>=2.0.2,<2.1.0 preshed>=3.0.2,<3.1.0 -blis>=0.4.0,<0.8.0 +blis>=0.9.0,<0.10.0 srsly>=2.4.0,<3.0.0 wasabi>=0.8.1,<1.1.0 catalogue>=2.0.4,<2.1.0 diff --git a/setup.cfg b/setup.cfg index 4e535d899..5351aa433 100644 --- a/setup.cfg +++ b/setup.cfg @@ -35,10 +35,10 @@ setup_requires = cymem>=2.0.2,<2.1.0 preshed>=3.0.2,<3.1.0 murmurhash>=1.0.2,<1.1.0 - blis>=0.4.0,<0.8.0 + blis>=0.9.0,<0.10.0 install_requires = # Explosion-provided dependencies - blis>=0.4.0,<0.8.0 + blis>=0.9.0,<0.10.0 murmurhash>=1.0.2,<1.1.0 cymem>=2.0.2,<2.1.0 preshed>=3.0.2,<3.1.0 diff --git a/setup.py b/setup.py index 3ce787cdc..4e5d056cc 100644 --- a/setup.py +++ b/setup.py @@ -17,6 +17,7 @@ PACKAGES = find_packages() MOD_NAMES = [ + "thinc.backends.cblas", "thinc.backends.linalg", "thinc.backends.numpy_ops", "thinc.extra.search", diff --git a/thinc/backends/cblas.pxd b/thinc/backends/cblas.pxd new file mode 100644 index 000000000..fcc867b68 --- /dev/null +++ b/thinc/backends/cblas.pxd @@ -0,0 +1,24 @@ +from libcpp.memory cimport shared_ptr + + +ctypedef void (*sgemm_ptr)(bint transA, bint transB, int M, int N, int K, + float alpha, const float* A, int lda, const float *B, + int ldb, float beta, float* C, int ldc) nogil + + +ctypedef void (*saxpy_ptr)(int N, float alpha, const float* X, int incX, + float *Y, int incY) nogil + + +# Forward-declaration of the BlasFuncs struct. This struct must be opaque, so +# that consumers of the CBlas class cannot become dependent on its size or +# ordering.
+cdef struct BlasFuncs + + +cdef class CBlas: + cdef shared_ptr[BlasFuncs] ptr + cdef saxpy_ptr saxpy(self) nogil + cdef sgemm_ptr sgemm(self) nogil + cdef void set_saxpy(self, saxpy_ptr saxpy) nogil + cdef void set_sgemm(self, sgemm_ptr sgemm) nogil diff --git a/thinc/backends/cblas.pyx b/thinc/backends/cblas.pyx new file mode 100644 index 000000000..db18b83eb --- /dev/null +++ b/thinc/backends/cblas.pyx @@ -0,0 +1,32 @@ +cimport blis.cy +from cython.operator cimport dereference as deref +from libcpp.memory cimport make_shared + + +cdef struct BlasFuncs: + saxpy_ptr saxpy + sgemm_ptr sgemm + + +cdef class CBlas: + __slots__ = [] + + def __init__(self): + """Construct a CBlas instance set to use BLIS implementations of the + supported BLAS functions.""" + cdef BlasFuncs funcs + funcs.saxpy = blis.cy.saxpy + funcs.sgemm = blis.cy.sgemm + self.ptr = make_shared[BlasFuncs](funcs) + + cdef saxpy_ptr saxpy(self) nogil: + return deref(self.ptr).saxpy + + cdef sgemm_ptr sgemm(self) nogil: + return deref(self.ptr).sgemm + + cdef void set_saxpy(self, saxpy_ptr saxpy) nogil: + deref(self.ptr).saxpy = saxpy + + cdef void set_sgemm(self, sgemm_ptr sgemm) nogil: + deref(self.ptr).sgemm = sgemm diff --git a/thinc/backends/numpy_ops.pyx b/thinc/backends/numpy_ops.pyx index 9fc6f19e2..2b618bdd4 100644 --- a/thinc/backends/numpy_ops.pyx +++ b/thinc/backends/numpy_ops.pyx @@ -20,6 +20,7 @@ cimport blis.cy from .. import registry from ..util import copy_array, get_array_module from ..types import DeviceTypes, DTypes, Shape, ArrayXd +from .cblas cimport CBlas from .linalg cimport VecVec, Vec from .ops import Ops @@ -30,6 +31,9 @@ except ImportError: has_blis = False +cblas = CBlas() + + ctypedef float weight_t @@ -82,6 +86,9 @@ class NumpyOps(Ops): else: return self.xp.empty(shape, dtype=dtype) + def cblas(self) -> CBlas: + return cblas + def gemm(self, np.ndarray x, np.ndarray y, *, np.ndarray out=None, trans1=False, trans2=False): if x.ndim != 2: raise ValueError(f"Provided 'x' array should be 2-dimensional, but found {x.ndim} dimension(s).") diff --git a/thinc/backends/ops.py b/thinc/backends/ops.py index 315b0b0bf..4179b4842 100644 --- a/thinc/backends/ops.py +++ b/thinc/backends/ops.py @@ -12,6 +12,7 @@ from ..types import DeviceTypes, Generator, Padded, Batchable, SizedGenerator from ..util import get_array_module, is_xp_array, to_numpy +from .cblas import CBlas ArrayT = TypeVar("ArrayT", bound=ArrayXd) FloatsT = TypeVar("FloatsT", bound=_Floats) @@ -31,6 +32,11 @@ def __init__( self.device_type = device_type self.device_id = device_id + def cblas(self) -> CBlas: + """Return C BLAS function table.""" + err = f"{type(self).__name__} does not provide C BLAS functions" + raise NotImplementedError(err) + def to_numpy(self, data, *, byte_order=None): # pragma: no cover if isinstance(data, numpy.ndarray): if byte_order: diff --git a/website/docs/api-backends.md b/website/docs/api-backends.md index 7f5c1e8d7..e1d90a655 100644 --- a/website/docs/api-backends.md +++ b/website/docs/api-backends.md @@ -382,6 +382,26 @@ the inputs and outputs. | `zeros` | bool | Fill the array with zeros (default: `True`). | | **RETURNS** | ArrayXd | An array of the correct shape and data type. | +### Ops.cblas {#cblas tag="method"} + + + +- **default:** +- **numpy:** +- **cupy:** + + + +Get a table of C BLAS functions usable in Cython `cdef nogil` functions. This +method does not take any arguments. + + + +This method is only supported by `NumpyOps`. 
A `NotImplementedError` exception +is raised when calling this method on `Ops` or `CupyOps`. + + + ### Ops.to_numpy {#to_numpy tag="method"} From 40a5a0dea6dab9a8fc3f5f2dddada4d32223229f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Dani=C3=ABl=20de=20Kok?= Date: Thu, 12 May 2022 09:36:46 +0200 Subject: [PATCH 03/78] Fix a unit test in the PyTorch wrapper (#663) * Fix a unit test in the PyTorch wrapper This test checked whether the allocator was set to the PyTorch allocator when the PyTorch shim is used. However, this is not the case when PyTorch is installed, but CuPy isn't, so the test would fail. Since this test relies on CuPy, disable it when CuPy is not available. * Fix merge fallout --- thinc/tests/layers/test_pytorch_wrapper.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/thinc/tests/layers/test_pytorch_wrapper.py b/thinc/tests/layers/test_pytorch_wrapper.py index 1b37c29c9..e6f4edfb4 100644 --- a/thinc/tests/layers/test_pytorch_wrapper.py +++ b/thinc/tests/layers/test_pytorch_wrapper.py @@ -4,6 +4,7 @@ from thinc.backends import context_pools from thinc.shims.pytorch_grad_scaler import PyTorchGradScaler from thinc.compat import has_torch, has_torch_amp, has_torch_gpu +from thinc.compat import has_cupy import numpy import pytest @@ -63,7 +64,7 @@ def test_pytorch_wrapper(nN, nI, nO): assert isinstance(model.predict(X), numpy.ndarray) -@pytest.mark.skipif(not has_torch_gpu, reason="needs PyTorch with CUDA-capable GPU") +@pytest.mark.skipif(not has_cupy or not has_torch_gpu, reason="needs PyTorch with CUDA-capable GPU") @pytest.mark.parametrize("nN,nI,nO", [(2, 3, 4)]) @pytest.mark.parametrize("mixed_precision", TORCH_MIXED_PRECISION) def test_pytorch_wrapper_thinc_input(nN, nI, nO, mixed_precision): From abbe0ffa8158b15c88d128d5a17c86fdadffe6dd Mon Sep 17 00:00:00 2001 From: Madeesh Kannan Date: Tue, 17 May 2022 11:53:19 +0200 Subject: [PATCH 04/78] `CupyOps`: Simplify `asarray` (#661) * `CupyOps`: Simplify `asarray` * Remove `cast_array` flag and use `astype` unconditionally * Revert unconditional call to `astype` * Remove no-op --- thinc/backends/cupy_ops.py | 17 ++++------------- 1 file changed, 4 insertions(+), 13 deletions(-) diff --git a/thinc/backends/cupy_ops.py b/thinc/backends/cupy_ops.py index ec70cf72e..9f420fcd6 100644 --- a/thinc/backends/cupy_ops.py +++ b/thinc/backends/cupy_ops.py @@ -59,29 +59,20 @@ def gemm(self, x, y, out=None, trans1=False, trans2=False): return out def asarray(self, data, dtype=None): - # This is sort of frustrating, but we can't easily otherwise pass - # forward "unset". - dtype = {"dtype": dtype} if dtype is not None else {} - # We'll try to perform a zero-copy conversion if possible. 
- array = None - cast_array = False if is_cupy_array(data): - array = self.xp.asarray(data, **dtype) + array = data elif is_torch_gpu_array(data): array = torch2xp(data) - cast_array = True elif is_tensorflow_gpu_array(data): array = tensorflow2xp(data) - cast_array = True elif is_mxnet_gpu_array(data): array = mxnet2xp(data) - cast_array = True else: - array = self.xp.array(data, **dtype) + array = self.xp.array(data) - if cast_array and dtype != {}: - array = array.astype(dtype["dtype"]) + if dtype is not None: + array = array.astype(dtype=dtype, copy=False) return array From d2d791794e66eb3d6c5b2003fe479dc33a1e905c Mon Sep 17 00:00:00 2001 From: Madeesh Kannan Date: Tue, 17 May 2022 11:54:23 +0200 Subject: [PATCH 05/78] NumpyOps: Better type-casting in `asarray` (#656) * `NumpyOps`: Better type-casting in `asarray` * Simplify `dtype` check * Update thinc/backends/numpy_ops.pyx Co-authored-by: Adriane Boyd * Simplify casting further, avoid copies if possible * Remove no-op Co-authored-by: Adriane Boyd --- thinc/backends/numpy_ops.pyx | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/thinc/backends/numpy_ops.pyx b/thinc/backends/numpy_ops.pyx index 2b618bdd4..1455aad1c 100644 --- a/thinc/backends/numpy_ops.pyx +++ b/thinc/backends/numpy_ops.pyx @@ -66,19 +66,20 @@ class NumpyOps(Ops): def asarray(self, data, dtype=None): if isinstance(data, self.xp.ndarray): - if dtype is not None: - return self.xp.asarray(data, dtype=dtype) - else: - return self.xp.asarray(data) + array = data elif hasattr(data, 'numpy'): # Handles PyTorch Tensor - return data.numpy() + array = data.numpy() elif hasattr(data, "get"): - return data.get() - elif dtype is not None: - return self.xp.array(data, dtype=dtype) + array = data.get() else: - return self.xp.array(data) + array = self.xp.array(data) + + if dtype is not None: + array = array.astype(dtype=dtype, copy=False) + + return array + def alloc(self, shape: Shape, *, dtype: Optional[DTypes] = "float32", zeros: bool = True) -> ArrayXd: if zeros: From 90631684f8e4448fb5894cc8ab748a68939f2654 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Dani=C3=ABl=20de=20Kok?= Date: Tue, 17 May 2022 14:53:48 +0200 Subject: [PATCH 06/78] Fix out-of-bounds writes in NumpyOps/CupyOps (#664) * Fix out-of-bounds writes in NumpyOps/CupyOps - Using `{CupyOps,NumpyOps}.adam` with incompatible shapes for weights, gradients, or moments resulted in out-of-bounds writes. - Using `NumpyOps.adam` with non-float32 arrays resulted in filling arrays with incorrect data.
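As a rough sketch of the new behaviour (hypothetical shapes, not part of this patch):

```
from thinc.api import NumpyOps

ops = NumpyOps()
weights = ops.alloc1f(4)
mom1 = ops.alloc1f(4)
mom2 = ops.alloc1f(4)
bad_grads = ops.alloc1f(2)  # shape incompatible with `weights`
# Previously this could write out of bounds; it now raises a ValueError.
ops.adam(weights, bad_grads, mom1, mom2, 0.9, 0.999, 1e-8, 0.001)
```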
* Remove print debugging remnants Co-authored-by: Adriane Boyd * More print debugging remnants Co-authored-by: Adriane Boyd Co-authored-by: Adriane Boyd --- thinc/backends/cupy_ops.py | 10 ++++++++++ thinc/backends/numpy_ops.pyx | 9 +++++++-- thinc/backends/ops.py | 10 ++++++++++ thinc/tests/backends/test_ops.py | 16 ++++++++++++++++ 4 files changed, 43 insertions(+), 2 deletions(-) diff --git a/thinc/backends/cupy_ops.py b/thinc/backends/cupy_ops.py index 9f420fcd6..18e448dfd 100644 --- a/thinc/backends/cupy_ops.py +++ b/thinc/backends/cupy_ops.py @@ -281,6 +281,10 @@ def scatter_add(self, table, indices, values): def adam( self, weights, gradient, mom1, mom2, beta1, beta2, eps, learn_rate, mod_rate=1.0 ): + _check_compatible_shape(weights, gradient) + _check_compatible_shape(weights, mom1) + _check_compatible_shape(weights, mom2) + adam_kernel( gradient, learn_rate, 1 - beta1, 1 - beta2, eps, weights, mom1, mom2 ) @@ -303,3 +307,9 @@ def position_encode(self, N, D, period=10000, out=None): ) else: adam_kernel = None + + +def _check_compatible_shape(u, v): + if u.shape != v.shape: + msg = f"arrays have incompatible shapes: {u.shape} and {v.shape}" + raise ValueError(msg) diff --git a/thinc/backends/numpy_ops.pyx b/thinc/backends/numpy_ops.pyx index 1455aad1c..f30db6630 100644 --- a/thinc/backends/numpy_ops.pyx +++ b/thinc/backends/numpy_ops.pyx @@ -460,9 +460,14 @@ class NumpyOps(Ops): @cython.boundscheck(False) @cython.wraparound(False) - def adam(self, np.ndarray weights, np.ndarray gradient, np.ndarray mom1, - np.ndarray mom2, const float beta1, const float beta2, float eps, + def adam(self, np.ndarray[np.float32_t] weights, np.ndarray[np.float32_t] gradient, + np.ndarray[np.float32_t] mom1, np.ndarray[np.float32_t] mom2, + const float beta1, const float beta2, float eps, float learn_rate, float mod_rate=1.): + _check_compatible_shape(weights, gradient) + _check_compatible_shape(weights, mom1) + _check_compatible_shape(weights, mom2) + _adam_momentum(gradient.data, mom1.data, mom2.data, weights.shape[0], beta1, beta2, eps, learn_rate) VecVec.add_i(weights.data, diff --git a/thinc/backends/ops.py b/thinc/backends/ops.py index 4179b4842..e5a232d9b 100644 --- a/thinc/backends/ops.py +++ b/thinc/backends/ops.py @@ -1112,6 +1112,10 @@ def adam( learn_rate: float, mod_rate: float = 1.0, ) -> Tuple[Floats1d, Floats1d, Floats1d, Floats1d]: + _check_compatible_shape(weights, gradient) + _check_compatible_shape(weights, mom1) + _check_compatible_shape(weights, mom2) + # Internals for optimizer mom1 *= beta1 mom2 *= beta2 @@ -1570,3 +1574,9 @@ def gaussian_cdf(ops: Ops, X: FloatsType) -> FloatsType: def gaussian_pdf(ops: Ops, X: FloatsType) -> FloatsType: """Gaussian PDF for distribution with mean 0 and stdev 1.""" return INV_SQRT_2PI * ops.xp.exp(-0.5 * X * X) + + +def _check_compatible_shape(u: FloatsXd, v: FloatsXd): + if u.shape != v.shape: + msg = f"arrays have incompatible shapes: {u.shape} and {v.shape}" + raise ValueError(msg) diff --git a/thinc/tests/backends/test_ops.py b/thinc/tests/backends/test_ops.py index cdc319b91..e095142b1 100644 --- a/thinc/tests/backends/test_ops.py +++ b/thinc/tests/backends/test_ops.py @@ -127,6 +127,22 @@ def test_ops_consistency(op): assert str(p1) == str(p2), attr +@pytest.mark.parametrize("ops", ALL_OPS) +def test_adam_incorrect_inputs(ops): + one = ops.xp.zeros(1, dtype="f") + two = ops.xp.zeros(2, dtype="f") + + ops.adam(one, one, one, one, 0.0, 0.0, 0.0, 0.0) + with pytest.raises(ValueError): + ops.adam(two, one, one, one, 0.0, 0.0, 0.0, 0.0) + with 
pytest.raises(ValueError): + ops.adam(one, two, one, one, 0.0, 0.0, 0.0, 0.0) + with pytest.raises(ValueError): + ops.adam(one, one, two, one, 0.0, 0.0, 0.0, 0.0) + with pytest.raises(ValueError): + ops.adam(one, one, one, two, 0.0, 0.0, 0.0, 0.0) + + @pytest.mark.parametrize("ops", ALL_OPS) def test_alloc(ops): float_methods = (ops.alloc1f, ops.alloc2f, ops.alloc3f, ops.alloc4f) From 6d84d00d7cff7beb27bc9a90ceed9e1aaf05182c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Dani=C3=ABl=20de=20Kok?= Date: Wed, 18 May 2022 12:32:08 +0200 Subject: [PATCH 07/78] Set version to v8.1.0.dev0 (#666) --- thinc/about.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/thinc/about.py b/thinc/about.py index 0c63069de..5af1e2bfa 100644 --- a/thinc/about.py +++ b/thinc/about.py @@ -1,2 +1,2 @@ -__version__ = "8.0.15" +__version__ = "8.1.0.dev0" __release__ = True From a988fdac2e04ac66f2567ae5b3985ecffeb2899d Mon Sep 17 00:00:00 2001 From: Richard Hudson Date: Wed, 18 May 2022 19:22:45 +0200 Subject: [PATCH 08/78] Fix model.copy() bug where layer used more than once (#659) * Fix model.copy() bug where layer used more than once * Expand functionality to include shims * Corrections after review * Added default for Model._copy() --- thinc/model.py | 27 ++++++++++++++++++++-- thinc/tests/model/test_model.py | 41 +++++++++++++++++++++++++++++++++ 2 files changed, 66 insertions(+), 2 deletions(-) diff --git a/thinc/model.py b/thinc/model.py index 7b8560a8a..261858658 100644 --- a/thinc/model.py +++ b/thinc/model.py @@ -462,10 +462,33 @@ def copy(self: SelfT) -> SelfT: layers will also be deep-copied. The copy will receive a distinct `model.id` value. """ + return self._copy() + + def _copy(self: SelfT, seen: Optional[Dict[int, Union["Model", Shim]]] = None) -> SelfT: + if seen is None: + seen = {} params = {} for name in self.param_names: params[name] = self.get_param(name) if self.has_param(name) else None + copied_layers: List[Model] = [] + for layer in self.layers: + if id(layer) in seen: + copied_layers.append(cast(Model, seen[id(layer)])) + else: + copied_layer = layer._copy(seen) + seen[id(layer)] = copied_layer + copied_layers.append(copied_layer) + + copied_shims = [] + for shim in self.shims: + if id(shim) in seen: + copied_shims.append(cast(Shim, seen[id(shim)])) + else: + copied_shim = shim.copy() + seen[id(shim)] = copied_shim + copied_shims.append(copied_shim) + copied: Model[InT, OutT] = Model( self.name, self._func, @@ -473,8 +496,8 @@ def copy(self: SelfT) -> SelfT: params=copy.deepcopy(params), dims=copy.deepcopy(self._dims), attrs=copy.deepcopy(self._attrs), - layers=[layer.copy() for layer in self.layers], - shims=[shim.copy() for shim in self.shims], + layers=copied_layers, + shims=copied_shims, ) for name in self.grad_names: copied.set_grad(name, self.get_grad(name).copy()) diff --git a/thinc/tests/model/test_model.py b/thinc/tests/model/test_model.py index 3db857987..9ec71fc93 100644 --- a/thinc/tests/model/test_model.py +++ b/thinc/tests/model/test_model.py @@ -615,3 +615,44 @@ def test_walk_bfs_post_order_fails(): relu = Relu(5) with pytest.raises(ValueError, match="Invalid order"): relu.walk(order="dfs_post_order") + + +def test_model_copy_with_loop(): + class MyShim(Shim): + name = "testshim" + + def to_bytes(self): + return test_replace_node_with_indirect_node_ref + + def from_bytes(self, bytes): + pass + + model_a = create_model("a") + working_shim = MyShim(None) + layer = Model( + "test", + lambda X: (X, lambda dY: dY), + dims={"nI": 5, "nO": 5}, + params={"W":
numpy.zeros((10,)), "b": None}, + refs={"a": model_a, "b": None}, + attrs={"foo": "bar"}, + shims=[working_shim], + layers=[model_a, model_a], + ) + layer2 = Model( + "test2", + lambda X: (X, lambda dY: dY), + dims={"nI": 5, "nO": 5}, + params={"W": numpy.zeros((10,)), "b": None}, + refs={"a": model_a, "b": None}, + attrs={"foo": "bar"}, + shims=[working_shim], + layers=[model_a, model_a], + ) + relu = Relu(5) + model = chain(layer, relu, layer, layer2) + model2 = model.copy() + model.from_dict(model2.to_dict()) + assert model2.name == "test>>relu>>test>>test2" + assert model2.layers[0] == model2.layers[2] + assert id(model2.layers[0].shims[0]) == id(model2.layers[3].shims[0]) From 21d9b862c741203febef99fe36d3000381a04429 Mon Sep 17 00:00:00 2001 From: Madeesh Kannan Date: Thu, 19 May 2022 11:49:53 +0200 Subject: [PATCH 09/78] `conftest.py`: Handle exception caused by `pytest` options being added twice in CI builds (#670) --- thinc/tests/conftest.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/thinc/tests/conftest.py b/thinc/tests/conftest.py index 239628b43..aac18a860 100644 --- a/thinc/tests/conftest.py +++ b/thinc/tests/conftest.py @@ -2,7 +2,12 @@ def pytest_addoption(parser): - parser.addoption("--slow", action="store_true", help="include slow tests") + try: + parser.addoption("--slow", action="store_true", help="include slow tests") + # Options are already added, e.g. if conftest is copied in a build pipeline + # and runs twice + except ValueError: + pass def pytest_runtest_setup(item): From 43268c23a715563afb281c8b45a137bedeb1cdcc Mon Sep 17 00:00:00 2001 From: Madeesh Kannan Date: Fri, 20 May 2022 15:33:52 +0200 Subject: [PATCH 10/78] Auto-format code with `black` + Pin `black` requirement (#673) * Add `autoblack` GitHub action * Fix command * Add `black` to `requirements.txt` --- .github/workflows/autoblack.yml | 44 +++++++++++++++++++++++++++++++++ requirements.txt | 1 + 2 files changed, 45 insertions(+) create mode 100644 .github/workflows/autoblack.yml diff --git a/.github/workflows/autoblack.yml b/.github/workflows/autoblack.yml new file mode 100644 index 000000000..4109acce7 --- /dev/null +++ b/.github/workflows/autoblack.yml @@ -0,0 +1,44 @@ +# GitHub Action that uses Black to reformat all Python code and submits a PR +# in regular intervals. 
Inspired by: https://github.com/cclauss/autoblack name: autoblack on: workflow_dispatch: # allow manual trigger schedule: - cron: '0 8 * * 5' # every Friday at 8am UTC jobs: autoblack: if: github.repository_owner == 'explosion' runs-on: ubuntu-latest steps: - uses: actions/checkout@v2 with: ref: ${{ github.head_ref }} - uses: actions/setup-python@v2 - run: pip install black - name: Auto-format code if needed run: black thinc # We can't run black --check here because that returns a non-zero exit # code and makes GitHub think the action failed - name: Check for modified files id: git-check run: echo ::set-output name=modified::$(if git diff-index --quiet HEAD --; then echo "false"; else echo "true"; fi) - name: Create Pull Request if: steps.git-check.outputs.modified == 'true' uses: peter-evans/create-pull-request@v3 with: title: Auto-format code with black labels: meta commit-message: Auto-format code with black committer: GitHub author: explosion-bot body: _This PR is auto-generated._ branch: autoblack delete-branch: true draft: false - name: Check outputs if: steps.git-check.outputs.modified == 'true' run: | echo "Pull Request Number - ${{ steps.cpr.outputs.pull-request-number }}" echo "Pull Request URL - ${{ steps.cpr.outputs.pull-request-url }}" diff --git a/requirements.txt b/requirements.txt index 9e57d60f4..d532e0f9f 100644 --- a/requirements.txt +++ b/requirements.txt @@ -33,3 +33,4 @@ nbconvert>=5.6.1,<6.2.0 nbformat>=5.0.4,<5.2.0 # Test to_disk/from_disk against pathlib.Path subclasses pathy>=0.3.5 +black>=22.0,<23.0 \ No newline at end of file From 46f3fd18e6aee2e1383be6874b4b95af247b5f54 Mon Sep 17 00:00:00 2001 From: Madeesh Kannan Date: Fri, 20 May 2022 17:39:12 +0200 Subject: [PATCH 11/78] Add support for bot-invoked slow tests (#672) --- .github/workflows/explosionbot.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/explosionbot.yml b/.github/workflows/explosionbot.yml index 9c5be1366..8749667c5 100644 --- a/.github/workflows/explosionbot.yml +++ b/.github/workflows/explosionbot.yml @@ -23,5 +23,5 @@ jobs: env: INPUT_TOKEN: ${{ secrets.EXPLOSIONBOT_TOKEN }} INPUT_BK_TOKEN: ${{ secrets.BUILDKITE_SECRET }} - ENABLED_COMMANDS: "test_gpu" + ENABLED_COMMANDS: "test_gpu,test_slow" ALLOWED_TEAMS: "spacy-maintainers" \ No newline at end of file From 07a7dcf5e5800adb20e3f5471bb03e0a19b698f2 Mon Sep 17 00:00:00 2001 From: shadeMe Date: Mon, 23 May 2022 14:08:14 +0200 Subject: [PATCH 12/78] `Shim`: Fix potential data race when allocated on different threads --- thinc/shims/shim.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/thinc/shims/shim.py b/thinc/shims/shim.py index 840589ff2..0c246e8d4 100644 --- a/thinc/shims/shim.py +++ b/thinc/shims/shim.py @@ -26,7 +26,8 @@ class Shim: # pragma: no cover def __init__(self, model: Any, config=None, optimizer: Any = None): with Shim.global_id_lock: Shim.global_id += 1 - self.id = Shim.global_id + self.id = Shim.global_id + self.cfg = dict(config) if config is not None else {} self._model = model self._optimizer = optimizer From ab165598c3a84094bea7f83e13690c8dbd17960b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Dani=C3=ABl=20de=20Kok?= Date: Mon, 23 May 2022 19:18:04 +0200 Subject: [PATCH 13/78] Fix two warnings (#676) - torch.nn.functional.sigmoid is deprecated in favor of torch.sigmoid. - Clip cosh input in sechsq to avoid overflow.
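To illustrate the second fix, a plain NumPy sketch (assumed inputs, not part of this patch) of why the input is clipped before `cosh`:

```
import numpy as np

X = np.asarray([-1000.0, 0.0, 1000.0], dtype="float32")
np.cosh(X)  # overflows float32 to inf and emits a RuntimeWarning

Xc = np.clip(X, -20.0, 20.0)
sechsq = (1 / np.cosh(Xc)) ** 2  # finite and warning-free
```

Per the comment added below, clipping at |20| changes the result by at most ~1.7e-17.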
--- thinc/backends/ops.py | 2 ++ thinc/tests/backends/test_ops.py | 2 +- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/thinc/backends/ops.py b/thinc/backends/ops.py index e5a232d9b..2386e21cc 100644 --- a/thinc/backends/ops.py +++ b/thinc/backends/ops.py @@ -1001,6 +1001,8 @@ def erf(self, X: FloatsType) -> FloatsType: return out def sechsq(self, X: FloatsType) -> FloatsType: + # Avoid overflow in cosh. Clipping at |20| has an error of 1.7e-17. + X = self.xp.clip(X, -20.0, 20.0) return (1 / self.xp.cosh(X)) ** 2 def gelu_approx(self, X: FloatsType, inplace: bool = False) -> FloatsType: diff --git a/thinc/tests/backends/test_ops.py b/thinc/tests/backends/test_ops.py index e095142b1..2c7cad2ca 100644 --- a/thinc/tests/backends/test_ops.py +++ b/thinc/tests/backends/test_ops.py @@ -62,7 +62,7 @@ def torch_hard_swish_mobilenet(x): return torch.nn.functional.hardswish(x) def torch_sigmoid(x): - return torch.nn.functional.sigmoid(x) + return torch.sigmoid(x) # https://github.com/huggingface/transformers/blob/master/src/transformers/activations.py#L37 def torch_gelu_approx(x): From 99573ff0faaf64d14caf8db3b302321f1cb72596 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Dani=C3=ABl=20de=20Kok?= Date: Tue, 24 May 2022 09:55:09 +0200 Subject: [PATCH 14/78] Replace use of gpu_is_available with has_cupy_gpu (#675) * Replace use of gpu_is_available with has_cupy_gpu This PR is in preparation for better non-CUDA device support. Once we support non-CUDA GPUs, there may be GPUs available that are not 'CuPy GPUs'. In all places where we use `gpu_is_available` we actually mean: is 'CuPy available with a CUDA GPU'? So, this PR replaces uses of `gpu_is_available` with `has_cupy_gpu`. This allows us to use `gpu_is_available` in the future to check if any GPU is available. In addition to that, some code had expressions like ``` has_cupy and gpu_is_available() ``` This PR simplifies such conditions to `has_cupy_gpu`, since `has_cupy_gpu` implies `has_cupy`.
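For example, a skip condition that previously combined both checks reduces to the single flag (a sketch mirroring the test changes below):

```
import pytest
from thinc.compat import has_cupy_gpu

# has_cupy_gpu is True only when CuPy imports *and* a CUDA device is
# visible, so it subsumes `has_cupy and gpu_is_available()`.
@pytest.mark.skipif(not has_cupy_gpu, reason="needs GPU/CuPy")
def test_runs_on_gpu():
    ...
```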
* Remove unused import * Improve error message when no CUDA GPU is found * Fix another error message when no CUDA GPU is found --- thinc/tests/backends/test_ops.py | 14 +++++--------- thinc/tests/layers/test_tensorflow_wrapper.py | 6 +++--- thinc/tests/model/test_model.py | 5 ++--- thinc/util.py | 14 ++++++-------- 4 files changed, 16 insertions(+), 23 deletions(-) diff --git a/thinc/tests/backends/test_ops.py b/thinc/tests/backends/test_ops.py index 2c7cad2ca..5d9f17524 100644 --- a/thinc/tests/backends/test_ops.py +++ b/thinc/tests/backends/test_ops.py @@ -8,8 +8,8 @@ from packaging.version import Version from thinc.api import NumpyOps, CupyOps, Ops, get_ops from thinc.api import get_current_ops, use_ops -from thinc.util import torch2xp, xp2torch, gpu_is_available -from thinc.compat import has_torch, torch_version +from thinc.util import torch2xp, xp2torch +from thinc.compat import has_cupy_gpu, has_torch, torch_version from thinc.api import fix_random_seed from thinc.api import LSTM from thinc.types import Floats2d @@ -26,7 +26,7 @@ BLIS_OPS = NumpyOps(use_blis=True) CPU_OPS = [NUMPY_OPS, VANILLA_OPS] XP_OPS = [NUMPY_OPS] -if CupyOps.xp is not None and gpu_is_available(): +if has_cupy_gpu: XP_OPS.append(CupyOps()) ALL_OPS = XP_OPS + [VANILLA_OPS] @@ -591,9 +591,7 @@ def test_backprop_seq2col_window_two(ops, dtype): ops.xp.testing.assert_allclose(seq, expected, atol=0.001, rtol=0.001) -@pytest.mark.skipif( - CupyOps.xp is None or not gpu_is_available(), reason="needs GPU/CuPy" -) +@pytest.mark.skipif(not has_cupy_gpu, reason="needs GPU/CuPy") @pytest.mark.parametrize("nW", [1, 2]) def test_large_seq2col_gpu_against_cpu(nW): cupy_ops = CupyOps() @@ -615,9 +613,7 @@ def test_large_seq2col_gpu_against_cpu(nW): assert_allclose(cols, cols_gpu.get()) -@pytest.mark.skipif( - CupyOps.xp is None or not gpu_is_available(), reason="needs GPU/CuPy" -) +@pytest.mark.skipif(not has_cupy_gpu, reason="needs GPU/CuPy") @pytest.mark.parametrize("nW", [1, 2]) def test_large_backprop_seq2col_gpu_against_cpu(nW): cupy_ops = CupyOps() diff --git a/thinc/tests/layers/test_tensorflow_wrapper.py b/thinc/tests/layers/test_tensorflow_wrapper.py index ff254ffc0..c1b85da3b 100644 --- a/thinc/tests/layers/test_tensorflow_wrapper.py +++ b/thinc/tests/layers/test_tensorflow_wrapper.py @@ -2,8 +2,8 @@ import pytest from thinc.api import Adam, ArgsKwargs, Linear, Model, TensorFlowWrapper from thinc.api import get_current_ops, keras_subclass, tensorflow2xp, xp2tensorflow -from thinc.util import gpu_is_available, to_categorical -from thinc.compat import has_tensorflow +from thinc.util import to_categorical +from thinc.compat import has_cupy_gpu, has_tensorflow from ..util import check_input_converters, make_tempdir @@ -359,7 +359,7 @@ def test_tensorflow_wrapper_to_cpu(tf_model): @pytest.mark.skipif(not has_tensorflow, reason="needs TensorFlow") -@pytest.mark.skipif(not gpu_is_available(), reason="needs GPU/cupy") +@pytest.mark.skipif(not has_cupy_gpu, reason="needs GPU/cupy") def test_tensorflow_wrapper_to_gpu(model, X): model.to_gpu(0) diff --git a/thinc/tests/model/test_model.py b/thinc/tests/model/test_model.py index 9ec71fc93..087e481c5 100644 --- a/thinc/tests/model/test_model.py +++ b/thinc/tests/model/test_model.py @@ -6,8 +6,7 @@ from thinc.api import Shim, Softmax, chain, change_attr_values from thinc.api import concatenate, set_dropout_rate from thinc.api import use_ops, with_debug, wrap_model_recursive -from thinc.util import gpu_is_available -from thinc.compat import has_cupy +from thinc.compat import 
has_cupy_gpu import numpy from ..util import make_tempdir @@ -410,7 +409,7 @@ def test_model_gpu(): import ml_datasets ops = "cpu" - if has_cupy and gpu_is_available(): + if has_cupy_gpu: ops = "cupy" with use_ops(ops): diff --git a/thinc/util.py b/thinc/util.py index 16f0f0196..cdfdf4c1e 100644 --- a/thinc/util.py +++ b/thinc/util.py @@ -44,7 +44,7 @@ def fix_random_seed(seed: int = 0) -> None: # pragma: no cover numpy.random.seed(seed) if has_torch: torch.manual_seed(seed) - if has_cupy and gpu_is_available(): + if has_cupy_gpu: cupy.random.seed(seed) if has_torch and has_torch_gpu: torch.cuda.manual_seed_all(seed) @@ -125,8 +125,8 @@ def to_numpy(data): # pragma: no cover def set_active_gpu(gpu_id: int) -> "cupy.cuda.Device": # pragma: no cover """Set the current GPU device for cupy and torch (if available).""" - if not gpu_is_available(): - raise ValueError("No GPU devices detected") + if not has_cupy_gpu: + raise ValueError("No CUDA GPU devices detected") device = cupy.cuda.device.Device(gpu_id) device.use() @@ -151,9 +151,7 @@ def require_cpu() -> bool: # pragma: no cover def prefer_gpu(gpu_id: int = 0) -> bool: # pragma: no cover """Use GPU if it's available. Returns True if so, False otherwise.""" - from .backends.cupy_ops import CupyOps - - if not gpu_is_available(): + if not has_cupy_gpu: return False else: require_gpu(gpu_id=gpu_id) @@ -163,8 +161,8 @@ def prefer_gpu(gpu_id: int = 0) -> bool: # pragma: no cover def require_gpu(gpu_id: int = 0) -> bool: # pragma: no cover from .backends import set_current_ops, CupyOps - if not gpu_is_available(): - raise ValueError("No GPU devices detected") + if not has_cupy_gpu: + raise ValueError("No CUDA GPU devices detected") set_current_ops(CupyOps()) set_active_gpu(gpu_id) From 81035bcd19f8c3bc7c9a093c7f37f76df66ea89c Mon Sep 17 00:00:00 2001 From: Madeesh Kannan Date: Tue, 24 May 2022 12:29:07 +0200 Subject: [PATCH 15/78] Fixes for slow tests (#671) --- thinc/compat.py | 2 +- thinc/tests/layers/test_basic_tagger.py | 3 ++- thinc/tests/layers/test_mnist.py | 8 ++++++++ thinc/tests/regression/issue519/program.py | 12 ++++-------- thinc/tests/regression/issue519/test_issue519.py | 8 ++++++-- 5 files changed, 21 insertions(+), 12 deletions(-) diff --git a/thinc/compat.py b/thinc/compat.py index 467698df1..c858a7199 100644 --- a/thinc/compat.py +++ b/thinc/compat.py @@ -38,7 +38,7 @@ and not torch.cuda.amp.common.amp_definitely_not_available() ) except ImportError: # pragma: no cover - torch = None + torch = None # type: ignore has_torch = False has_torch_gpu = False has_torch_amp = False diff --git a/thinc/tests/layers/test_basic_tagger.py b/thinc/tests/layers/test_basic_tagger.py index e9075fec2..3046c1b04 100644 --- a/thinc/tests/layers/test_basic_tagger.py +++ b/thinc/tests/layers/test_basic_tagger.py @@ -8,13 +8,14 @@ def ancora(): pytest.importorskip("ml_datasets") import ml_datasets + return ml_datasets.ud_ancora_pos_tags() def create_embed_relu_relu_softmax(depth, width, vector_length): with Model.define_operators({">>": chain}): model = strings2arrays() >> with_array( - HashEmbed(width, vector_length) + HashEmbed(width, vector_length, column=0) >> expand_window(window_size=1) >> Relu(width, width * 3) >> Relu(width, width) diff --git a/thinc/tests/layers/test_mnist.py b/thinc/tests/layers/test_mnist.py index b67a7fd57..321de3a0f 100644 --- a/thinc/tests/layers/test_mnist.py +++ b/thinc/tests/layers/test_mnist.py @@ -1,6 +1,7 @@ import pytest from thinc.api import Relu, Softmax, chain, clone, Adam from thinc.api import PyTorchWrapper, 
TensorFlowWrapper +from thinc.api import get_current_ops from thinc.compat import has_torch, has_tensorflow @@ -80,9 +81,14 @@ def test_small_end_to_end(width, nb_epoch, min_score, create_model, mnist): optimizer = Adam(0.001) losses = [] scores = [] + ops = get_current_ops() + for i in range(nb_epoch): for X, Y in model.ops.multibatch(batch_size, train_X, train_Y, shuffle=True): Yh, backprop = model.begin_update(X) + # Ensure that the tensor is type-compatible with the current backend. + Yh = ops.asarray(Yh) + backprop(Yh - Y) model.finish_update(optimizer) losses.append(((Yh - Y) ** 2).sum()) @@ -90,6 +96,8 @@ def test_small_end_to_end(width, nb_epoch, min_score, create_model, mnist): total = 0 for X, Y in model.ops.multibatch(batch_size, dev_X, dev_Y): Yh = model.predict(X) + Yh = ops.asarray(Yh) + correct += (Yh.argmax(axis=1) == Y.argmax(axis=1)).sum() total += Yh.shape[0] score = correct / total diff --git a/thinc/tests/regression/issue519/program.py b/thinc/tests/regression/issue519/program.py index 2ad28d88b..b3e6dc9ba 100644 --- a/thinc/tests/regression/issue519/program.py +++ b/thinc/tests/regression/issue519/program.py @@ -5,16 +5,12 @@ n_hidden = 32 dropout = 0.2 -model1 = chain( - Relu(nO=n_hidden, dropout=dropout), - Relu(nO=n_hidden, dropout=dropout), - Softmax() +model1: Model[Floats2d, Floats2d] = chain( + Relu(nO=n_hidden, dropout=dropout), Relu(nO=n_hidden, dropout=dropout), Softmax() ) -model2 = chain( - Relu(nO=n_hidden, dropout=dropout), - Relu(nO=n_hidden, dropout=dropout), - Softmax() +model2: Model[Floats2d, Floats2d] = chain( + Relu(nO=n_hidden, dropout=dropout), Relu(nO=n_hidden, dropout=dropout), Softmax() ) model3: Model[Floats2d, Floats2d] = concatenate(*[model1, model2]) diff --git a/thinc/tests/regression/issue519/test_issue519.py b/thinc/tests/regression/issue519/test_issue519.py index 4a26f80e0..02601f0d7 100644 --- a/thinc/tests/regression/issue519/test_issue519.py +++ b/thinc/tests/regression/issue519/test_issue519.py @@ -1,4 +1,6 @@ import subprocess +import sys + try: import importlib.resources as importlib_resources except ImportError: @@ -16,10 +18,12 @@ def test_issue519(): This test can take up to 45 seconds, and is thus marked as slow. 
""" # Determine the name of the parent module (which contains the test program) - parent_module_name = __name__[:__name__.rfind(".")] + parent_module_name = __name__[: __name__.rfind(".")] # Load test program that calls a Thinc API with variadic arguments program_text = importlib_resources.read_text(parent_module_name, "program.py") # Ask Mypy to type-check the loaded program text - subprocess.run(["mypy", "--command", program_text], check=True) + subprocess.run( + [sys.executable, "-m", "mypy", "--command", program_text], check=True + ) From 3eb5e52f2bf7e8483263f9c04a2f64e51ea2be35 Mon Sep 17 00:00:00 2001 From: Madeesh Kannan Date: Tue, 24 May 2022 15:44:43 +0200 Subject: [PATCH 16/78] `test_uniqued`: Disable test timing for `test_uniqued_doesnt_change_result` (#678) --- thinc/tests/layers/test_uniqued.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/thinc/tests/layers/test_uniqued.py b/thinc/tests/layers/test_uniqued.py index 41d98ca16..9cb207ca5 100644 --- a/thinc/tests/layers/test_uniqued.py +++ b/thinc/tests/layers/test_uniqued.py @@ -3,7 +3,7 @@ from thinc.layers import Embed from thinc.layers.uniqued import uniqued from numpy.testing import assert_allclose -from hypothesis import given +from hypothesis import given, settings from hypothesis.strategies import integers, lists, composite ROWS = 10 @@ -46,6 +46,7 @@ def test_uniqued_calls_init(): @given(X=lists_of_integers(lo=0, hi=ROWS - 1)) +@settings(deadline=None) def test_uniqued_doesnt_change_result(model, X): umodel = uniqued(model, column=model.attrs["column"]).initialize() Y, bp_Y = model(X, is_train=True) From b36492deb5a3aa353e2cca739d4858f36d3e48a8 Mon Sep 17 00:00:00 2001 From: Madeesh Kannan Date: Wed, 25 May 2022 15:46:07 +0200 Subject: [PATCH 17/78] `test_to_categorical`: Ensure that `label_smoothing < 0.5` (#680) * `test_to_categorical`: Ensure that `label_smoothing < 0.5` * Use `exclude_max` instead of clamping to `0.49` --- thinc/tests/test_util.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/thinc/tests/test_util.py b/thinc/tests/test_util.py index b51f23a19..715d381d5 100644 --- a/thinc/tests/test_util.py +++ b/thinc/tests/test_util.py @@ -46,7 +46,9 @@ def test_array_module_cpu_gpu_helpers(): assert not is_numpy_array((1, 2)) -@given(label_smoothing=strategies.floats(min_value=0.0, max_value=0.5)) +@given( + label_smoothing=strategies.floats(min_value=0.0, max_value=0.5, exclude_max=True) +) def test_to_categorical(label_smoothing): # Test without n_classes one_hot = to_categorical(numpy.asarray([1, 2], dtype="i")) From 6472d0a332a6cd850f6b511567dbcb8c0968c39e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Dani=C3=ABl=20de=20Kok?= Date: Fri, 27 May 2022 11:00:05 +0200 Subject: [PATCH 18/78] test_ops: do not lower precision in conversion to Torch tensor (#681) * test_ops: do not lower precision in conversion to Torch tensor float64 test values close to zero were rounded by conversion to a float32 Torch tensor, resuling in mismatches between Thinc and Torch gradients. This change prevents the loss in precision. 
* test_ops: compare arrays on same device in Torch comparison * test_maxout: compare arrays with same precision --- thinc/tests/backends/test_ops.py | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/thinc/tests/backends/test_ops.py b/thinc/tests/backends/test_ops.py index 5d9f17524..c8cb6a9df 100644 --- a/thinc/tests/backends/test_ops.py +++ b/thinc/tests/backends/test_ops.py @@ -216,7 +216,7 @@ def test_seq2col_window_one_small(ops): @given(X=strategies.arrays_BOP()) def test_maxout(ops, dtype, X): X = ops.asarray(X, dtype=dtype) - expected_best = X.max(axis=-1) + expected_best = X.max(axis=-1).astype(dtype) predicted_best, which = ops.maxout(X) assert predicted_best.dtype == dtype ops.xp.testing.assert_allclose( @@ -1259,22 +1259,19 @@ def test_ngrams(): def test_compare_activations_to_torch(ops, dtype, x, torch_func): import torch - def cast_torch(scalar: float): - return torch.tensor([scalar], requires_grad=True) - func_name, pytorch_func = torch_func forward = getattr(ops, func_name) backward = getattr(ops, "backprop_" + func_name) # The tolerance of isclose is set to 1e-06 instead of # the default 1e-08 due to the GELU x_thinc = ops.asarray([x], dtype=dtype) - x_torch = cast_torch(x) + x_torch = xp2torch(x_thinc, requires_grad=True) y = pytorch_func(x_torch) y_thinc = forward(x_thinc) y.backward() assert x_thinc.dtype == y_thinc.dtype assert ops.xp.isclose(y_thinc, forward(x_thinc, inplace=True), atol=1e-06) - assert ops.xp.isclose(y_thinc, y.detach().numpy(), atol=1e-06) + assert ops.xp.isclose(y_thinc, y.detach(), atol=1e-06) x_thinc = ops.asarray([x], dtype=dtype) dY_thinc = ops.asarray([1.0], dtype=dtype) dY_thinc_inplace = dY_thinc.copy() From c88e43c6f912da66d71e5d1e6298d4412d965066 Mon Sep 17 00:00:00 2001 From: shademe Date: Fri, 27 May 2022 11:23:16 +0200 Subject: [PATCH 19/78] Add `test_slow_gpu` explosion-bot command --- .github/workflows/explosionbot.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/explosionbot.yml b/.github/workflows/explosionbot.yml index 8749667c5..b32b65052 100644 --- a/.github/workflows/explosionbot.yml +++ b/.github/workflows/explosionbot.yml @@ -23,5 +23,5 @@ jobs: env: INPUT_TOKEN: ${{ secrets.EXPLOSIONBOT_TOKEN }} INPUT_BK_TOKEN: ${{ secrets.BUILDKITE_SECRET }} - ENABLED_COMMANDS: "test_gpu,test_slow" + ENABLED_COMMANDS: "test_gpu,test_slow,test_slow_gpu" ALLOWED_TEAMS: "spacy-maintainers" \ No newline at end of file From 145d7828d9fb2b6fda742b62cff3f454437f15bc Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" <41898282+github-actions[bot]@users.noreply.github.com> Date: Fri, 27 May 2022 12:15:59 +0200 Subject: [PATCH 20/78] Auto-format code with black (#682) Co-authored-by: explosion-bot --- thinc/backends/ops.py | 20 ++++++++--------- thinc/layers/gelu.py | 5 +++-- thinc/layers/hard_swish.py | 5 +++-- thinc/layers/hard_swish_mobilenet.py | 9 ++++---- thinc/layers/layernorm.py | 2 +- thinc/layers/swish.py | 5 +++-- thinc/model.py | 4 +++- thinc/optimizers.py | 8 +++---- thinc/tests/layers/test_combinators.py | 5 +---- thinc/tests/layers/test_pytorch_wrapper.py | 4 +++- thinc/tests/layers/test_reduce.py | 2 ++ thinc/tests/layers/test_with_transforms.py | 12 +++++----- thinc/tests/mypy/test_mypy.py | 1 + thinc/tests/test_loss.py | 26 ++++++++++++++++------ thinc/tests/test_serialize.py | 4 ++-- 15 files changed, 66 insertions(+), 46 deletions(-) diff --git a/thinc/backends/ops.py b/thinc/backends/ops.py index 2386e21cc..a7015d44d 100644 --- a/thinc/backends/ops.py +++ 
b/thinc/backends/ops.py @@ -229,56 +229,56 @@ def affine(self, X: Floats2d, W: Floats2d, b: Floats1d) -> Floats2d: Y += b return Y - @overload + @overload def flatten( self, X: List[Floats2d], dtype: Optional[DTypes] = None, pad: int = 0, ndim_if_empty: int = 2, - ) -> Floats2d: + ) -> Floats2d: ... - @overload + @overload def flatten( self, X: List[Ints1d], dtype: Optional[DTypes] = None, pad: int = 0, ndim_if_empty: int = 2, - ) -> Ints1d: + ) -> Ints1d: ... - @overload + @overload def flatten( self, X: List2d, dtype: Optional[DTypes] = None, pad: int = 0, ndim_if_empty: int = 2, - ) -> Array2d: + ) -> Array2d: ... # further specific typed signatures can be added as necessary - @overload + @overload def flatten( self, X: ListXd, dtype: Optional[DTypes] = None, pad: int = 0, ndim_if_empty: int = 2, - ) -> ArrayXd: + ) -> ArrayXd: ... - @overload + @overload def flatten( self, X: Sequence[ArrayXd], dtype: Optional[DTypes] = None, pad: int = 0, ndim_if_empty: int = 2, - ) -> ArrayXd: + ) -> ArrayXd: ... def flatten( diff --git a/thinc/layers/gelu.py b/thinc/layers/gelu.py index d49ac77a9..cdb0fb6ee 100644 --- a/thinc/layers/gelu.py +++ b/thinc/layers/gelu.py @@ -34,8 +34,9 @@ def Gelu( return model -def forward(model: Model[Floats2d, Floats2d], - X: Floats2d, is_train: bool) -> Tuple[Floats2d, Callable]: +def forward( + model: Model[Floats2d, Floats2d], X: Floats2d, is_train: bool +) -> Tuple[Floats2d, Callable]: W = cast(Floats2d, model.get_param("W")) b = cast(Floats1d, model.get_param("b")) Y_preact = model.ops.affine(X, W, b) diff --git a/thinc/layers/hard_swish.py b/thinc/layers/hard_swish.py index 81b1ad8dd..0478fd270 100644 --- a/thinc/layers/hard_swish.py +++ b/thinc/layers/hard_swish.py @@ -34,8 +34,9 @@ def HardSwish( return model -def forward(model: Model[Floats2d, Floats2d], - X: Floats2d, is_train: bool) -> Tuple[Floats2d, Callable]: +def forward( + model: Model[Floats2d, Floats2d], X: Floats2d, is_train: bool +) -> Tuple[Floats2d, Callable]: W = cast(Floats2d, model.get_param("W")) b = cast(Floats1d, model.get_param("b")) Y_preact = model.ops.affine(X, W, b) diff --git a/thinc/layers/hard_swish_mobilenet.py b/thinc/layers/hard_swish_mobilenet.py index 38004c848..6a5dce388 100644 --- a/thinc/layers/hard_swish_mobilenet.py +++ b/thinc/layers/hard_swish_mobilenet.py @@ -34,17 +34,16 @@ def HardSwishMobilenet( return model -def forward(model: Model[Floats2d, Floats2d], - X: Floats2d, is_train: bool) -> Tuple[Floats2d, Callable]: +def forward( + model: Model[Floats2d, Floats2d], X: Floats2d, is_train: bool +) -> Tuple[Floats2d, Callable]: W = cast(Floats2d, model.get_param("W")) b = cast(Floats1d, model.get_param("b")) Y_preact = model.ops.affine(X, W, b) Y = model.ops.hard_swish_mobilenet(Y_preact) def backprop(dY: Floats2d) -> Floats2d: - dY = model.ops.backprop_hard_swish_mobilenet(dY, - Y_preact, - inplace=False) + dY = model.ops.backprop_hard_swish_mobilenet(dY, Y_preact, inplace=False) model.inc_grad("b", dY.sum(axis=0)) model.inc_grad("W", model.ops.gemm(dY, X, trans1=True)) return model.ops.gemm(dY, W) diff --git a/thinc/layers/layernorm.py b/thinc/layers/layernorm.py index cf22015ed..684489c54 100644 --- a/thinc/layers/layernorm.py +++ b/thinc/layers/layernorm.py @@ -17,7 +17,7 @@ def LayerNorm(nI: Optional[int] = None) -> Model[InT, InT]: forward, init=init, dims={"nI": nI, "nO": nI}, - params={"G": None, "b": None} + params={"G": None, "b": None}, ) diff --git a/thinc/layers/swish.py b/thinc/layers/swish.py index ea5444b49..a05a0dc72 100644 --- a/thinc/layers/swish.py 
+++ b/thinc/layers/swish.py @@ -34,8 +34,9 @@ def Swish( return model -def forward(model: Model[Floats2d, Floats2d], - X: Floats2d, is_train: bool) -> Tuple[Floats2d, Callable]: +def forward( + model: Model[Floats2d, Floats2d], X: Floats2d, is_train: bool +) -> Tuple[Floats2d, Callable]: W = cast(Floats2d, model.get_param("W")) b = cast(Floats1d, model.get_param("b")) Y_preact = model.ops.affine(X, W, b) diff --git a/thinc/model.py b/thinc/model.py index 261858658..08366523e 100644 --- a/thinc/model.py +++ b/thinc/model.py @@ -464,7 +464,9 @@ def copy(self: SelfT) -> SelfT: """ return self._copy() - def _copy(self: SelfT, seen: Optional[Dict[int, Union["Model", Shim]]] = None) -> SelfT: + def _copy( + self: SelfT, seen: Optional[Dict[int, Union["Model", Shim]]] = None + ) -> SelfT: if seen is None: seen = {} params = {} diff --git a/thinc/optimizers.py b/thinc/optimizers.py index c8e38e84b..f34cd2ff8 100644 --- a/thinc/optimizers.py +++ b/thinc/optimizers.py @@ -279,7 +279,7 @@ def _radam(self, ops, weights, grad, lr_scale, key, nr_upd): # exp_avg_sq.mul_(beta2).addcmul_(1 - beta2, grad, grad) exp_avg_sq *= beta2 - exp_avg_sq += (1 - beta2) * (gradient_1D ** 2) + exp_avg_sq += (1 - beta2) * (gradient_1D**2) # exp_avg.mul_(beta1).add_(1 - beta1, grad) exp_avg *= beta1 exp_avg += (1 - beta1) * gradient_1D @@ -338,9 +338,9 @@ def _adam(self, ops, weights, gradient, lr_scale, key, nr_upd): mom2 = self.mom2[key] b1 = self.b1 b2 = self.b2 - fix1 = 1.0 - (b1 ** nr_upd) - fix2 = 1.0 - (b2 ** nr_upd) - lr = self.learn_rate * fix2 ** 0.5 / fix1 + fix1 = 1.0 - (b1**nr_upd) + fix2 = 1.0 - (b2**nr_upd) + lr = self.learn_rate * fix2**0.5 / fix1 eps = self.eps # needs to be 1D going into the adam function weights_1D, gradient_1D, mom1, mom2 = ops.adam( diff --git a/thinc/tests/layers/test_combinators.py b/thinc/tests/layers/test_combinators.py index ed9a2992a..ea5583108 100644 --- a/thinc/tests/layers/test_combinators.py +++ b/thinc/tests/layers/test_combinators.py @@ -271,10 +271,7 @@ def test_concatenate(): def test_map_list(): nI = 4 nO = 9 - Xs = [ - numpy.zeros((6, nI), dtype="f"), - numpy.ones((3, nI), dtype="f") - ] + Xs = [numpy.zeros((6, nI), dtype="f"), numpy.ones((3, nI), dtype="f")] Y_shapes = [(x.shape[0], nO) for x in Xs] model = map_list(Linear()) model.initialize(X=Xs, Y=[numpy.zeros(shape, dtype="f") for shape in Y_shapes]) diff --git a/thinc/tests/layers/test_pytorch_wrapper.py b/thinc/tests/layers/test_pytorch_wrapper.py index e6f4edfb4..fc4396370 100644 --- a/thinc/tests/layers/test_pytorch_wrapper.py +++ b/thinc/tests/layers/test_pytorch_wrapper.py @@ -64,7 +64,9 @@ def test_pytorch_wrapper(nN, nI, nO): assert isinstance(model.predict(X), numpy.ndarray) -@pytest.mark.skipif(not has_cupy or not has_torch_gpu, reason="needs PyTorch with CUDA-capable GPU") +@pytest.mark.skipif( + not has_cupy or not has_torch_gpu, reason="needs PyTorch with CUDA-capable GPU" +) @pytest.mark.parametrize("nN,nI,nO", [(2, 3, 4)]) @pytest.mark.parametrize("mixed_precision", TORCH_MIXED_PRECISION) def test_pytorch_wrapper_thinc_input(nN, nI, nO, mixed_precision): diff --git a/thinc/tests/layers/test_reduce.py b/thinc/tests/layers/test_reduce.py index ba829f779..d26065c4a 100644 --- a/thinc/tests/layers/test_reduce.py +++ b/thinc/tests/layers/test_reduce.py @@ -92,6 +92,7 @@ def test_reduce_mean(Xs): dX = backprop(Y) assert dX.dataXd.shape == X.dataXd.shape + def test_reduce_sum(Xs): model = reduce_sum() lengths = model.ops.asarray([x.shape[0] for x in Xs], dtype="i") @@ -107,6 +108,7 @@ def 
test_reduce_sum(Xs): dX = backprop(Y) assert dX.dataXd.shape == X.dataXd.shape + def test_size_mismatch(Xs): for reduce in [reduce_first, reduce_last, reduce_max, reduce_mean, reduce_sum]: model = reduce() diff --git a/thinc/tests/layers/test_with_transforms.py b/thinc/tests/layers/test_with_transforms.py index a01e20567..c23db1463 100644 --- a/thinc/tests/layers/test_with_transforms.py +++ b/thinc/tests/layers/test_with_transforms.py @@ -26,8 +26,8 @@ def list_input(shapes): for i, x in enumerate(data): # Give values that make it easy to see where rows or columns mismatch. x += i * 100 - x += numpy.arange(x.shape[0]).reshape((-1, 1)) * 10 - x += numpy.arange(x.shape[1]).reshape((1, -1)) + x += numpy.arange(x.shape[0]).reshape((-1, 1)) * 10 + x += numpy.arange(x.shape[1]).reshape((1, -1)) return data @@ -68,8 +68,10 @@ def noop_models(): with_array(noop()), with_array2d(noop()), with_list(noop()), - with_ragged(noop()) + with_ragged(noop()), ] + + # As an example operation, lets just trim the last dimension. That # should catch stuff that confuses the input and output. @@ -180,14 +182,14 @@ def test_noop_transforms(noop_models, ragged_input, padded_input, list_input): d_ragged = Ragged(ragged_input.data + 1, ragged_input.lengths) d_padded = padded_input.copy() d_padded.data += 1 - d_list = [dx+1 for dx in list_input] + d_list = [dx + 1 for dx in list_input] for model in noop_models: print(model.name) check_transform_doesnt_change_noop_values(model, padded_input, d_padded) check_transform_doesnt_change_noop_values(model, list_input, d_list) check_transform_doesnt_change_noop_values(model, ragged_input, d_ragged) - + def test_with_array_initialize(ragged_input, padded_input, list_input, array_input): for inputs in (ragged_input, padded_input, list_input, array_input): check_initialize(get_array_model(), inputs) diff --git a/thinc/tests/mypy/test_mypy.py b/thinc/tests/mypy/test_mypy.py index 287043578..e03d1c874 100644 --- a/thinc/tests/mypy/test_mypy.py +++ b/thinc/tests/mypy/test_mypy.py @@ -23,6 +23,7 @@ def test_mypy_results( ): pytest.importorskip("mypy") from mypy import api as mypy_api + os.chdir(tmpdir) root_dir = Path(__file__).parent thinc_root_dir = Path(__file__).parent.parent.parent.parent diff --git a/thinc/tests/test_loss.py b/thinc/tests/test_loss.py index 710a88d61..75206d240 100644 --- a/thinc/tests/test_loss.py +++ b/thinc/tests/test_loss.py @@ -168,7 +168,9 @@ def test_sequence_categorical_crossentropy(guesses, labels, names): assert d_scores1[1][0] == pytest.approx(0.4, eps) assert d_scores1[1][1] == pytest.approx(-0.4, eps) # The normalization divides the difference (e.g. 
0.4) by the number of seqs - d_scores = SequenceCategoricalCrossentropy(normalize=True, names=names).get_grad(guesses, labels) + d_scores = SequenceCategoricalCrossentropy(normalize=True, names=names).get_grad( + guesses, labels + ) d_scores1 = d_scores[0] d_scores2 = d_scores[1] @@ -189,7 +191,9 @@ def test_sequence_categorical_crossentropy(guesses, labels, names): assert d_scores2[0][0] == pytest.approx(0.1, eps) assert d_scores2[0][1] == pytest.approx(-0.35, eps) - loss = SequenceCategoricalCrossentropy(normalize=True, names=names).get_loss(guesses, labels) + loss = SequenceCategoricalCrossentropy(normalize=True, names=names).get_loss( + guesses, labels + ) assert loss == pytest.approx(1.09, eps) @@ -200,9 +204,9 @@ def test_sequence_categorical_crossentropy(guesses, labels, names): ], ) def test_sequence_categorical_missing_negative(guesses, labels, names): - d_scores = SequenceCategoricalCrossentropy(normalize=False, names=names, neg_prefix="!", missing_value="").get_grad( - guesses, labels - ) + d_scores = SequenceCategoricalCrossentropy( + normalize=False, names=names, neg_prefix="!", missing_value="" + ).get_grad(guesses, labels) d_scores0 = d_scores[0] # [0.1, 0.5, 0.6] should be A @@ -292,8 +296,16 @@ def test_cosine_unmatched(): ("SequenceCategoricalCrossentropy.v1", {}, ([scores0], [labels0])), ("CategoricalCrossentropy.v2", {"neg_prefix": "!"}, (scores0, labels0)), ("CategoricalCrossentropy.v3", {"neg_prefix": "!"}, (scores0, labels0)), - ("SequenceCategoricalCrossentropy.v2", {"neg_prefix": "!"}, ([scores0], [labels0])), - ("SequenceCategoricalCrossentropy.v3", {"neg_prefix": "!"}, ([scores0], [labels0])), + ( + "SequenceCategoricalCrossentropy.v2", + {"neg_prefix": "!"}, + ([scores0], [labels0]), + ), + ( + "SequenceCategoricalCrossentropy.v3", + {"neg_prefix": "!"}, + ([scores0], [labels0]), + ), ("L2Distance.v1", {}, (scores0, scores0)), ( "CosineDistance.v1", diff --git a/thinc/tests/test_serialize.py b/thinc/tests/test_serialize.py index f3a937c34..b89fc2d94 100644 --- a/thinc/tests/test_serialize.py +++ b/thinc/tests/test_serialize.py @@ -55,7 +55,7 @@ def test_simple_model_roundtrip_bytes(): def test_simple_model_roundtrip_bytes_length(): - """ Ensure that serialization of non-initialized weight matrices goes fine """ + """Ensure that serialization of non-initialized weight matrices goes fine""" model1 = Maxout(5, 10, nP=2) model2 = Maxout(5, 10, nP=2) @@ -186,7 +186,7 @@ def test_simple_model_can_from_dict(): assert model.can_from_dict(model_dict) # Test check without initialize assert Maxout(5, 10, nP=2).can_from_dict(model_dict) - # Test not-strict check + # Test not-strict check assert not Maxout(10, 5, nP=2).can_from_dict(model_dict) assert Maxout(5, nP=2).can_from_dict(model_dict) From 6cc4d9783c33030fdb701789d86a2d133c8b8d53 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Danie=CC=88l=20de=20Kok?= Date: Mon, 30 May 2022 10:55:58 +0200 Subject: [PATCH 21/78] Azure: pin protobuf to fix Tensorflow --- azure-pipelines.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/azure-pipelines.yml b/azure-pipelines.yml index 301ab7341..08d5ed93c 100644 --- a/azure-pipelines.yml +++ b/azure-pipelines.yml @@ -82,7 +82,7 @@ jobs: - script: | pip install -r requirements.txt - pip install "tensorflow~=2.5.0" + pip install "protobuf~=3.20.0" "tensorflow~=2.5.0" pip install "mxnet; sys_platform != 'win32'" pip install "torch==1.9.0+cpu" -f https://download.pytorch.org/whl/torch_stable.html pip install ipykernel pydot graphviz From abf7d31682cef9edbb803eea8371d116d905802a 
Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Thu, 2 Jun 2022 09:10:46 +0200 Subject: [PATCH 22/78] Extend typing_extensions to <4.2.0 (#689) --- requirements.txt | 4 ++-- setup.cfg | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/requirements.txt b/requirements.txt index d532e0f9f..5f35d8e91 100644 --- a/requirements.txt +++ b/requirements.txt @@ -12,7 +12,7 @@ pydantic>=1.7.4,!=1.8,!=1.8.1,<1.10.0 numpy>=1.15.0 # Backports of modern Python features dataclasses>=0.6,<1.0; python_version < "3.7" -typing_extensions>=3.7.4.1,<4.0.0.0; python_version < "3.8" +typing_extensions>=3.7.4.1,<4.2.0; python_version < "3.8" contextvars>=2.4,<3; python_version < "3.7" # Development dependencies cython>=0.25.0,<3.0 @@ -33,4 +33,4 @@ nbconvert>=5.6.1,<6.2.0 nbformat>=5.0.4,<5.2.0 # Test to_disk/from_disk against pathlib.Path subclasses pathy>=0.3.5 -black>=22.0,<23.0 \ No newline at end of file +black>=22.0,<23.0 diff --git a/setup.cfg b/setup.cfg index 5351aa433..adc3dac92 100644 --- a/setup.cfg +++ b/setup.cfg @@ -51,7 +51,7 @@ install_requires = pydantic>=1.7.4,!=1.8,!=1.8.1,<1.10.0 # Backports of modern Python features dataclasses>=0.6,<1.0; python_version < "3.7" - typing_extensions>=3.7.4.1,<4.0.0.0; python_version < "3.8" + typing_extensions>=3.7.4.1,<4.2.0; python_version < "3.8" contextvars>=2.4,<3; python_version < "3.7" [options.extras_require] From 46334b55c03acae3f96ce8bf3647302256f2bdd7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Dani=C3=ABl=20de=20Kok?= Date: Tue, 7 Jun 2022 14:17:14 +0200 Subject: [PATCH 23/78] xp2{tensorflow,torch}: convert NumPy arrays using dlpack (#686) * xp2{tensorflow,torch}: convert NumPy arrays using dlpack Newer versions of NumPy can expose arrays as dlpack capsules. Use this functionality (when supported) to speed up NumPy -> Torch/Tensorflow array conversion. 
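As a rough sketch of the Torch side of this change (assuming NumPy >= 1.22, which exposes `__dlpack__` on arrays, and a correspondingly recent Torch):

```
import numpy
import torch
import torch.utils.dlpack

x = numpy.arange(8, dtype="float32")

if hasattr(x, "__dlpack__"):
    # Zero-copy: Torch wraps the dlpack capsule exposed by NumPy.
    t = torch.utils.dlpack.from_dlpack(x)
else:
    # Fallback for older NumPy; also zero-copy, but NumPy-specific.
    t = torch.from_numpy(x)

t[0] = 42.0
assert x[0] == 42.0  # both views share the same buffer
```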
* Fix up copy paste error --- thinc/util.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/thinc/util.py b/thinc/util.py index cdfdf4c1e..6ace7e7ed 100644 --- a/thinc/util.py +++ b/thinc/util.py @@ -314,6 +314,8 @@ def xp2torch( if hasattr(xp_tensor, "toDlpack"): dlpack_tensor = xp_tensor.toDlpack() # type: ignore torch_tensor = torch.utils.dlpack.from_dlpack(dlpack_tensor) + elif hasattr(xp_tensor, "__dlpack__"): + torch_tensor = torch.utils.dlpack.from_dlpack(xp_tensor) else: torch_tensor = torch.from_numpy(xp_tensor) if requires_grad: @@ -350,6 +352,8 @@ def xp2tensorflow( if hasattr(xp_tensor, "toDlpack"): dlpack_tensor = xp_tensor.toDlpack() # type: ignore tf_tensor = tf.experimental.dlpack.from_dlpack(dlpack_tensor) + elif hasattr(xp_tensor, "__dlpack__"): + tf_tensor = tf.experimental.dlpack.from_dlpack(xp_tensor) else: tf_tensor = tf.convert_to_tensor(xp_tensor) if as_variable: From 1c6e9f48e79a877ee27bbc17a5eb4fcdb62e6b14 Mon Sep 17 00:00:00 2001 From: Madeesh Kannan Date: Wed, 8 Jun 2022 15:53:02 +0200 Subject: [PATCH 24/78] `test_model_gpu`: Use TF memory pool if available, feature-gate test (#688) * `test_model_gpu`: Use TF memory pool if available, feature-gate test * Fix typo * `test_predict_extensive`: Disable test time monitoring * Fix imports, use `has_cupy_gpu` for forward-compat * `conftest`: Use `pytest_sessionstart` to enable TF GPU memory growth --- thinc/tests/conftest.py | 18 ++++++++++++++++++ thinc/tests/layers/test_linear.py | 1 + thinc/tests/model/test_model.py | 7 ++----- 3 files changed, 21 insertions(+), 5 deletions(-) diff --git a/thinc/tests/conftest.py b/thinc/tests/conftest.py index aac18a860..8c1e21f0a 100644 --- a/thinc/tests/conftest.py +++ b/thinc/tests/conftest.py @@ -1,6 +1,24 @@ import pytest +def pytest_sessionstart(session): + # If Tensorflow is installed, attempt to enable memory growth + # to prevent it from allocating all of the GPU's free memory + # to its internal memory pool(s). + try: + import tensorflow as tf + + physical_devices = tf.config.list_physical_devices("GPU") + for device in physical_devices: + try: + tf.config.experimental.set_memory_growth(device, True) + except: + # Invalid device or cannot modify virtual devices once initialized. 
+ print(f"failed to enable Tensorflow memory growth on {device}") + except ImportError: + pass + + def pytest_addoption(parser): try: parser.addoption("--slow", action="store_true", help="include slow tests") diff --git a/thinc/tests/layers/test_linear.py b/thinc/tests/layers/test_linear.py index ef00d77b6..2362b556b 100644 --- a/thinc/tests/layers/test_linear.py +++ b/thinc/tests/layers/test_linear.py @@ -87,6 +87,7 @@ def test_predict_small(W_b_input): @given(arrays_OI_O_BI(max_batch=20, max_out=30, max_in=30)) +@settings(deadline=None) def test_predict_extensive(W_b_input): W, b, input_ = W_b_input nr_out, nr_in = W.shape diff --git a/thinc/tests/model/test_model.py b/thinc/tests/model/test_model.py index 087e481c5..733b3329f 100644 --- a/thinc/tests/model/test_model.py +++ b/thinc/tests/model/test_model.py @@ -404,15 +404,12 @@ def get_model_id(id_list, index): assert len(list_of_ids) == len(list(set(list_of_ids))) +@pytest.mark.skipif(not has_cupy_gpu, reason="needs CuPy GPU") def test_model_gpu(): pytest.importorskip("ml_datasets") import ml_datasets - ops = "cpu" - if has_cupy_gpu: - ops = "cupy" - - with use_ops(ops): + with use_ops("cupy"): n_hidden = 32 dropout = 0.2 (train_X, train_Y), (dev_X, dev_Y) = ml_datasets.mnist() From be65301a72447827a6bdf1c02c0c21e88b8cdcd4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Dani=C3=ABl=20de=20Kok?= Date: Thu, 9 Jun 2022 16:04:29 +0200 Subject: [PATCH 25/78] Bump version to v8.1.0.dev1 (#694) --- thinc/about.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/thinc/about.py b/thinc/about.py index 5af1e2bfa..4c2cc699f 100644 --- a/thinc/about.py +++ b/thinc/about.py @@ -1,2 +1,2 @@ -__version__ = "v8.1.0.dev0" +__version__ = "v8.1.0.dev1" __release__ = True From 8d9405f804eb732de34ca1eadcec7dcef285c84f Mon Sep 17 00:00:00 2001 From: Madeesh Kannan Date: Tue, 14 Jun 2022 15:09:36 +0200 Subject: [PATCH 26/78] `NumpyOps`: Do not use global for `CBlas` (#697) --- thinc/backends/numpy_ops.pyx | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/thinc/backends/numpy_ops.pyx b/thinc/backends/numpy_ops.pyx index f30db6630..0938ffedf 100644 --- a/thinc/backends/numpy_ops.pyx +++ b/thinc/backends/numpy_ops.pyx @@ -31,9 +31,6 @@ except ImportError: has_blis = False -cblas = CBlas() - - ctypedef float weight_t @@ -88,7 +85,7 @@ class NumpyOps(Ops): return self.xp.empty(shape, dtype=dtype) def cblas(self) -> CBlas: - return cblas + return CBlas() def gemm(self, np.ndarray x, np.ndarray y, *, np.ndarray out=None, trans1=False, trans2=False): if x.ndim != 2: From 862a489333ceb4ba6c1c985df3b1e1d7d54b44de Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Dani=C3=ABl=20de=20Kok?= Date: Tue, 14 Jun 2022 15:35:37 +0200 Subject: [PATCH 27/78] Merge pytorch-device branch into master (#695) * Remove use of `torch.set_default_tensor_type` (#674) * Remove use of `torch.set_default_tensor_type` This PR removes use of `torch.set_default_tensor_type`. There are various reasons why we should probably move away from using this function: - Upstream will deprecate and remove it: https://github.com/pytorch/pytorch/issues/53124 - We cannot use this mechanism for other devices than CPU/CUDA, such as Metal Performance Shaders. - It offers little flexibility in allocating Torch models on different devices. This PR makes `PyTorchWrapper`/`PyTorchShim` flexible in terms of the devices it can use. Both classes add a `device` argument to their constructors that takes a `torch.device` instance. The shim ensures that the model is on the given device. 
The wrapper ensures that input tensors are on the correct device by calling `xp2torch` with the new `device` keyword argument. Even though this approach offers more flexibility, as a default we want to use the `cpu` device when `NumpyOps` is used and `cuda:N` when `CupyOps` is used. In order to do so, this PR also adds a new function `get_torch_default_device` that returns the correct device for the currently active Ops. `PyTorchWrapper`/`PyTorchShim`/`xp2torch` use this function when `None` is given as the device to fall back on this default, mimicking the behavior from before this PR. * Add some typing fixes * Remove spurious cupy import * Small fixes - Use `torch.cuda.current_device()` to get the current PyTorch CUDA device. - Do not use `torch_set_default_tensor_type` in `set_active_gpu`. * Add `test_slow_gpu` explosion-bot command * Auto-format code with black (#682) Co-authored-by: explosion-bot * Azure: pin protobuf to fix Tensorflow * Extend typing_extensions to <4.2.0 (#689) * Add support for PyTorch Metal Performance Shaders (#685) * Add `test_slow_gpu` explosion-bot command * Auto-format code with black (#682) Co-authored-by: explosion-bot * Add support for PyTorch Metal Performance Shaders Nightly PyTorch versions add support for Metal Performance Shaders (MPS). Metal is a low-level graphics API for Apple platforms that also supports compute kernels (shaders). MPS is a framework of highly-optimized compute and graphics kernels, including kernels for neural networks. MPS is supported both on Apple Silicon, such as the M1 family of SoCs, and on a range of AMD GPUs used in Macs. Since devices are handled in Thinc through a specific `Ops` implementation (e.g. `CupyOps` == CUDA GPUs), this change introduces the `MPSOps` class. This class is a subclass of `NumpyOps` or `AppleOps` (when available). `MPSOps` does not override any methods, but is used to signal to relevant code paths (e.g. `xp2torch`) that Torch tensors should be placed on the MPS device. The mapping in the previously introduced `get_torch_default_device` function is updated to: - `NumpyOps` -> `cpu` - `CupyOps` -> `cuda:N`, where N is the selected CUDA device. - `MPSOps` -> `mps` to ensure placement of Torch tensors on the `mps` device when `MPSOps` is active. Finally, the following booleans have been added to or changed in `compat`: - `has_torch_mps` (new): PyTorch has MPS support - `has_torch_mps_gpu` (new): PyTorch has MPS support and an MPS-capable GPU is available. - `has_torch_cuda_gpu` (new): PyTorch has CUDA support and a CUDA-capable GPU is available. - `has_torch_gpu` (changed): PyTorch has a GPU available (CUDA or MPS). * Test PyTorch wrapper with all xp ops * Azure: pin protobuf to fix Tensorflow * Extend typing_extensions to <4.2.0 (#689) * Fix type checking error * Only back off to NumpyOps on import error We do not want to hide other issues while importing thinc_apple_ops. * Remove unneeded `has_torch_mps` bool * Add `has_gpu` bool and use it in `util` * Replace another expression with has_gpu * Set `has_torch_gpu` to `has_torch_cuda_gpu` We need to decide whether we want to make the potentially breaking change from `has_torch_cuda_gpu` to `has_torch_cuda_gpu or has_torch_mps_gpu`. But since the latter is not needed for this PR, remove the change.
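In practice the device resolution is transparent to callers; the following usage sketch assumes only the `device` argument shown in the diff below, with all concrete values illustrative:

```
import torch
from thinc.api import PyTorchWrapper_v2

# Explicit placement ...
model = PyTorchWrapper_v2(torch.nn.Linear(4, 4), device=torch.device("cpu"))

# ... or leave device=None (the default) and let the shim pick the
# default device for the active ops: NumpyOps -> cpu,
# CupyOps -> cuda:N, MPSOps -> mps.
model = PyTorchWrapper_v2(torch.nn.Linear(4, 4))
```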
* Update thinc/util.py Co-authored-by: Sofie Van Landeghem Co-authored-by: shademe Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com> Co-authored-by: explosion-bot Co-authored-by: Adriane Boyd Co-authored-by: Sofie Van Landeghem Co-authored-by: shademe Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com> Co-authored-by: explosion-bot Co-authored-by: Adriane Boyd Co-authored-by: Sofie Van Landeghem --- examples/transformers_tagger.py | 9 +- thinc/api.py | 2 +- thinc/backends/__init__.py | 9 +- thinc/backends/_cupy_allocators.py | 7 +- thinc/backends/cupy_ops.py | 4 +- thinc/backends/mps_ops.py | 26 ++++++ thinc/compat.py | 13 ++- thinc/layers/pytorchwrapper.py | 30 ++++-- thinc/shims/pytorch.py | 40 +++++--- thinc/tests/layers/test_pytorch_wrapper.py | 38 ++++++-- thinc/tests/regression/test_issue564.py | 4 +- thinc/tests/shims/test_pytorch_grad_scaler.py | 6 +- thinc/util.py | 93 ++++++++++++------- 13 files changed, 203 insertions(+), 78 deletions(-) create mode 100644 thinc/backends/mps_ops.py diff --git a/examples/transformers_tagger.py b/examples/transformers_tagger.py index 88052ba1f..058d5af24 100644 --- a/examples/transformers_tagger.py +++ b/examples/transformers_tagger.py @@ -132,7 +132,9 @@ def forward( return TokensPlus(**token_data), lambda d_tokens: [] return Model( - "tokenizer", forward, attrs={"tokenizer": AutoTokenizer.from_pretrained(name)}, + "tokenizer", + forward, + attrs={"tokenizer": AutoTokenizer.from_pretrained(name)}, ) @@ -166,11 +168,14 @@ def convert_transformer_outputs(model, inputs_outputs, is_train): def backprop(d_tokvecs: List[Floats2d]) -> ArgsKwargs: # Restore entries for bos and eos markers. + shim = model.shims[0] row = model.ops.alloc2f(1, d_tokvecs[0].shape[1]) d_tokvecs = [model.ops.xp.vstack((row, arr, row)) for arr in d_tokvecs] return ArgsKwargs( args=(torch_tokvecs,), - kwargs={"grad_tensors": xp2torch(model.ops.pad(d_tokvecs))}, + kwargs={ + "grad_tensors": xp2torch(model.ops.pad(d_tokvecs, device=shim.device)) + }, ) return tokvecs, backprop diff --git a/thinc/api.py b/thinc/api.py index 204f39b4e..7c8702cd3 100644 --- a/thinc/api.py +++ b/thinc/api.py @@ -18,7 +18,7 @@ from .util import torch2xp, xp2torch, tensorflow2xp, xp2tensorflow, mxnet2xp, xp2mxnet from .compat import has_cupy from .backends import get_ops, set_current_ops, get_current_ops, use_ops -from .backends import Ops, CupyOps, NumpyOps, set_gpu_allocator +from .backends import Ops, CupyOps, MPSOps, NumpyOps, set_gpu_allocator from .backends import use_pytorch_for_gpu_memory, use_tensorflow_for_gpu_memory from .layers import Dropout, Embed, expand_window, HashEmbed, LayerNorm, Linear diff --git a/thinc/backends/__init__.py b/thinc/backends/__init__.py index d1219796d..c21620126 100644 --- a/thinc/backends/__init__.py +++ b/thinc/backends/__init__.py @@ -7,10 +7,11 @@ from .ops import Ops from .cupy_ops import CupyOps from .numpy_ops import NumpyOps +from .mps_ops import MPSOps from ._cupy_allocators import cupy_tensorflow_allocator, cupy_pytorch_allocator from ._param_server import ParamServer from ..util import assert_tensorflow_installed, assert_pytorch_installed -from ..util import is_cupy_array, set_torch_tensor_type_for_ops, require_cpu +from ..util import get_torch_default_device, is_cupy_array, require_cpu from .. 
import registry from ..compat import cupy, has_cupy @@ -48,6 +49,10 @@ def use_pytorch_for_gpu_memory() -> None: # pragma: no cover (or vice versa), but do not currently have an implementation for it. """ assert_pytorch_installed() + + if get_torch_default_device().type != "cuda": + return + pools = context_pools.get() if "pytorch" not in pools: pools["pytorch"] = cupy.cuda.MemoryPool(allocator=cupy_pytorch_allocator) @@ -134,7 +139,6 @@ def set_current_ops(ops: Ops) -> None: """Change the current backend object.""" context_ops.set(ops) _get_thread_state().ops = ops - set_torch_tensor_type_for_ops(ops) def contextvars_eq_thread_ops() -> bool: @@ -170,6 +174,7 @@ def _create_thread_local( "ParamServer", "Ops", "CupyOps", + "MPSOps", "NumpyOps", "has_cupy", ] diff --git a/thinc/backends/_cupy_allocators.py b/thinc/backends/_cupy_allocators.py index a9f000c6c..f2b6faee9 100644 --- a/thinc/backends/_cupy_allocators.py +++ b/thinc/backends/_cupy_allocators.py @@ -1,7 +1,7 @@ from typing import cast from ..types import ArrayXd -from ..util import tensorflow2xp +from ..util import get_torch_default_device, tensorflow2xp from ..compat import torch, cupy, tensorflow @@ -23,6 +23,7 @@ def cupy_tensorflow_allocator(size_in_bytes: int): def cupy_pytorch_allocator(size_in_bytes: int): + device = get_torch_default_device() """Function that can be passed into cupy.cuda.set_allocator, to have cupy allocate memory via PyTorch. This is important when using the two libraries together, as otherwise OOM errors can occur when there's available memory @@ -34,7 +35,9 @@ def cupy_pytorch_allocator(size_in_bytes: int): # creating a whole Tensor. # This turns out to be way faster than making FloatStorage? Maybe # a Python vs C++ thing I guess? - torch_tensor = torch.zeros((size_in_bytes // 4,), requires_grad=False) + torch_tensor = torch.zeros( + (size_in_bytes // 4,), requires_grad=False, device=device + ) # cupy has a neat class to help us here. Otherwise it will try to free. # I think this is a private API? It's not in the types. address = torch_tensor.data_ptr() # type: ignore diff --git a/thinc/backends/cupy_ops.py b/thinc/backends/cupy_ops.py index 18e448dfd..e77d525ed 100644 --- a/thinc/backends/cupy_ops.py +++ b/thinc/backends/cupy_ops.py @@ -6,7 +6,7 @@ from ..types import DeviceTypes from ..util import torch2xp, tensorflow2xp, mxnet2xp from ..util import is_cupy_array -from ..util import is_torch_gpu_array, is_tensorflow_gpu_array, is_mxnet_gpu_array +from ..util import is_torch_cuda_array, is_tensorflow_gpu_array, is_mxnet_gpu_array from ..compat import cupy, cupyx @@ -62,7 +62,7 @@ def asarray(self, data, dtype=None): # We'll try to perform a zero-copy conversion if possible. if is_cupy_array(data): array = data - elif is_torch_gpu_array(data): + elif is_torch_cuda_array(data): array = torch2xp(data) elif is_tensorflow_gpu_array(data): array = tensorflow2xp(data) diff --git a/thinc/backends/mps_ops.py b/thinc/backends/mps_ops.py new file mode 100644 index 000000000..8ebbd4e4b --- /dev/null +++ b/thinc/backends/mps_ops.py @@ -0,0 +1,26 @@ +from typing import TYPE_CHECKING +import numpy + +from .. import registry +from . import NumpyOps, Ops + +if TYPE_CHECKING: + # Type checking does not work with dynamic base classes, since MyPy cannot + # determine against which base class to check. So, always derive from Ops + # during type checking. 
+ _Ops = Ops +else: + try: + from thinc_apple_ops import AppleOps + + _Ops = AppleOps + except ImportError: + _Ops = NumpyOps + + +@registry.ops("MPSOps") +class MPSOps(_Ops): + """Ops class for Metal Performance shaders.""" + + name = "mps" + xp = numpy diff --git a/thinc/compat.py b/thinc/compat.py index c858a7199..75bfe69e6 100644 --- a/thinc/compat.py +++ b/thinc/compat.py @@ -31,7 +31,13 @@ import torch has_torch = True - has_torch_gpu = torch.cuda.device_count() != 0 + has_torch_cuda_gpu = torch.cuda.device_count() != 0 + has_torch_mps_gpu = ( + hasattr(torch, "has_mps") + and torch.has_mps + and torch.backends.mps.is_available() + ) + has_torch_gpu = has_torch_cuda_gpu torch_version = Version(str(torch.__version__)) has_torch_amp = ( torch_version >= Version("1.9.0") @@ -40,7 +46,9 @@ except ImportError: # pragma: no cover torch = None # type: ignore has_torch = False + has_torch_cuda_gpu = False has_torch_gpu = False + has_torch_mps_gpu = False has_torch_amp = False torch_version = Version("0.0.0") @@ -68,3 +76,6 @@ import h5py except ImportError: # pragma: no cover h5py = None + + +has_gpu = has_cupy_gpu or has_torch_mps_gpu diff --git a/thinc/layers/pytorchwrapper.py b/thinc/layers/pytorchwrapper.py index 8e05856bb..882132dcb 100644 --- a/thinc/layers/pytorchwrapper.py +++ b/thinc/layers/pytorchwrapper.py @@ -1,9 +1,10 @@ from typing import Callable, Tuple, Optional, Any, cast +from ..compat import torch from ..model import Model from ..shims import PyTorchGradScaler, PyTorchShim from ..config import registry -from ..util import is_xp_array, is_torch_array +from ..util import is_xp_array, is_torch_array, partial from ..util import xp2torch, torch2xp, convert_recursive from ..types import Floats3d, ArgsKwargs, Padded @@ -76,6 +77,7 @@ def PyTorchWrapper_v2( convert_outputs: Optional[Callable] = None, mixed_precision: bool = False, grad_scaler: Optional[PyTorchGradScaler] = None, + device: Optional["torch.device"] = None, ) -> Model[Any, Any]: """Wrap a PyTorch model, so that it has the same API as Thinc models. To optimize the model, you'll need to create a PyTorch optimizer and call @@ -105,6 +107,10 @@ def PyTorchWrapper_v2( The gradient scaler to use for mixed-precision training. If this argument is set to "None" and mixed precision is enabled, a gradient scaler with the default configuration is used. + device: + The PyTorch device to run the model on. When this argument is + set to "None", the default device for the currently active Thinc + ops is used. 
""" if convert_inputs is None: convert_inputs = convert_pytorch_default_inputs @@ -116,7 +122,10 @@ def PyTorchWrapper_v2( attrs={"convert_inputs": convert_inputs, "convert_outputs": convert_outputs}, shims=[ PyTorchShim( - pytorch_model, mixed_precision=mixed_precision, grad_scaler=grad_scaler + pytorch_model, + mixed_precision=mixed_precision, + grad_scaler=grad_scaler, + device=device, ) ], dims={"nI": None, "nO": None}, @@ -149,7 +158,8 @@ def backprop(dY: Any) -> Any: def convert_pytorch_default_inputs( model: Model, X: Any, is_train: bool ) -> Tuple[ArgsKwargs, Callable[[ArgsKwargs], Any]]: - xp2torch_ = lambda x: xp2torch(x, requires_grad=is_train) + shim = cast(PyTorchShim, model.shims[0]) + xp2torch_ = lambda x: xp2torch(x, requires_grad=is_train, device=shim.device) converted = convert_recursive(is_xp_array, xp2torch_, X) if isinstance(converted, ArgsKwargs): @@ -181,11 +191,14 @@ def reverse_conversion(dXtorch): def convert_pytorch_default_outputs(model: Model, X_Ytorch: Any, is_train: bool): + shim = cast(PyTorchShim, model.shims[0]) X, Ytorch = X_Ytorch Y = convert_recursive(is_torch_array, torch2xp, Ytorch) def reverse_conversion(dY: Any) -> ArgsKwargs: - dYtorch = convert_recursive(is_xp_array, xp2torch, dY) + dYtorch = convert_recursive( + is_xp_array, partial(xp2torch, device=shim.device), dY + ) return ArgsKwargs(args=((Ytorch,),), kwargs={"grad_tensors": dYtorch}) return Y, reverse_conversion @@ -195,6 +208,7 @@ def reverse_conversion(dY: Any) -> ArgsKwargs: def convert_rnn_inputs(model: Model, Xp: Padded, is_train: bool): + shim = cast(PyTorchShim, model.shims[0]) size_at_t = Xp.size_at_t lengths = Xp.lengths indices = Xp.indices @@ -203,15 +217,19 @@ def convert_from_torch_backward(d_inputs: ArgsKwargs) -> Padded: dX = torch2xp(d_inputs.args[0]) return Padded(dX, size_at_t, lengths, indices) # type: ignore - output = ArgsKwargs(args=(xp2torch(Xp.data, requires_grad=True), None), kwargs={}) + output = ArgsKwargs( + args=(xp2torch(Xp.data, requires_grad=True, device=shim.device), None), + kwargs={}, + ) return output, convert_from_torch_backward def convert_rnn_outputs(model: Model, inputs_outputs: Tuple, is_train): + shim = cast(PyTorchShim, model.shims[0]) Xp, (Ytorch, _) = inputs_outputs def convert_for_torch_backward(dYp: Padded) -> ArgsKwargs: - dYtorch = xp2torch(dYp.data, requires_grad=True) + dYtorch = xp2torch(dYp.data, requires_grad=True, device=shim.device) return ArgsKwargs(args=(Ytorch,), kwargs={"grad_tensors": dYtorch}) Y = cast(Floats3d, torch2xp(Ytorch)) diff --git a/thinc/shims/pytorch.py b/thinc/shims/pytorch.py index 01df1d5f5..81a2fe11f 100644 --- a/thinc/shims/pytorch.py +++ b/thinc/shims/pytorch.py @@ -5,6 +5,7 @@ import srsly from ..util import torch2xp, xp2torch, convert_recursive, iterate_recursive +from ..util import get_torch_default_device from ..compat import torch from ..backends import get_current_ops, context_pools, CupyOps from ..backends import set_gpu_allocator @@ -25,6 +26,10 @@ class PyTorchShim(Shim): The gradient scaler to use for mixed-precision training. If this argument is set to "None" and mixed precision is enabled, a gradient scaler with the default configuration is used. + device: + The PyTorch device to run the model on. When this argument is + set to "None", the default device for the currently active Thinc + ops is used. 
""" def __init__( @@ -34,12 +39,20 @@ def __init__( optimizer: Any = None, mixed_precision: bool = False, grad_scaler: Optional[PyTorchGradScaler] = None, + device: Optional["torch.device"] = None, ): super().__init__(model, config, optimizer) + if device is None: + device = get_torch_default_device() + if model is not None: + model.to(device) + if grad_scaler is None: grad_scaler = PyTorchGradScaler(mixed_precision) + grad_scaler.to_(device) + self._grad_scaler = grad_scaler self._mixed_precision = mixed_precision @@ -58,6 +71,14 @@ def __call__(self, inputs, is_train): else: return self.predict(inputs), lambda a: ... + @property + def device(self): + p = next(self._model.parameters(), None) + if p is None: + return get_torch_default_device() + else: + return p.device + def predict(self, inputs: ArgsKwargs) -> Any: """Pass inputs through to the underlying PyTorch model, and return the output. No conversions are performed. The PyTorch model is set into @@ -126,7 +147,9 @@ def finish_update(self, optimizer: Optimizer): cast(FloatsXd, torch2xp(torch_data.data)), cast(FloatsXd, torch2xp(torch_data.grad)), ) - torch_data.data = xp2torch(param, requires_grad=True) + torch_data.data = xp2torch( + param, requires_grad=True, device=torch_data.device + ) torch_data.grad.zero_() self._grad_scaler.update() @@ -137,7 +160,7 @@ def use_params(self, params): state_dict = {} for k, v in params.items(): if hasattr(k, "startswith") and k.startswith(key_prefix): - state_dict[k.replace(key_prefix, "")] = xp2torch(v) + state_dict[k.replace(key_prefix, "")] = xp2torch(v, device=self.device) if state_dict: backup = {k: v.clone() for k, v in self._model.state_dict().items()} self._model.load_state_dict(state_dict) @@ -164,17 +187,12 @@ def to_bytes(self): return srsly.msgpack_dumps(msg) def from_bytes(self, bytes_data): - ops = get_current_ops() + device = get_torch_default_device() msg = srsly.msgpack_loads(bytes_data) self.cfg = msg["config"] filelike = BytesIO(msg["state"]) filelike.seek(0) - if ops.device_type == "cpu": - map_location = "cpu" - else: # pragma: no cover - device_id = torch.cuda.current_device() - map_location = "cuda:%d" % device_id - self._model.load_state_dict(torch.load(filelike, map_location=map_location)) - self._model.to(map_location) - self._grad_scaler.to_(map_location) + self._model.load_state_dict(torch.load(filelike, map_location=device)) + self._model.to(device) + self._grad_scaler.to_(device) return self diff --git a/thinc/tests/layers/test_pytorch_wrapper.py b/thinc/tests/layers/test_pytorch_wrapper.py index fc4396370..d2eeaeb97 100644 --- a/thinc/tests/layers/test_pytorch_wrapper.py +++ b/thinc/tests/layers/test_pytorch_wrapper.py @@ -1,21 +1,37 @@ from thinc.api import Linear, SGD, PyTorchWrapper, PyTorchWrapper_v2 from thinc.api import xp2torch, torch2xp, ArgsKwargs, use_ops from thinc.api import chain, get_current_ops, Relu +from thinc.api import CupyOps, MPSOps, NumpyOps from thinc.backends import context_pools from thinc.shims.pytorch_grad_scaler import PyTorchGradScaler -from thinc.compat import has_torch, has_torch_amp, has_torch_gpu -from thinc.compat import has_cupy +from thinc.compat import has_torch, has_torch_amp +from thinc.compat import has_cupy_gpu, has_torch_mps_gpu import numpy import pytest +from thinc.util import get_torch_default_device from ..util import make_tempdir, check_input_converters +XP_OPS = [NumpyOps()] +if has_cupy_gpu: + XP_OPS.append(CupyOps()) +if has_torch_mps_gpu: + XP_OPS.append(MPSOps()) + + if has_torch_amp: TORCH_MIXED_PRECISION = 
[False, True] else: TORCH_MIXED_PRECISION = [False] +XP_OPS_MIXED = [ + (ops, mixed) + for ops in XP_OPS + for mixed in TORCH_MIXED_PRECISION + if not mixed or isinstance(ops, CupyOps) +] + def check_learns_zero_output(model, sgd, X, Y): """Check we can learn to output a zero vector""" @@ -64,24 +80,25 @@ def test_pytorch_wrapper(nN, nI, nO): assert isinstance(model.predict(X), numpy.ndarray) -@pytest.mark.skipif( - not has_cupy or not has_torch_gpu, reason="needs PyTorch with CUDA-capable GPU" -) +@pytest.mark.skipif(not has_torch, reason="needs PyTorch") +@pytest.mark.parametrize("ops_mixed", XP_OPS_MIXED) @pytest.mark.parametrize("nN,nI,nO", [(2, 3, 4)]) -@pytest.mark.parametrize("mixed_precision", TORCH_MIXED_PRECISION) -def test_pytorch_wrapper_thinc_input(nN, nI, nO, mixed_precision): +def test_pytorch_wrapper_thinc_input(ops_mixed, nN, nI, nO): import torch.nn - with use_ops("cupy"): + ops, mixed_precision = ops_mixed + + with use_ops(ops.name): ops = get_current_ops() pytorch_layer = torch.nn.Linear(nO, nO) # Initialize with large weights to trigger overflow of FP16 in # mixed-precision training. torch.nn.init.uniform_(pytorch_layer.weight, 9.0, 11.0) + device = get_torch_default_device() model = chain( Relu(), PyTorchWrapper_v2( - pytorch_layer.cuda(), + pytorch_layer.to(device), mixed_precision=mixed_precision, grad_scaler=PyTorchGradScaler( enabled=mixed_precision, init_scale=2.0**16 @@ -89,7 +106,8 @@ def test_pytorch_wrapper_thinc_input(nN, nI, nO, mixed_precision): ).initialize(), ) # pytorch allocator is set in PyTorchShim - assert "pytorch" in context_pools.get() + if isinstance(ops, CupyOps): + assert "pytorch" in context_pools.get() sgd = SGD(0.001) X = ops.xp.zeros((nN, nI), dtype="f") X += ops.xp.random.uniform(size=X.size).reshape(X.shape) diff --git a/thinc/tests/regression/test_issue564.py b/thinc/tests/regression/test_issue564.py index bd046c501..94ecc6e63 100644 --- a/thinc/tests/regression/test_issue564.py +++ b/thinc/tests/regression/test_issue564.py @@ -1,11 +1,11 @@ import pytest from thinc.api import CupyOps -from thinc.compat import has_torch, has_torch_gpu +from thinc.compat import has_torch, has_torch_cuda_gpu @pytest.mark.skipif(not has_torch, reason="needs PyTorch") -@pytest.mark.skipif(not has_torch_gpu, reason="needs a GPU") +@pytest.mark.skipif(not has_torch_cuda_gpu, reason="needs a GPU") def test_issue564(): import torch diff --git a/thinc/tests/shims/test_pytorch_grad_scaler.py b/thinc/tests/shims/test_pytorch_grad_scaler.py index 0fd709fdc..2ab0fa738 100644 --- a/thinc/tests/shims/test_pytorch_grad_scaler.py +++ b/thinc/tests/shims/test_pytorch_grad_scaler.py @@ -2,7 +2,7 @@ from hypothesis import given, settings from hypothesis.strategies import lists, one_of, tuples -from thinc.compat import has_torch, has_torch_amp, has_torch_gpu, torch +from thinc.compat import has_torch, has_torch_amp, has_torch_cuda_gpu, torch from thinc.util import is_torch_array from thinc.api import PyTorchGradScaler @@ -14,7 +14,7 @@ def tensors(): @pytest.mark.skipif(not has_torch, reason="needs PyTorch") -@pytest.mark.skipif(not has_torch_gpu, reason="needs a GPU") +@pytest.mark.skipif(not has_torch_cuda_gpu, reason="needs a GPU") @pytest.mark.skipif( not has_torch_amp, reason="requires PyTorch with mixed-precision support" ) @@ -37,7 +37,7 @@ def test_scale_random_inputs(X): @pytest.mark.skipif(not has_torch, reason="needs PyTorch") -@pytest.mark.skipif(not has_torch_gpu, reason="needs a GPU") +@pytest.mark.skipif(not has_torch_cuda_gpu, reason="needs a GPU") 
@pytest.mark.skipif( not has_torch_amp, reason="requires PyTorch with mixed-precision support" ) diff --git a/thinc/util.py b/thinc/util.py index 6ace7e7ed..e46c62447 100644 --- a/thinc/util.py +++ b/thinc/util.py @@ -14,7 +14,8 @@ from contextvars import ContextVar from dataclasses import dataclass from .compat import has_cupy, has_mxnet, has_torch, has_tensorflow -from .compat import has_cupy_gpu, has_torch_gpu +from .compat import has_cupy_gpu, has_torch_cuda_gpu, has_gpu +from .compat import has_torch_mps_gpu from .compat import torch, cupy, tensorflow as tf, mxnet as mx, cupy_from_dlpack DATA_VALIDATION: ContextVar[bool] = ContextVar("DATA_VALIDATION", default=False) @@ -27,6 +28,24 @@ from .api import Ops +def get_torch_default_device() -> "torch.device": + if torch is None: + raise ValueError("Cannot get default Torch device when Torch is not available.") + + from .backends import get_current_ops + from .backends.cupy_ops import CupyOps + from .backends.mps_ops import MPSOps + + ops = get_current_ops() + if isinstance(ops, CupyOps): + device_id = torch.cuda.current_device() + return torch.device(f"cuda:{device_id}") + elif isinstance(ops, MPSOps): + return torch.device("mps") + + return torch.device("cpu") + + def get_array_module(arr): # pragma: no cover if is_cupy_array(arr): return cupy @@ -35,7 +54,7 @@ def get_array_module(arr): # pragma: no cover def gpu_is_available(): - return has_cupy_gpu + return has_gpu def fix_random_seed(seed: int = 0) -> None: # pragma: no cover @@ -46,7 +65,7 @@ def fix_random_seed(seed: int = 0) -> None: # pragma: no cover torch.manual_seed(seed) if has_cupy_gpu: cupy.random.seed(seed) - if has_torch and has_torch_gpu: + if has_torch and has_torch_cuda_gpu: torch.cuda.manual_seed_all(seed) torch.backends.cudnn.deterministic = True torch.backends.cudnn.benchmark = False @@ -84,10 +103,18 @@ def is_torch_array(obj: Any) -> bool: # pragma: no cover return False -def is_torch_gpu_array(obj: Any) -> bool: # pragma: no cover +def is_torch_cuda_array(obj: Any) -> bool: # pragma: no cover return is_torch_array(obj) and obj.is_cuda +def is_torch_gpu_array(obj: Any) -> bool: # pragma: no cover + return is_torch_cuda_array(obj) or is_torch_mps_array(obj) + + +def is_torch_mps_array(obj: Any) -> bool: # pragma: no cover + return is_torch_array(obj) and hasattr(obj, "is_mps") and obj.is_mps + + def is_tensorflow_array(obj: Any) -> bool: # pragma: no cover if not has_tensorflow: return False @@ -131,9 +158,8 @@ def set_active_gpu(gpu_id: int) -> "cupy.cuda.Device": # pragma: no cover device = cupy.cuda.device.Device(gpu_id) device.use() - if has_torch_gpu: + if has_torch_cuda_gpu: torch.cuda.set_device(gpu_id) - torch.set_default_tensor_type("torch.cuda.FloatTensor") return device @@ -144,28 +170,29 @@ def require_cpu() -> bool: # pragma: no cover ops = get_ops("cpu") set_current_ops(ops) - set_torch_tensor_type_for_ops(ops) return True def prefer_gpu(gpu_id: int = 0) -> bool: # pragma: no cover """Use GPU if it's available. 
Returns True if so, False otherwise.""" - if not has_cupy_gpu: - return False - else: + if has_gpu: require_gpu(gpu_id=gpu_id) - return True + return has_gpu def require_gpu(gpu_id: int = 0) -> bool: # pragma: no cover - from .backends import set_current_ops, CupyOps + from .backends import set_current_ops, CupyOps, MPSOps - if not has_cupy_gpu: - raise ValueError("No CUDA GPU devices detected") + if not has_gpu: + raise ValueError("No GPU devices detected") + + if has_cupy_gpu: + set_current_ops(CupyOps()) + set_active_gpu(gpu_id) + else: + set_current_ops(MPSOps()) - set_current_ops(CupyOps()) - set_active_gpu(gpu_id) return True @@ -307,10 +334,16 @@ def iterate_recursive(is_match: Callable[[Any], bool], obj: Any) -> Any: def xp2torch( - xp_tensor: ArrayXd, requires_grad: bool = False + xp_tensor: ArrayXd, + requires_grad: bool = False, + device: Optional["torch.device"] = None, ) -> "torch.Tensor": # pragma: no cover """Convert a numpy or cupy tensor to a PyTorch tensor.""" assert_pytorch_installed() + + if device is None: + device = get_torch_default_device() + if hasattr(xp_tensor, "toDlpack"): dlpack_tensor = xp_tensor.toDlpack() # type: ignore torch_tensor = torch.utils.dlpack.from_dlpack(dlpack_tensor) @@ -318,8 +351,12 @@ def xp2torch( torch_tensor = torch.utils.dlpack.from_dlpack(xp_tensor) else: torch_tensor = torch.from_numpy(xp_tensor) + + torch_tensor = torch_tensor.to(device) + if requires_grad: torch_tensor.requires_grad_() + return torch_tensor @@ -332,14 +369,14 @@ def torch2xp( from .api import NumpyOps assert_pytorch_installed() - if is_torch_gpu_array(torch_tensor): + if is_torch_cuda_array(torch_tensor): if isinstance(ops, NumpyOps): return torch_tensor.detach().cpu().numpy() else: return cupy_from_dlpack(torch.utils.dlpack.to_dlpack(torch_tensor)) else: if isinstance(ops, NumpyOps) or ops is None: - return torch_tensor.detach().numpy() + return torch_tensor.detach().cpu().numpy() else: return cupy.asarray(torch_tensor) @@ -531,22 +568,6 @@ def use_nvtx_range(message: str, id_color: int = -1): yield -def set_torch_tensor_type_for_ops(ops): - """Set the PyTorch default tensor type for the given ops. 
This is a
-    no-op if PyTorch is not available."""
-    from .backends.cupy_ops import CupyOps
-
-    try:
-        import torch
-
-        if CupyOps.xp is not None and isinstance(ops, CupyOps):
-            torch.set_default_tensor_type("torch.cuda.FloatTensor")
-        else:
-            torch.set_default_tensor_type("torch.FloatTensor")
-    except ImportError:
-        pass
-
-
 @dataclass
 class ArrayInfo:
     """Container for info for checking array compatibility."""
@@ -571,6 +592,7 @@ def check_consistency(self, arr: ArrayXd):

 __all__ = [
     "get_array_module",
+    "get_torch_default_device",
     "fix_random_seed",
     "is_cupy_array",
     "is_numpy_array",
@@ -588,6 +610,5 @@ def check_consistency(self, arr: ArrayXd):
     "DataValidationError",
     "make_tempfile",
     "use_nvtx_range",
-    "set_torch_tensor_type_for_ops",
     "ArrayInfo",
 ]

From 0711e60081d9a1ba591d9f7c2daef16f7056b1c1 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Dani=C3=ABl=20de=20Kok?=
Date: Wed, 15 Jun 2022 11:08:48 +0200
Subject: [PATCH 28/78] Expose `get_torch_default_device` through `thinc.api` (#698)

---
 thinc/api.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/thinc/api.py b/thinc/api.py
index 7c8702cd3..64ed11703 100644
--- a/thinc/api.py
+++ b/thinc/api.py
@@ -16,6 +16,7 @@ from .util import DataValidationError, data_validation
 from .util import to_categorical, get_width, get_array_module, to_numpy
 from .util import torch2xp, xp2torch, tensorflow2xp, xp2tensorflow, mxnet2xp, xp2mxnet
+from .util import get_torch_default_device
 from .compat import has_cupy
 from .backends import get_ops, set_current_ops, get_current_ops, use_ops
 from .backends import Ops, CupyOps, MPSOps, NumpyOps, set_gpu_allocator

From cde169bb74af50ccf7e93aaa54e3b259397f4055 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Dani=C3=ABl=20de=20Kok?=
Date: Wed, 15 Jun 2022 14:41:45 +0200
Subject: [PATCH 29/78] Make `CBlas` methods standalone functions to avoid
 using vtables (#700)

* Make CBlas methods standalone functions to avoid using vtables

When testing #696, we found that adding new CBlas methods results in an
ABI incompatibility. This would mean that every time we add a CBlas
method, we also have to rebuild spaCy.

The ABI incompatibility occurs because Cython generates a vtable for
cdef methods, even when the class or its methods are final. This vtable
is used by the caller to look up the (address of) the methods. When
methods are added, the vtable of the caller is out-of-sync when the
calling code is not recompiled.

This change works around this issue by making the methods of CBlas
standalone functions.

* Add link to PR in comments

For future reference.
---
 thinc/backends/cblas.pxd | 16 ++++++++++++----
 thinc/backends/cblas.pyx | 16 ++++++++--------
 2 files changed, 20 insertions(+), 12 deletions(-)

diff --git a/thinc/backends/cblas.pxd b/thinc/backends/cblas.pxd
index fcc867b68..05320af36 100644
--- a/thinc/backends/cblas.pxd
+++ b/thinc/backends/cblas.pxd
@@ -18,7 +18,15 @@ cdef struct BlasFuncs

 cdef class CBlas:
     cdef shared_ptr[BlasFuncs] ptr
-    cdef saxpy_ptr saxpy(self) nogil
-    cdef sgemm_ptr sgemm(self) nogil
-    cdef void set_saxpy(self, saxpy_ptr saxpy) nogil
-    cdef void set_sgemm(self, sgemm_ptr sgemm) nogil
+
+
+# Note: the following functions are intentionally standalone. If we make them
+# methods of CBlas, Cython will generate and use a vtable. This makes it
+# impossible to add new BLAS functions later without breaking the ABI.
+#
+# See https://github.com/explosion/thinc/pull/700 for more information.
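+#
+# A hypothetical usage sketch (illustrative comment only, not part of the
+# API surface): a Cython caller first looks up the function pointer and then
+# invokes it, e.g. `saxpy(cblas)(N, 1.0, &x[0], 1, &y[0], 1)`. Because the
+# lookup goes through the opaque BlasFuncs struct instead of a vtable,
+# entries appended later do not shift any offsets compiled callers rely on.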
+ +cdef saxpy_ptr saxpy(CBlas cblas) nogil +cdef sgemm_ptr sgemm(CBlas cblas) nogil +cdef void set_saxpy(CBlas cblas, saxpy_ptr saxpy) nogil +cdef void set_sgemm(CBlas cblas, sgemm_ptr sgemm) nogil diff --git a/thinc/backends/cblas.pyx b/thinc/backends/cblas.pyx index db18b83eb..23e02f0f0 100644 --- a/thinc/backends/cblas.pyx +++ b/thinc/backends/cblas.pyx @@ -19,14 +19,14 @@ cdef class CBlas: funcs.sgemm = blis.cy.sgemm self.ptr = make_shared[BlasFuncs](funcs) - cdef saxpy_ptr saxpy(self) nogil: - return deref(self.ptr).saxpy +cdef saxpy_ptr saxpy(CBlas cblas) nogil: + return deref(cblas.ptr).saxpy - cdef sgemm_ptr sgemm(self) nogil: - return deref(self.ptr).sgemm +cdef sgemm_ptr sgemm(CBlas cblas) nogil: + return deref(cblas.ptr).sgemm - cdef void set_saxpy(self, saxpy_ptr saxpy) nogil: - deref(self.ptr).saxpy = saxpy +cdef void set_saxpy(CBlas cblas, saxpy_ptr saxpy) nogil: + deref(cblas.ptr).saxpy = saxpy - cdef void set_sgemm(self, sgemm_ptr sgemm) nogil: - deref(self.ptr).sgemm = sgemm +cdef void set_sgemm(CBlas cblas, sgemm_ptr sgemm) nogil: + deref(cblas.ptr).sgemm = sgemm From 2ae21257108192451b55094b200dc1f252447c9d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Dani=C3=ABl=20de=20Kok?= Date: Wed, 15 Jun 2022 15:25:27 +0200 Subject: [PATCH 30/78] Add Dockerfile for building the website (#699) * Add Dockerfile for building the website This Dockerfile was taken from spaCy. * README: Remove command substitution in example --- website/Dockerfile | 16 ++++++++++++++++ website/README.md | 24 ++++++++++++++++++++++++ 2 files changed, 40 insertions(+) create mode 100644 website/Dockerfile diff --git a/website/Dockerfile b/website/Dockerfile new file mode 100644 index 000000000..747b1f344 --- /dev/null +++ b/website/Dockerfile @@ -0,0 +1,16 @@ +FROM node:11.15.0 + +WORKDIR /thinc-io + +RUN npm install -g gatsby-cli@2.7.4 + +COPY package.json . +COPY package-lock.json . + +RUN npm install + +# This is so the installed node_modules will be up one directory +# from where a user mounts files, so that they don't accidentally mount +# their own node_modules from a different build +# https://nodejs.org/api/modules.html#modules_loading_from_node_modules_folders +WORKDIR /thinc-io/website/ diff --git a/website/README.md b/website/README.md index 8c8d53a98..1e3b10ce5 100644 --- a/website/README.md +++ b/website/README.md @@ -14,6 +14,30 @@ npm run dev # start dev server A `.prettierrc` is included in the repo, so if you set up auto-formatting with Prettier, it should match the style. +## Build and run the website in a Docker container + +Rather than installing NPM locally, you can also build a Docker container with +the prerequisite dependencies: + +```bash +docker build -t thinc-io . +``` + +Afterwards, the website can be built and run in the container: + +```bash +docker run --rm -it \ + -v $PWD:/thinc-io/website \ + -p 8000:8000 \ + thinc-io \ + gatsby develop -H 0.0.0.0 +``` + +This is currently the only way to build the website on ARM64 Macs, since the +required Node.js version is not built for macOS/ARM64. + +These commands also work with Podman by replacing `docker` by `podman`. + ## Directory structure - `/docs`: Docs pages as Markdown. 
From 3216d7c681d1054ef6d2f957b0c4564abc9924a7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Dani=C3=ABl=20de=20Kok?= Date: Wed, 15 Jun 2022 15:25:44 +0200 Subject: [PATCH 31/78] Bump version to v8.1.0.dev2 (#701) --- thinc/about.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/thinc/about.py b/thinc/about.py index 4c2cc699f..feb533da8 100644 --- a/thinc/about.py +++ b/thinc/about.py @@ -1,2 +1,2 @@ -__version__ = "v8.1.0.dev1" +__version__ = "v8.1.0.dev2" __release__ = True From 3f0082f2c06d220dd6fb629edaa75acbfdec9ab0 Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Wed, 22 Jun 2022 17:42:44 +0200 Subject: [PATCH 32/78] Use blis~=0.7.8 (#704) Until the haswell bug is fixed in BLIS v0.9, switch back to blis~=0.7.8. --- pyproject.toml | 2 +- requirements.txt | 2 +- setup.cfg | 4 ++-- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 3b973098d..d3fb69b76 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -5,7 +5,7 @@ requires = [ "murmurhash>=1.0.2,<1.1.0", "cymem>=2.0.2,<2.1.0", "preshed>=3.0.2,<3.1.0", - "blis>=0.9.0,<0.10.0", + "blis>=0.7.8,<0.8.0", "numpy>=1.15.0", ] build-backend = "setuptools.build_meta" diff --git a/requirements.txt b/requirements.txt index 5f35d8e91..02817ca87 100644 --- a/requirements.txt +++ b/requirements.txt @@ -2,7 +2,7 @@ murmurhash>=1.0.2,<1.1.0 cymem>=2.0.2,<2.1.0 preshed>=3.0.2,<3.1.0 -blis>=0.9.0,<0.10.0 +blis>=0.7.8,<0.8.0 srsly>=2.4.0,<3.0.0 wasabi>=0.8.1,<1.1.0 catalogue>=2.0.4,<2.1.0 diff --git a/setup.cfg b/setup.cfg index adc3dac92..0da060213 100644 --- a/setup.cfg +++ b/setup.cfg @@ -35,10 +35,10 @@ setup_requires = cymem>=2.0.2,<2.1.0 preshed>=3.0.2,<3.1.0 murmurhash>=1.0.2,<1.1.0 - blis>=0.9.0,<0.10.0 + blis>=0.7.8,<0.8.0 install_requires = # Explosion-provided dependencies - blis>=0.9.0,<0.10.0 + blis>=0.7.8,<0.8.0 murmurhash>=1.0.2,<1.1.0 cymem>=2.0.2,<2.1.0 preshed>=3.0.2,<3.1.0 From 8236b8943224a0be1caf52d9a66df9adc1ca066f Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Wed, 22 Jun 2022 20:28:28 +0200 Subject: [PATCH 33/78] Set version to v8.1.0.dev3 (#705) --- thinc/about.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/thinc/about.py b/thinc/about.py index feb533da8..3c12247e3 100644 --- a/thinc/about.py +++ b/thinc/about.py @@ -1,2 +1,2 @@ -__version__ = "v8.1.0.dev2" +__version__ = "v8.1.0.dev3" __release__ = True From f6302709798b759a90e9f5e2ad1adb037748974c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Dani=C3=ABl=20de=20Kok?= Date: Thu, 23 Jun 2022 08:44:12 +0200 Subject: [PATCH 34/78] Speed up HashEmbed layer by avoiding large temporary arrays (#696) * Speed up HashEmbed layer by avoiding large temporary arrays The HashEmbed layer sums up keyed embeddings. For instance, a key matrix of the shape (50000, 4) will result in 50,000 embeddings, each computed by summing 4 embeddings. The HashEmbed layer computed the embeddings as follows: vectors[keys].sum(axis=1) where `vectors` is an embedding matrix. However, this way of computing embeddings results in very large allocations. Suppose that `vectors` is (4000, 64). Even though the final embedding matrix is (50000, 64), the first expression will construct a temporary array of shape (50000, 4, 64). This change avoids this by introducing a `gather_add` op as a counterpart to `scatter_add`. In this particular example, the `NumpyOps` implementation only allocates the final (50000, 64) array, computing the embeddings in-place using the BLAS saxpy function. 
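As a rough illustration of the semantics (a sketch for this commit
message, not code from the diff), `gather_add` computes the same result
as the old expression while only allocating the final (B, O) output:

```
import numpy

vectors = numpy.random.rand(4000, 64).astype("float32")  # (T, O) table
keys = numpy.random.randint(0, 4000, size=(50000, 4))    # (B, K) indices

# Old approach: materializes a (50000, 4, 64) temporary before reducing.
old = vectors[keys].sum(axis=1)

# gather_add semantics: accumulate the gathered rows into a (B, O) output.
new = numpy.zeros((keys.shape[0], vectors.shape[1]), dtype=vectors.dtype)
for k in range(keys.shape[1]):
    new += vectors[keys[:, k]]

assert numpy.allclose(old, new, atol=1e-4)
```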
In benchmarks with an M1 Max on de_core_news_lg, this improved processing speed from 40511 WPS to 45591 (12.5% faster). * Simplify saxpy call * Fixup types * NumpyOps.gather_add: add support for double * NumpyOps.gather_add: support int and unsigned int indices * Add gather_add CUDA kernel * Add tests for gather_add * Comment fixup Co-authored-by: Sofie Van Landeghem * api-backends: document Ops.gather_add * Ops.gather_add: arguments should be 2D arrays * Comment fix * Ops.gather_add returns Float2d * docs: Ops.gather_add is new in 8.1 Co-authored-by: Sofie Van Landeghem --- thinc/backends/_custom_kernels.cu | 20 +++++++++++++++++ thinc/backends/_custom_kernels.py | 37 +++++++++++++++++++++++++++++++ thinc/backends/cblas.pxd | 6 +++++ thinc/backends/cblas.pyx | 8 +++++++ thinc/backends/cpu_kernels.hh | 30 +++++++++++++++++++++++++ thinc/backends/cupy_ops.py | 6 +++++ thinc/backends/numpy_ops.pxd | 13 +++++++++++ thinc/backends/numpy_ops.pyx | 19 +++++++++++++++- thinc/backends/ops.py | 3 +++ thinc/layers/hashembed.py | 2 +- thinc/tests/backends/test_ops.py | 34 +++++++++++++++++++++++++++- website/docs/api-backends.md | 23 +++++++++++++++++-- 12 files changed, 196 insertions(+), 5 deletions(-) diff --git a/thinc/backends/_custom_kernels.cu b/thinc/backends/_custom_kernels.cu index f0cebd77d..a0017c0e4 100644 --- a/thinc/backends/_custom_kernels.cu +++ b/thinc/backends/_custom_kernels.cu @@ -22,6 +22,26 @@ struct Constants { }; +template +__global__ void gather_add(U* out_bo, const U* table_to, const int* indices_bk, + int T, int O, int B, int K) +{ + int _loop_start = blockIdx.x * blockDim.x + threadIdx.x; + int _loop_stride = blockDim.x * gridDim.x; + + for (int b = _loop_start; b < B; b += _loop_stride) { + for (int k = 0; k < K; ++k) { + int idx = indices_bk[b * K + k]; + const U* table = table_to + idx * O; + U* out = out_bo + b * O; + for (int o = 0; o < O; ++o) { + out[o] += table[o]; + } + } + } +} + + template __global__ void seq2col(T* output, const T* X, const int* lengths, int nW, int B, int I, int nL) diff --git a/thinc/backends/_custom_kernels.py b/thinc/backends/_custom_kernels.py index 8ea13bb57..56a23547f 100644 --- a/thinc/backends/_custom_kernels.py +++ b/thinc/backends/_custom_kernels.py @@ -32,6 +32,8 @@ "backprop_swish", "clipped_linear", "clipped_linear", + "gather_add", + "gather_add", "gelu", "gelu", "maxout", @@ -76,6 +78,8 @@ def compile_mmh(src): clipped_linear_kernel_float = _get_kernel("clipped_linear") clipped_linear_kernel_double = _get_kernel("clipped_linear") +gather_add_kernel_float = _get_kernel("gather_add") +gather_add_kernel_double = _get_kernel("gather_add") gelu_kernel_float = _get_kernel("gelu") gelu_kernel_double = _get_kernel("gelu") hash_data_kernel = compile_mmh(MMH_SRC) @@ -165,6 +169,32 @@ def clipped_linear( return out +def gather_add(table, indices, *, threads_per_block=128, num_blocks=128): + if table.ndim != 2: + raise ValueError(f"gather_add expects table with dimensionality 2, was: {table.ndim}") + if indices.ndim != 2: + raise ValueError(f"gather_add expects indices with dimensionality 2, was: {indices.ndim}") + _is_float_array(table) + indices = indices.astype("int32") + _check_indices(indices, table.shape[0]) + + B = indices.shape[0] + K = indices.shape[1] + T = table.shape[0] + O = table.shape[1] + + out = _alloc((B, O), dtype=table.dtype, zeros=True) + if table.dtype == "float32": + gather_add_kernel_float( + (num_blocks,), (threads_per_block,), (out, table, indices, T, O, B, K) + ) + else: + gather_add_kernel_double( + 
+            (num_blocks,), (threads_per_block,), (out, table, indices, T, O, B, K)
+        )
+    return out
+
+
 def gelu(X, *, inplace=False, threshold=6.0, threads_per_block=128, num_blocks=128):
     _is_float_array(X)

@@ -647,6 +677,13 @@ def _check_lengths(lengths, n_elems: int, *, min_length=0):
         raise IndexError("lengths must sum up to the batch size")

+def _check_indices(indices, n: int):
+    assert indices.dtype == "int32", "indices should be encoded as 32-bit integers"
+
+    if not _values_within_range(indices, 0, n):
+        raise IndexError(f"index out of bounds, must be >= 0 && < {n}")
+
+
 def _check_which_maxout(which, B: int, I: int, P: int):
     shape = (B, I)
     msg = "maximum index (which) should be encoded as 32-bit integers"

diff --git a/thinc/backends/cblas.pxd b/thinc/backends/cblas.pxd
index 05320af36..15837e5e7 100644
--- a/thinc/backends/cblas.pxd
+++ b/thinc/backends/cblas.pxd
@@ -10,6 +10,10 @@ ctypedef void (*saxpy_ptr)(int N, float alpha, const float* X, int incX,
                            float *Y, int incY) nogil

+ctypedef void (*daxpy_ptr)(int N, double alpha, const double* X, int incX,
+                           double *Y, int incY) nogil
+
+
 # Forward-declaration of the BlasFuncs struct. This struct must be opaque, so
 # that consumers of the CBlas class cannot become dependent on its size or
 # ordering.
@@ -26,7 +30,9 @@ cdef class CBlas:
 #
 # See https://github.com/explosion/thinc/pull/700 for more information.

+cdef daxpy_ptr daxpy(CBlas cblas) nogil
 cdef saxpy_ptr saxpy(CBlas cblas) nogil
 cdef sgemm_ptr sgemm(CBlas cblas) nogil
+cdef void set_daxpy(CBlas cblas, daxpy_ptr daxpy) nogil
 cdef void set_saxpy(CBlas cblas, saxpy_ptr saxpy) nogil
 cdef void set_sgemm(CBlas cblas, sgemm_ptr sgemm) nogil

diff --git a/thinc/backends/cblas.pyx b/thinc/backends/cblas.pyx
index 23e02f0f0..9eb4514d8 100644
--- a/thinc/backends/cblas.pyx
+++ b/thinc/backends/cblas.pyx
@@ -4,6 +4,7 @@ from libcpp.memory cimport make_shared

 cdef struct BlasFuncs:
+    daxpy_ptr daxpy
     saxpy_ptr saxpy
     sgemm_ptr sgemm

@@ -15,16 +16,23 @@ cdef class CBlas:
         """Construct a CBlas instance set to use BLIS implementations of the
            supported BLAS functions."""
         cdef BlasFuncs funcs
+        funcs.daxpy = blis.cy.daxpy
         funcs.saxpy = blis.cy.saxpy
         funcs.sgemm = blis.cy.sgemm
         self.ptr = make_shared[BlasFuncs](funcs)

+cdef daxpy_ptr daxpy(CBlas cblas) nogil:
+    return deref(cblas.ptr).daxpy
+
 cdef saxpy_ptr saxpy(CBlas cblas) nogil:
     return deref(cblas.ptr).saxpy

 cdef sgemm_ptr sgemm(CBlas cblas) nogil:
     return deref(cblas.ptr).sgemm

+cdef void set_daxpy(CBlas cblas, daxpy_ptr daxpy) nogil:
+    deref(cblas.ptr).daxpy = daxpy
+
 cdef void set_saxpy(CBlas cblas, saxpy_ptr saxpy) nogil:
     deref(cblas.ptr).saxpy = saxpy

diff --git a/thinc/backends/cpu_kernels.hh b/thinc/backends/cpu_kernels.hh
index 69e1ab334..1c1f5ac8f 100644
--- a/thinc/backends/cpu_kernels.hh
+++ b/thinc/backends/cpu_kernels.hh
@@ -8,6 +8,22 @@
 #include
 #include

+// Ideally we'd use an alias declaration for a generic definition of
+// *axpy. But Cython doesn't support alias declarations yet:
+//
+// https://github.com/cython/cython/issues/3272
+//
+// template <typename T>
+// using axpy = void (*)(int N, T alpha, const T* X, int incX,
+//                       T *Y, int incY);
+//
+// So, instead we'll do this the pre-C++11 way:
+
+template <typename T>
+struct axpy {
+    typedef void (*ptr)(int N, T alpha, const T* X, int incX, T *Y, int incY);
+};
+
 // All elementwise functions, such as most activations, work in-place.
@@ -395,4 +411,18 @@ void backprop_seq2col(A* d_seqs, const A* d_cols, const L* lengths, L B, L I, L
     }
 }

+template <typename F, typename I, typename L>
+void cpu_gather_add(typename axpy<F>::ptr axpy, F* out_bo, const F* table_to,
+    const I* indices_bk, L T, L O, L B, L K) {
+    for (L b = 0; b < B; ++b) {
+        for (L k = 0; k < K; ++k) {
+            I idx = indices_bk[b * K + k];
+            if (idx >= T) {
+                throw std::out_of_range("Embedding index out-of-bounds");
+            }
+            axpy(O, 1.0, table_to + idx * O, 1, out_bo + b * O, 1);
+        }
+    }
+}
+
+
 #endif // CPU_KERNELS_HH

diff --git a/thinc/backends/cupy_ops.py b/thinc/backends/cupy_ops.py
index e77d525ed..924bfe955 100644
--- a/thinc/backends/cupy_ops.py
+++ b/thinc/backends/cupy_ops.py
@@ -30,6 +30,12 @@ def to_numpy(self, data, *, byte_order=None):
             data = numpy.asarray(data, dtype=dtype)
         return data

+    def gather_add(self, table, indices):
+        if table.dtype in ("float32", "float64"):
+            return _custom_kernels.gather_add(table, indices)
+        else:
+            return super().gather_add(table, indices)
+
     def gelu(self, X, inplace=False):
         if X.dtype in ("float32", "float64"):
             return _custom_kernels.gelu(X, inplace=inplace, threshold=6.0)

diff --git a/thinc/backends/numpy_ops.pxd b/thinc/backends/numpy_ops.pxd
index 28bab3a31..6cf01fe76 100644
--- a/thinc/backends/numpy_ops.pxd
+++ b/thinc/backends/numpy_ops.pxd
@@ -1,7 +1,15 @@
+from .cblas cimport saxpy_ptr

 ctypedef double[:, ::1] double2d_t
 ctypedef double[:, :, ::1] double3d_t
 ctypedef float[:, ::1] float2d_t
 ctypedef float[:, :, ::1] float3d_t
+ctypedef int[:, ::1] int2d_t
+ctypedef unsigned int[:, ::1] uint2d_t
+
+cdef fused ints2d_ft:
+    int2d_t
+    uint2d_t

 cdef fused reals2d_ft:
     float2d_t
@@ -13,6 +21,9 @@ cdef fused reals3d_ft:

 cdef extern from "cpu_kernels.hh":
+    cdef cppclass axpy[T]:
+        ctypedef void (*ptr)(int N, T alpha, const T* X, int incX, T *Y, int incY);
+
     void cpu_maxout[A, L](A* best__bo, L* which__bo, const A* cands_bop, L B, L O, L P)
     void cpu_backprop_maxout[A, L](A* dX__bop, const A* dX__bo, const L* which__bo,
@@ -35,3 +46,5 @@ cdef extern from "cpu_kernels.hh":
     void cpu_relu[A, L](A* X, L N)
     void backprop_seq2col[A, L](A* d_seqs, const A* d_cols, const L* lengths, L B, L I, L nW, L nL)
     void seq2col[A, L](A* output, const A* X, const L* lengths, L nW, L B, L I, L nL)
+    void cpu_gather_add[F, I, L](axpy[F].ptr axpy, F* out_bo, const F* table_to, const I* indices_bk,
+                                 L T, L O, L B, L K) except +

diff --git a/thinc/backends/numpy_ops.pyx b/thinc/backends/numpy_ops.pyx
index 0938ffedf..c980e6c5d 100644
--- a/thinc/backends/numpy_ops.pyx
+++ b/thinc/backends/numpy_ops.pyx
@@ -20,7 +20,7 @@ cimport blis.cy

 from .. import registry
 from ..util import copy_array, get_array_module
 from ..types import DeviceTypes, DTypes, Shape, ArrayXd
-from .cblas cimport CBlas
+from .cblas cimport CBlas, daxpy, saxpy
 from .linalg cimport VecVec, Vec

 from .ops import Ops
@@ -437,6 +437,23 @@ class NumpyOps(Ops):

         return dX

+    def gather_add(self, reals2d_ft table, ints2d_ft indices):
+        cdef CBlas cblas = self.cblas()
+        rows = indices.shape[0]
+        dims = table.shape[1]
+
+        cdef np.ndarray output
+        if reals2d_ft is float2d_t:
+            output = self.xp.zeros((rows, dims), dtype="float32")
+            cpu_gather_add(saxpy(cblas), <float *>output.data, &table[0, 0], &indices[0, 0],
+                           table.shape[0], dims, rows, indices.shape[1])
+        else:
+            output = self.xp.zeros((rows, dims), dtype="float64")
+            cpu_gather_add(daxpy(cblas), <double *>output.data, &table[0, 0], &indices[0, 0],
+                           table.shape[0], dims, rows, indices.shape[1])
+
+        return output
+
     def scatter_add(self, np.ndarray table, np.ndarray indices, np.ndarray values):
         if table.dtype == 'float32' \
         and indices.dtype == 'int32' \

diff --git a/thinc/backends/ops.py b/thinc/backends/ops.py
index a7015d44d..b2f8e909f 100644
--- a/thinc/backends/ops.py
+++ b/thinc/backends/ops.py
@@ -1254,6 +1254,9 @@ def position_encode(
         numpy_ops = NumpyOps()
         return self.asarray2f(numpy_ops.position_encode(N, D, period, out))

+    def gather_add(self, table: Floats2d, indices: Ints2d) -> Floats2d:
+        return table[indices].sum(axis=1)  # type: ignore[return-value]
+
     def scatter_add(
         self, table: FloatsXd, indices: IntsXd, values: FloatsXd
     ) -> FloatsXd:

diff --git a/thinc/layers/hashembed.py b/thinc/layers/hashembed.py
index 74b85c7cf..677049cc2 100644
--- a/thinc/layers/hashembed.py
+++ b/thinc/layers/hashembed.py
@@ -68,7 +68,7 @@ def forward(
     nN = ids.shape[0]
     seed: int = model.attrs["seed"]
     keys = model.ops.hash(ids, seed) % nV
-    output = vectors[keys].sum(axis=1)
+    output = model.ops.gather_add(vectors, keys)
     drop_mask = None
     if is_train:
         dropout: Optional[float] = model.attrs.get("dropout_rate")

diff --git a/thinc/tests/backends/test_ops.py b/thinc/tests/backends/test_ops.py
index c8cb6a9df..238ccc13e 100644
--- a/thinc/tests/backends/test_ops.py
+++ b/thinc/tests/backends/test_ops.py
@@ -16,7 +16,7 @@ import inspect

 from .. import strategies
-from ..strategies import ndarrays_of_shape
+from ..strategies import arrays_BI, ndarrays_of_shape

 MAX_EXAMPLES = 10

@@ -198,6 +198,38 @@ def test_get_dropout_not_empty(ops):
     assert mask.shape == shape

+@pytest.mark.parametrize("ops", ALL_OPS)
+@pytest.mark.parametrize("dtype", FLOAT_TYPES)
+@pytest.mark.parametrize("index_dtype", ["int32", "uint32"])
+def test_gather_add(ops, dtype, index_dtype):
+    table = ops.xp.arange(12, dtype=dtype).reshape(4, 3)
+    indices = ops.xp.array([[0, 2], [3, 1], [0, 1]], dtype=index_dtype)
+    gathered = ops.gather_add(table, indices)
+    ops.xp.testing.assert_allclose(
+        gathered, [[6.0, 8.0, 10.0], [12.0, 14.0, 16.0], [3.0, 5.0, 7.0]]
+    )
+
+
+@pytest.mark.parametrize("ops", XP_OPS)
+@given(table=strategies.arrays_BI())
+def test_gather_add_against_numpy(ops, table):
+    table = ops.asarray(table)
+    indices = ops.xp.arange(100, dtype="i").reshape(25, 4) % table.shape[0]
+    ops.xp.testing.assert_allclose(
+        ops.gather_add(table, indices),
+        table[indices].sum(1),
+        atol=1e-5,
+    )
+
+
+@pytest.mark.parametrize("ops", ALL_OPS)
+def test_gather_add_oob_raises(ops):
+    table = ops.xp.arange(12, dtype="f").reshape(4, 3)
+    indices = ops.xp.array([[0, 2], [3, 1], [5, 1]], dtype="i")
+    with pytest.raises(IndexError):
+        ops.gather_add(table, indices)
+
+
 @pytest.mark.parametrize("ops", CPU_OPS)
 def test_seq2col_window_one_small(ops):
     seq = ops.asarray([[1.0], [3.0], [4.0], [5]], dtype="float32")

diff --git a/website/docs/api-backends.md b/website/docs/api-backends.md
index e1d90a655..5b62c02b1 100644
--- a/website/docs/api-backends.md
+++ b/website/docs/api-backends.md
@@ -1298,8 +1298,8 @@ Backpropagate the `reduce_mean` operation.

 Perform sequence-wise max pooling for data in the ragged format. Zero-length
-sequences are not allowed. A `ValueError` is raised if any element in
-`lengths` is zero.
+sequences are not allowed. A `ValueError` is raised if any element in `lengths`
+is zero.

 | Argument    | Type                             | Description                 |
 | ----------- | -------------------------------- | --------------------------- |
@@ -1364,6 +1364,25 @@ Create hashed ngram features.
 | `keys`      | Ints1d            | The input sequence. |
 | **RETURNS** | Ints1d            | The hashed ngrams.  |

+### Ops.gather_add {#gather_add tag="method" new="8.1"}
+
+
+
+- **default:**
+- **numpy:**
+- **cupy:**
+
+
+
+Gather rows from `table` with shape `(T, O)` using array `indices` with shape
+`(B, K)`, then sum the resulting array with shape `(B, K, O)` over the `K` axis.
+
+| Argument    | Type              | Description               |
+| ----------- | ----------------- | ------------------------- |
+| `table`     | Floats2d          | The array to gather from. |
+| `indices`   | Ints2d            | The indices to use.       |
+| **RETURNS** | Floats2d          | The summed rows.
| + ### Ops.scatter_add {#scatter_add tag="method"} From 0640617e15806d3a60d6cf5b2cbf5eded1a89947 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" <41898282+github-actions[bot]@users.noreply.github.com> Date: Mon, 27 Jun 2022 14:43:09 +0200 Subject: [PATCH 35/78] Auto-format code with black (#706) Co-authored-by: explosion-bot --- thinc/backends/_custom_kernels.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/thinc/backends/_custom_kernels.py b/thinc/backends/_custom_kernels.py index 56a23547f..d2d1ea133 100644 --- a/thinc/backends/_custom_kernels.py +++ b/thinc/backends/_custom_kernels.py @@ -171,9 +171,13 @@ def clipped_linear( def gather_add(table, indices, *, threads_per_block=128, num_blocks=128): if table.ndim != 2: - raise ValueError(f"gather_add expects table with dimensionality 2, was: {table.ndim}") + raise ValueError( + f"gather_add expects table with dimensionality 2, was: {table.ndim}" + ) if indices.ndim != 2: - raise ValueError(f"gather_add expects indices with dimensionality 2, was: {indices.ndim}") + raise ValueError( + f"gather_add expects indices with dimensionality 2, was: {indices.ndim}" + ) _is_float_array(table) indices = indices.astype("int32") _check_indices(indices, table.shape[0]) From 2ef3f3af071fb60ceecdf0300eeb12a1bf593fa1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Dani=C3=ABl=20de=20Kok?= Date: Wed, 29 Jun 2022 18:09:14 +0200 Subject: [PATCH 36/78] Fix MyPy error when Torch without MPS support is installed (#708) --- thinc/compat.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/thinc/compat.py b/thinc/compat.py index 75bfe69e6..d04ff97c3 100644 --- a/thinc/compat.py +++ b/thinc/compat.py @@ -34,8 +34,8 @@ has_torch_cuda_gpu = torch.cuda.device_count() != 0 has_torch_mps_gpu = ( hasattr(torch, "has_mps") - and torch.has_mps - and torch.backends.mps.is_available() + and torch.has_mps # type: ignore[attr-defined] + and torch.backends.mps.is_available() # type: ignore[attr-defined] ) has_torch_gpu = has_torch_cuda_gpu torch_version = Version(str(torch.__version__)) From c7b0d6759645babe94315a36c84d56ec877252f2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Dani=C3=ABl=20de=20Kok?= Date: Thu, 30 Jun 2022 13:44:45 +0200 Subject: [PATCH 37/78] Check that Torch-verified activations obey `inplace` (#709) And fix some activations that do not obey the `inplace` kwarg. 
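A minimal sketch of the contract these tests now enforce (illustrative
only, shown here with `NumpyOps`): `inplace=True` must write into and
return the caller's buffer, while the default must leave the input alone.

```
import numpy
from thinc.api import NumpyOps

ops = NumpyOps()
X = ops.xp.random.uniform(-3.0, 3.0, (2, 3)).astype("float32")

Y = ops.hard_sigmoid(X)                 # out-of-place: fresh buffer
assert Y is not X

X2 = X.copy()
Y2 = ops.hard_sigmoid(X2, inplace=True)  # in-place: same buffer back
assert Y2 is X2
```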
---
 thinc/backends/ops.py            | 10 +++++-----
 thinc/tests/backends/test_ops.py | 13 +++++++++----
 2 files changed, 14 insertions(+), 9 deletions(-)

diff --git a/thinc/backends/ops.py b/thinc/backends/ops.py
index b2f8e909f..2ce098f6b 100644
--- a/thinc/backends/ops.py
+++ b/thinc/backends/ops.py
@@ -722,15 +722,15 @@ def as_contig(self, data: ArrayT, dtype: Optional[DTypes] = None) -> ArrayT:
         return self.xp.ascontiguousarray(data, **kwargs)

     def sigmoid(self, X: FloatsType, *, inplace: bool = False) -> FloatsType:
-        # To prevent overflows and help with regularization/numerical stability
-        X = self.xp.clip(X, -20.0, 20.0)
-
         if inplace:
+            # To prevent overflows and help with regularization/numerical stability
+            X = self.xp.clip(X, -20.0, 20.0, out=X)
             self.xp.exp(-X, out=X)
             X += 1.0  # type: ignore[assignment]
             X **= -1.0  # type: ignore[assignment]
             return cast(FloatsType, X)
         else:
+            X = self.xp.clip(X, -20.0, 20.0)
             return cast(FloatsType, 1.0 / (1.0 + self.xp.exp(-X)))

     def backprop_sigmoid(
@@ -909,7 +909,7 @@ def backprop_relu_k(
         return self.backprop_clipped_linear(dY, X, max_val=n, inplace=inplace)

     def hard_sigmoid(self, X: FloatsType, inplace: bool = False) -> FloatsType:
-        return self.clipped_linear(X, slope=0.2, offset=0.5)
+        return self.clipped_linear(X, slope=0.2, offset=0.5, inplace=inplace)

     def backprop_hard_sigmoid(
         self, dY: FloatsType, X: FloatsType, inplace: bool = False
@@ -917,7 +917,7 @@ def backprop_hard_sigmoid(
         return self.backprop_clipped_linear(dY, X, slope=0.2, offset=0.5)

     def hard_tanh(self, X: FloatsType, inplace: bool = False) -> FloatsType:
-        return self.clipped_linear(X, min_val=-1.0, max_val=1.0)
+        return self.clipped_linear(X, min_val=-1.0, max_val=1.0, inplace=inplace)

     def backprop_hard_tanh(
         self, dY: FloatsType, X: FloatsType, inplace: bool = False

diff --git a/thinc/tests/backends/test_ops.py b/thinc/tests/backends/test_ops.py
index 238ccc13e..4463193c2 100644
--- a/thinc/tests/backends/test_ops.py
+++ b/thinc/tests/backends/test_ops.py
@@ -1302,7 +1302,10 @@ def test_compare_activations_to_torch(ops, dtype, x, torch_func):
     y_thinc = forward(x_thinc)
     y.backward()
     assert x_thinc.dtype == y_thinc.dtype
-    assert ops.xp.isclose(y_thinc, forward(x_thinc, inplace=True), atol=1e-06)
+    assert y_thinc is not x_thinc
+    y_thinc_inplace = forward(x_thinc, inplace=True)
+    assert y_thinc_inplace is x_thinc
+    assert ops.xp.isclose(y_thinc, y_thinc_inplace, atol=1e-06)
     assert ops.xp.isclose(y_thinc, y.detach(), atol=1e-06)
     x_thinc = ops.asarray([x], dtype=dtype)
     dY_thinc = ops.asarray([1.0], dtype=dtype)
@@ -1314,10 +1317,12 @@ def test_compare_activations_to_torch(ops, dtype, x, torch_func):
     if params == {"dY", "X", "Y"}:
         dx_thinc = backward(dY_thinc, Y=y_thinc, X=x_thinc)
         assert dx_thinc.dtype == x_thinc.dtype
-        assert ops.xp.isclose(
-            dx_thinc,
-            backward(dY=dY_thinc_inplace, Y=y_thinc, X=x_thinc, inplace=True),
+        assert dx_thinc is not dY_thinc
+        dx_thinc_inplace = backward(
+            dY=dY_thinc_inplace, Y=y_thinc, X=x_thinc, inplace=True
         )
+        assert dx_thinc_inplace is dY_thinc_inplace
+        assert ops.xp.isclose(dx_thinc, dx_thinc_inplace)
         assert ops.xp.isclose(x_torch.grad.item(), float(dx_thinc), atol=1e-06)
     elif params == {"Y", "dY"}:
         dx_thinc = backward(dY_thinc, Y=y_thinc)

From 7cd060e677c898137b86bbfcf50841d041c80a61 Mon Sep 17 00:00:00 2001
From: Madeesh Kannan
Date: Mon, 4 Jul 2022 15:11:27 +0200
Subject: [PATCH 38/78] Increase test deadline to 30 minutes to prevent
 spurious test failures (#714)

---
 thinc/tests/conftest.py | 5 +++++
 1 file changed, 5 insertions(+)

diff --git
a/thinc/tests/conftest.py b/thinc/tests/conftest.py index 8c1e21f0a..14d765961 100644 --- a/thinc/tests/conftest.py +++ b/thinc/tests/conftest.py @@ -1,5 +1,10 @@ import pytest +from hypothesis import settings +# Functionally disable deadline settings for tests +# to prevent spurious test failures in CI builds. +settings.register_profile("no_deadlines", deadline=2*60*1000) # in ms +settings.load_profile("no_deadlines") def pytest_sessionstart(session): # If Tensorflow is installed, attempt to enable memory growth From ea3c08e4666d6ae34bdfbd8d9c69abfe7454ec2d Mon Sep 17 00:00:00 2001 From: Madeesh Kannan Date: Wed, 6 Jul 2022 14:39:09 +0200 Subject: [PATCH 39/78] `test_mxnet_wrapper`: Feature-gate GPU test (#717) --- thinc/tests/layers/test_mxnet_wrapper.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/thinc/tests/layers/test_mxnet_wrapper.py b/thinc/tests/layers/test_mxnet_wrapper.py index 78d107dd3..69b0d5253 100644 --- a/thinc/tests/layers/test_mxnet_wrapper.py +++ b/thinc/tests/layers/test_mxnet_wrapper.py @@ -5,7 +5,7 @@ from thinc.api import Adam, ArgsKwargs, Model, Ops, MXNetWrapper from thinc.api import get_current_ops, mxnet2xp, xp2mxnet from thinc.types import Array2d, Array1d, IntsXd -from thinc.compat import has_cupy, has_mxnet +from thinc.compat import has_cupy_gpu, has_mxnet from thinc.util import to_categorical from ..util import check_input_converters, make_tempdir @@ -158,7 +158,8 @@ def test_mxnet_wrapper_to_cpu(mx_model, X: Array2d): model.to_cpu() -@pytest.mark.skipif(not has_mxnet or not has_cupy, reason="needs MXNet") +@pytest.mark.skipif(not has_mxnet, reason="needs MXNet") +@pytest.mark.skipif(not has_cupy_gpu, reason="needs GPU/cupy") def test_mxnet_wrapper_to_gpu(model: Model[Array2d, Array2d], X: Array2d): model.predict(X) model.to_gpu(0) From 28b2e8c06ba9e63bbca068a047ae96fd38e3b39f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Dani=C3=ABl=20de=20Kok?= Date: Thu, 7 Jul 2022 10:07:31 +0200 Subject: [PATCH 40/78] Add Ops.reduce_{first,last} plus tests (#710) * Add Ops.reduce_{first,last} plus tests * Add docs for reduce_{first,last} * Typing fix Co-authored-by: Sofie Van Landeghem * Typing fixes (use InT) * Fix some some reduction issues when using CuPy * One maxout test fails with the latest CuPy. Values of 5.9e-39 and 0 have an infinite relative difference. Accept with a very strict tolerance (1e-10). 
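As an illustrative sketch of the new API (example code for this message,
not part of the diff), both ops take a concatenated array plus
per-sequence lengths and also return the indices needed for backprop:

```
import numpy
from thinc.api import NumpyOps

ops = NumpyOps()
# Two sequences of lengths 3 and 2, concatenated row-wise into 5 rows.
X = ops.asarray2f(numpy.arange(10, dtype="float32").reshape(5, 2))
lengths = ops.asarray1i([3, 2])

firsts, starts_ends = ops.reduce_first(X, lengths)  # rows 0 and 3
lasts, last_idx = ops.reduce_last(X, lengths)       # rows 2 and 4

# The backward passes scatter the gradients back to those rows.
dX = ops.backprop_reduce_first(firsts, starts_ends)
assert dX.shape == X.shape
```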
Co-authored-by: Sofie Van Landeghem
---
 thinc/backends/ops.py            | 43 ++++++++++++++++++
 thinc/layers/reduce_first.py     | 28 +++++-------
 thinc/layers/reduce_last.py      | 25 +++++------
 thinc/tests/backends/test_ops.py | 63 ++++++++++++++++++++++++++
 website/docs/api-backends.md     | 76 ++++++++++++++++++++++++++++++++
 5 files changed, 205 insertions(+), 30 deletions(-)

diff --git a/thinc/backends/ops.py b/thinc/backends/ops.py
index 2ce098f6b..6b2eb36a9 100644
--- a/thinc/backends/ops.py
+++ b/thinc/backends/ops.py
@@ -1157,6 +1157,29 @@ def reduce_sum(self, X: Floats2d, lengths: Ints1d) -> Floats2d:
                 Y[i] = 0.0
         return Y

+    def reduce_first(self, X: Floats2d, lengths: Ints1d) -> Tuple[Floats2d, Ints1d]:
+        if lengths.size == 0:
+            return self.alloc2f(0, X.shape[1]), lengths
+        if not self.xp.all(lengths > 0):
+            raise ValueError("all sequence lengths must be > 0")
+        starts_ends = self.alloc1i(lengths.shape[0] + 1, zeros=False)
+        starts_ends[0] = 0
+        starts_ends[1:] = lengths.cumsum()
+        if starts_ends[-1] != X.shape[0]:
+            raise IndexError("lengths must sum up to the number of rows")
+
+        return X[starts_ends[:-1]], starts_ends
+
+    def reduce_last(self, X: Floats2d, lengths: Ints1d) -> Tuple[Floats2d, Ints1d]:
+        if lengths.size == 0:
+            return self.alloc2f(0, X.shape[1]), lengths
+        if not self.xp.all(lengths > 0):
+            raise ValueError("all sequence lengths must be > 0")
+        lasts = lengths.cumsum() - 1
+        if lasts[-1] + 1 != X.shape[0]:
+            raise IndexError("lengths must sum up to the number of rows")
+        return X[lasts], lasts
+
     def reduce_mean(self, X: Floats2d, lengths: Ints1d) -> Floats2d:
         Y = self.alloc2f(lengths.shape[0], X.shape[1], zeros=False)
         start = 0
@@ -1187,6 +1210,26 @@ def reduce_max(self, X: Floats2d, lengths: Ints1d) -> Tuple[Floats2d, Ints2d]:
             start += length
         return Y, which

+    def backprop_reduce_first(
+        self, d_firsts: Floats2d, starts_ends: Ints1d
+    ) -> Floats2d:
+        if starts_ends.size < 2:
+            raise ValueError("starts_ends should have at least size 2")
+        dX = self.alloc2f(
+            int(starts_ends[-1]), d_firsts.shape[1], dtype=d_firsts.dtype, zeros=True
+        )
+        dX[starts_ends[:-1]] = d_firsts
+        return dX
+
+    def backprop_reduce_last(self, d_lasts: Floats2d, lasts: Ints1d) -> Floats2d:
+        if lasts.size < 1:
+            raise ValueError("lasts should have at least size 1")
+        dX = self.alloc2f(
+            int(lasts[-1]) + 1, d_lasts.shape[1], dtype=d_lasts.dtype, zeros=True
+        )
+        dX[lasts] = d_lasts
+        return dX
+
     def backprop_reduce_sum(self, d_sums: Floats2d, lengths: Ints1d) -> Floats2d:
         dX = self.alloc2f(
             lengths.sum(), d_sums.shape[1], dtype=d_sums.dtype, zeros=False

diff --git a/thinc/layers/reduce_first.py b/thinc/layers/reduce_first.py
index df7541315..ab72cb5e3 100644
--- a/thinc/layers/reduce_first.py
+++ b/thinc/layers/reduce_first.py
@@ -1,35 +1,31 @@
-from typing import Callable, Tuple, cast, TypeVar
+from typing import Callable, Tuple, cast

 from ..model import Model
 from ..config import registry
-from ..types import Ragged, ArrayXd
+from ..types import Ragged, Floats2d
 from ..util import ArrayInfo

-OutT = TypeVar("OutT", bound=ArrayXd)
+
+InT = Ragged
+OutT = Floats2d


 @registry.layers("reduce_first.v1")
-def reduce_first() -> Model[Ragged, OutT]:
+def reduce_first() -> Model[InT, OutT]:
     """Reduce ragged-formatted sequences to their first element."""
     return Model("reduce_first", forward)


 def forward(
-    model: Model[Ragged, OutT], Xr: Ragged, is_train: bool
-) -> Tuple[OutT, Callable[[OutT], Ragged]]:
-    starts = model.ops.alloc1i(Xr.lengths.shape[0])
-    starts[1:] += Xr.lengths.cumsum()[:-1]
-    X = Xr.dataXd
-    Y = cast(OutT, X[starts])
-    x_shape = Xr.dataXd.shape
-    lengths = Xr.lengths
+    model: Model[InT, OutT], Xr: InT, is_train: bool
+) -> Tuple[OutT, Callable[[OutT], InT]]:
+    Y, starts_ends = model.ops.reduce_first(cast(Floats2d, Xr.data), Xr.lengths)
     array_info = ArrayInfo.from_array(Y)

-    def backprop(dY: OutT) -> Ragged:
+    def backprop(dY: OutT) -> InT:
         array_info.check_consistency(dY)
-        dX: OutT = model.ops.alloc(x_shape, dtype=dY.dtype)
-        dX[starts] = dY  # type: ignore[assignment]
-        return Ragged(dX, lengths)
+        dX = model.ops.backprop_reduce_first(dY, starts_ends)
+        return Ragged(dX, Xr.lengths)

     return Y, backprop

diff --git a/thinc/layers/reduce_last.py b/thinc/layers/reduce_last.py
index e45a65d12..b8194ec2b 100644
--- a/thinc/layers/reduce_last.py
+++ b/thinc/layers/reduce_last.py
@@ -1,32 +1,29 @@
-from typing import Callable, Tuple, cast, TypeVar
+from typing import Callable, Tuple, cast

 from ..model import Model
 from ..config import registry
-from ..types import Ragged, ArrayXd
+from ..types import Ragged, Floats2d
 from ..util import ArrayInfo

-OutT = TypeVar("OutT", bound=ArrayXd)
+
+InT = Ragged
+OutT = Floats2d


 @registry.layers("reduce_last.v1")
-def reduce_last() -> Model[Ragged, OutT]:
+def reduce_last() -> Model[InT, OutT]:
     """Reduce ragged-formatted sequences to their last element."""
     return Model("reduce_last", forward)


 def forward(
-    model: Model[Ragged, OutT], Xr: Ragged, is_train: bool
-) -> Tuple[OutT, Callable[[OutT], Ragged]]:
-    ends = Xr.lengths.cumsum() - 1
-    Y = cast(OutT, Xr.dataXd[ends])
-    x_shape = Xr.dataXd.shape
-    lengths = Xr.lengths
+    model: Model[InT, OutT], Xr: InT, is_train: bool
+) -> Tuple[OutT, Callable[[OutT], InT]]:
+    Y, lasts = model.ops.reduce_last(cast(Floats2d, Xr.data), Xr.lengths)
     array_info = ArrayInfo.from_array(Y)

-    def backprop(dY: OutT) -> Ragged:
+    def backprop(dY: OutT) -> InT:
         array_info.check_consistency(dY)
-        dX: OutT = model.ops.alloc(x_shape, dtype=dY.dtype)
-        dX[ends] = dY  # type: ignore[assignment]
-        return Ragged(dX, lengths)
+        dX = model.ops.backprop_reduce_last(dY, lasts)
+        return Ragged(dX, Xr.lengths)

     return Y, backprop

diff --git a/thinc/tests/backends/test_ops.py b/thinc/tests/backends/test_ops.py
index 4463193c2..7de864623 100644
--- a/thinc/tests/backends/test_ops.py
+++ b/thinc/tests/backends/test_ops.py
@@ -261,6 +261,7 @@ def test_maxout(ops, dtype, X):
     ops.xp.testing.assert_allclose(
         ops.xp.take_along_axis(X, ops.xp.expand_dims(which, -1), axis=-1),
         ops.xp.expand_dims(expected_best, -1),
+        atol=1e-10,
     )

@@ -816,6 +817,68 @@ def test_backprop_fails_with_incorrect_length(ops, dtype):
     )

+@pytest.mark.parametrize("ops", ALL_OPS)
+@pytest.mark.parametrize("dtype", FLOAT_TYPES)
+def test_reduce_first(ops, dtype):
+    X = ops.asarray2f(
+        [[1.0, 6.0], [2.0, 7.0], [3.0, 8.0], [4.0, 9.0], [5.0, 10.0]], dtype=dtype
+    )
+    lengths = ops.asarray1i([3, 2])
+    Y, starts_ends = ops.reduce_first(X, lengths)
+    ops.xp.testing.assert_array_equal(starts_ends, ops.asarray1i([0, 3, 5]))
+    ops.xp.testing.assert_allclose(Y, [[1.0, 6.0], [4.0, 9.0]])
+
+    lengths = ops.asarray1i([3, 0, 2])
+    with pytest.raises(ValueError, match=r"all sequence lengths must be > 0"):
+        ops.reduce_first(X, lengths)
+
+    lengths = ops.asarray1i([3, 2, 1])
+    with pytest.raises(IndexError, match=r"lengths must sum up to the number of rows"):
+        ops.reduce_first(X, lengths)
+
+
+@pytest.mark.parametrize("ops", ALL_OPS)
+@pytest.mark.parametrize("dtype", FLOAT_TYPES)
+def test_backprop_reduce_first(ops, dtype):
+    dY = ops.asarray2f([[1.0, 3.0], [2.0, 4.0]], dtype=dtype)
+    starts_ends = ops.asarray1i([0, 3, 5])
+    dX = ops.backprop_reduce_first(dY, starts_ends)
+    ops.xp.testing.assert_allclose(
+        dX, [[1.0, 3.0], [0.0, 0.0], [0.0, 0.0], [2.0, 4.0], [0.0, 0.0]]
+    )
+
+
+@pytest.mark.parametrize("ops", ALL_OPS)
+@pytest.mark.parametrize("dtype", FLOAT_TYPES)
+def test_reduce_last(ops, dtype):
+    X = ops.asarray2f(
+        [[1.0, 6.0], [2.0, 7.0], [3.0, 8.0], [4.0, 9.0], [5.0, 10.0]], dtype=dtype
+    )
+    lengths = ops.asarray1i([3, 2])
+    Y, lasts = ops.reduce_last(X, lengths)
+    ops.xp.testing.assert_array_equal(lasts, ops.asarray1i([2, 4]))
+    ops.xp.testing.assert_allclose(Y, [[3.0, 8.0], [5.0, 10.0]])
+
+    lengths = ops.asarray1i([3, 0, 2])
+    with pytest.raises(ValueError, match=r"all sequence lengths must be > 0"):
+        ops.reduce_last(X, lengths)
+
+    lengths = ops.asarray1i([3, 2, 1])
+    with pytest.raises(IndexError, match=r"lengths must sum up to the number of rows"):
+        ops.reduce_last(X, lengths)
+
+
+@pytest.mark.parametrize("ops", ALL_OPS)
+@pytest.mark.parametrize("dtype", FLOAT_TYPES)
+def test_backprop_reduce_last(ops, dtype):
+    dY = ops.asarray2f([[1.0, 3.0], [2.0, 4.0]], dtype=dtype)
+    lasts = ops.asarray1i([2, 4])
+    dX = ops.backprop_reduce_last(dY, lasts)
+    ops.xp.testing.assert_allclose(
+        dX, [[0.0, 0.0], [0.0, 0.0], [1.0, 3.0], [0.0, 0.0], [2.0, 4.0]]
+    )
+
+
 @pytest.mark.parametrize("ops", ALL_OPS)
 @pytest.mark.parametrize("dtype", FLOAT_TYPES)
 def test_reduce_max_sm(ops, dtype):

diff --git a/website/docs/api-backends.md b/website/docs/api-backends.md
index 5b62c02b1..f2cdb03ce 100644
--- a/website/docs/api-backends.md
+++ b/website/docs/api-backends.md
@@ -1213,6 +1213,82 @@ Backpropagate the hard Swish MobileNet activation.
 | `inplace`   | bool              | If `True`, the `dY` array is modified in place. |
 | **RETURNS** | FloatsXd          | The gradient of the input.                      |

+### Ops.reduce_first {#reduce_first tag="method"}
+
+
+
+- **default:**
+- **numpy:** default
+- **cupy:** default
+
+
+
+Perform sequence-wise first pooling for data in the ragged format. Zero-length
+sequences are not allowed. A `ValueError` is raised if any element in `lengths`
+is zero.
+
+| Argument    | Type                    | Description                                                            |
+| ----------- | ----------------------- | ---------------------------------------------------------------------- |
+| `X`         | Floats2d                | The concatenated sequences.                                            |
+| `lengths`   | Ints1d                  | The sequence lengths.                                                  |
+| **RETURNS** | Tuple[Floats2d,Ints1d]  | The first vector of each sequence and the sequence start/end indices.  |
+
+### Ops.backprop_reduce_first {#backprop_reduce_first tag="method"}
+
+
+
+- **default:**
+- **numpy:** default
+- **cupy:** default
+
+
+
+Backpropagate the `reduce_first` operation.
+
+| Argument      | Type      | Description                                 |
+| ------------- | --------- | ------------------------------------------- |
+| `d_firsts`    | Floats2d  | The gradient of the outputs.                |
+| `starts_ends` | Ints1d    | The sequence start/end indices.             |
+| **RETURNS**   | Floats2d  | The gradient of the concatenated sequences. |
+
+### Ops.reduce_last {#reduce_last tag="method"}
+
+
+
+- **default:**
+- **numpy:** default
+- **cupy:** default
+
+
+
+Perform sequence-wise last pooling for data in the ragged format. Zero-length
+sequences are not allowed. A `ValueError` is raised if any element in `lengths`
+is zero.
+
+| Argument    | Type                    | Description                 |
+| ----------- | ----------------------- | --------------------------- |
+| `X`         | Floats2d                | The concatenated sequences. |
+| `lengths`   | Ints1d                  | The sequence lengths.       |
+| **RETURNS** | Tuple[Floats2d,Ints1d]  | The last vector of each sequence and the indices of the last sequence elements. |
+
+### Ops.backprop_reduce_last {#backprop_reduce_last tag="method"}
+
+
+
+- **default:**
+- **numpy:** default
+- **cupy:** default
+
+
+
+Backpropagate the `reduce_last` operation.
+
+| Argument    | Type      | Description                                 |
+| ----------- | --------- | ------------------------------------------- |
+| `d_lasts`   | Floats2d  | The gradient of the outputs.                |
+| `lasts`     | Ints1d    | Indices of the last sequence elements.      |
+| **RETURNS** | Floats2d  | The gradient of the concatenated sequences. |
+
 ### Ops.reduce_sum {#reduce_sum tag="method"}

From 671d01ebacff18e8ece77c72068fc4b35ba91943 Mon Sep 17 00:00:00 2001
From: kadarakos
Date: Thu, 7 Jul 2022 10:52:28 +0200
Subject: [PATCH 41/78] Label smooth threshold fix (#707)

* correcting label smoothing param constraint

* test new label smooth validation error

* less than 0 input validation

* string concat

* small update to error msg

* fix max smoothing coefficient

* double check error message

* Update thinc/util.py

Co-authored-by: Adriane Boyd

* test error message fix

Co-authored-by: Sofie Van Landeghem
Co-authored-by: Adriane Boyd
---
 thinc/tests/test_util.py | 10 ++++++++++
 thinc/util.py            | 19 +++++++++++++------
 2 files changed, 23 insertions(+), 6 deletions(-)

diff --git a/thinc/tests/test_util.py b/thinc/tests/test_util.py
index 715d381d5..aa8ca3c81 100644
--- a/thinc/tests/test_util.py
+++ b/thinc/tests/test_util.py
@@ -115,6 +115,16 @@ def test_to_categorical(label_smoothing):
     ):
         to_categorical(numpy.asarray([0, 0, 0]), label_smoothing=0.01),

+    with pytest.raises(
+        ValueError, match=r"label_smoothing parameter"
+    ):
+        to_categorical(numpy.asarray([0, 1, 2, 3, 4]), label_smoothing=0.8)
+
+    with pytest.raises(
+        ValueError, match=r"label_smoothing parameter"
+    ):
+        to_categorical(numpy.asarray([0, 1, 2, 3, 4]), label_smoothing=0.88)
+

 def test_convert_recursive():
     is_match = lambda obj: obj == "foo"

diff --git a/thinc/util.py b/thinc/util.py
index e46c62447..42c7f07b8 100644
--- a/thinc/util.py
+++ b/thinc/util.py
@@ -212,16 +212,15 @@ def to_categorical(
     *,
     label_smoothing: float = 0.0,
 ) -> FloatsXd:
-    if not 0.0 <= label_smoothing < 0.5:
-        raise ValueError(
-            "label_smoothing should be greater or "
-            "equal to 0.0 and less than 0.5, "
-            f"but {label_smoothing} was provided."
-        )

     if n_classes is None:
         n_classes = int(numpy.max(Y) + 1)  # type: ignore

+    if label_smoothing < 0.0:
+        raise ValueError(
+            "Label-smoothing parameter has to be greater than or equal to 0"
+        )
+
     if label_smoothing == 0.0:
         if n_classes == 0:
             raise ValueError("n_classes should be at least 1")
@@ -234,6 +233,14 @@ def to_categorical(
     )
     nongold_prob = label_smoothing / (n_classes - 1)

+    max_smooth = (n_classes - 1) / n_classes
+    if n_classes > 1 and label_smoothing >= max_smooth:
+        raise ValueError(
+            f"For {n_classes} classes "
+            "label_smoothing parameter has to be less than "
+            f"{max_smooth}, but found {label_smoothing}."
+        )
+
     xp = get_array_module(Y)
     label_distr = xp.full((n_classes, n_classes), nongold_prob, dtype="float32")
     xp.fill_diagonal(label_distr, 1 - label_smoothing)

From 274c41ccd5b398e887975feb5cf532ab462d5199 Mon Sep 17 00:00:00 2001
From: Adriane Boyd
Date: Thu, 7 Jul 2022 13:25:29 +0200
Subject: [PATCH 42/78] Set version to v8.1.0 (#718)

---
 thinc/about.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/thinc/about.py b/thinc/about.py
index 3c12247e3..0394a50c7 100644
--- a/thinc/about.py
+++ b/thinc/about.py
@@ -1,2 +1,2 @@
-__version__ = "v8.1.0.dev3"
+__version__ = "v8.1.0"
 __release__ = True

From 9f9e494ba24119b833480e31f53f6cbb825633c1 Mon Sep 17 00:00:00 2001
From: kadarakos
Date: Thu, 7 Jul 2022 19:45:57 +0200
Subject: [PATCH 43/78] `get_array_module` with non-array input returns `None`
 (#703)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

* if not xp array module is None

* raise error

* update test

* more detailed error

* Update thinc/tests/test_util.py

Co-authored-by: Daniël de Kok

* Update thinc/util.py

Co-authored-by: Adriane Boyd

* Update thinc/tests/test_util.py

Co-authored-by: Daniël de Kok

Co-authored-by: svlandeg
Co-authored-by: Adriane Boyd
---
 thinc/tests/test_util.py | 30 +++++++++++++++++++++++++-----
 thinc/util.py            | 11 +++++++++--
 2 files changed, 34 insertions(+), 7 deletions(-)

diff --git a/thinc/tests/test_util.py b/thinc/tests/test_util.py
index aa8ca3c81..0111cb6cc 100644
--- a/thinc/tests/test_util.py
+++ b/thinc/tests/test_util.py
@@ -3,11 +3,19 @@ from hypothesis import given
 from thinc.api import get_width, Ragged, Padded
 from thinc.util import get_array_module, is_numpy_array, to_categorical
+from thinc.util import is_cupy_array
 from thinc.util import convert_recursive
 from thinc.types import ArgsKwargs

 from . import strategies

+ALL_XP = [numpy]
+try:
+    import cupy
+    ALL_XP.append(cupy)
+except ImportError:
+    pass
+

 @pytest.mark.parametrize(
     "obj,width",
@@ -39,11 +47,23 @@ def test_get_width_fail(obj):
         get_width(obj)


-def test_array_module_cpu_gpu_helpers():
-    xp = get_array_module(0)
-    assert hasattr(xp, "ndarray")
-    assert is_numpy_array(numpy.zeros((1, 2)))
-    assert not is_numpy_array((1, 2))
+@pytest.mark.parametrize("xp", ALL_XP)
+def test_array_module_cpu_gpu_helpers(xp):
+    error = ("Only numpy and cupy arrays are supported"
+             ", but found <class 'int'> instead. If "
+             "get_array_module module wasn't called "
+             "directly, this might indicate a bug in Thinc.")
+    with pytest.raises(ValueError, match=error):
+        get_array_module(0)
+    zeros = xp.zeros((1, 2))
+    xp_ = get_array_module(zeros)
+    assert xp_ == xp
+    if xp == numpy:
+        assert is_numpy_array(zeros)
+        assert not is_numpy_array((1, 2))
+    else:
+        assert is_cupy_array(zeros)
+        assert not is_cupy_array((1, 2))


 @given(

diff --git a/thinc/util.py b/thinc/util.py
index 42c7f07b8..7add0aaf7 100644
--- a/thinc/util.py
+++ b/thinc/util.py
@@ -47,10 +47,17 @@ def get_array_module(arr):  # pragma: no cover
-    if is_cupy_array(arr):
+    if is_numpy_array(arr):
+        return numpy
+    elif is_cupy_array(arr):
         return cupy
     else:
-        return numpy
+        raise ValueError(
+            "Only numpy and cupy arrays are supported"
+            f", but found {type(arr)} instead. If "
+            "get_array_module module wasn't called "
+            "directly, this might indicate a bug in Thinc."
+        )

From eb5c38b9ef636363b442f0c7c276b9cf7291254a Mon Sep 17 00:00:00 2001
From: Adriane Boyd
Date: Fri, 8 Jul 2022 10:39:51 +0200
Subject: [PATCH 44/78] Update build constraints and requirements for aarch64
 wheels (#722)

* Extend build constraints for aarch64

* Skip mypy for aarch64
---
 build-constraints.txt | 6 ++++--
 requirements.txt      | 2 +-
 2 files changed, 5 insertions(+), 3 deletions(-)

diff --git a/build-constraints.txt b/build-constraints.txt
index cf5fe3284..956973abf 100644
--- a/build-constraints.txt
+++ b/build-constraints.txt
@@ -1,6 +1,8 @@
 # build version constraints for use with wheelwright + multibuild
-numpy==1.15.0; python_version<='3.7'
-numpy==1.17.3; python_version=='3.8'
+numpy==1.15.0; python_version<='3.7' and platform_machine!='aarch64'
+numpy==1.19.2; python_version<='3.7' and platform_machine=='aarch64'
+numpy==1.17.3; python_version=='3.8' and platform_machine!='aarch64'
+numpy==1.19.2; python_version=='3.8' and platform_machine=='aarch64'
 numpy==1.19.3; python_version=='3.9'
 numpy==1.21.3; python_version=='3.10'
 numpy; python_version>='3.11'

diff --git a/requirements.txt b/requirements.txt
index 02817ca87..fb5fa7b51 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -22,7 +22,7 @@ pytest-cov>=2.7.0,<2.8.0
 coverage>=5.0.0,<6.0.0
 mock>=2.0.0,<3.0.0
 flake8>=3.5.0,<3.6.0
-mypy>=0.901,<0.960
+mypy>=0.901,<0.960; platform_machine!='aarch64'
 types-mock>=0.1.1
 types-contextvars>=0.1.2; python_version < "3.7"
 types-dataclasses>=0.1.3; python_version < "3.7"

From 5508f53586275a2c99290c2efdff893b0ee953ae Mon Sep 17 00:00:00 2001
From: "github-actions[bot]" <41898282+github-actions[bot]@users.noreply.github.com>
Date: Fri, 8 Jul 2022 10:47:31 +0200
Subject: [PATCH 45/78] Auto-format code with black (#723)

Co-authored-by: explosion-bot
---
 thinc/tests/conftest.py  |  3 ++-
 thinc/tests/test_util.py | 19 +++++++++----------
 2 files changed, 11 insertions(+), 11 deletions(-)

diff --git a/thinc/tests/conftest.py b/thinc/tests/conftest.py
index 14d765961..19b5137d3 100644
--- a/thinc/tests/conftest.py
+++ b/thinc/tests/conftest.py
@@ -3,9 +3,10 @@
 # Functionally disable deadline settings for tests
 # to prevent spurious test failures in CI builds.
-settings.register_profile("no_deadlines", deadline=2*60*1000)  # in ms
+settings.register_profile("no_deadlines", deadline=2 * 60 * 1000)  # in ms
 settings.load_profile("no_deadlines")

+
 def pytest_sessionstart(session):
     # If Tensorflow is installed, attempt to enable memory growth
     # to prevent it from allocating all of the GPU's free memory

diff --git a/thinc/tests/test_util.py b/thinc/tests/test_util.py
index 0111cb6cc..8d2d0058d 100644
--- a/thinc/tests/test_util.py
+++ b/thinc/tests/test_util.py
@@ -12,6 +12,7 @@ ALL_XP = [numpy]
 try:
     import cupy
+
     ALL_XP.append(cupy)
 except ImportError:
     pass
@@ -49,10 +50,12 @@ def test_get_width_fail(obj):

 @pytest.mark.parametrize("xp", ALL_XP)
 def test_array_module_cpu_gpu_helpers(xp):
-    error = ("Only numpy and cupy arrays are supported"
-             ", but found <class 'int'> instead. If "
-             "get_array_module module wasn't called "
-             "directly, this might indicate a bug in Thinc.")
+    error = (
+        "Only numpy and cupy arrays are supported"
+        ", but found <class 'int'> instead. If "
+        "get_array_module module wasn't called "
+        "directly, this might indicate a bug in Thinc."
+ ) with pytest.raises(ValueError, match=error): get_array_module(0) zeros = xp.zeros((1, 2)) @@ -135,14 +138,10 @@ def test_to_categorical(label_smoothing): ): to_categorical(numpy.asarray([0, 0, 0]), label_smoothing=0.01), - with pytest.raises( - ValueError, match=r"label_smoothing parameter" - ): + with pytest.raises(ValueError, match=r"label_smoothing parameter"): to_categorical(numpy.asarray([0, 1, 2, 3, 4]), label_smoothing=0.8) - with pytest.raises( - ValueError, match=r"label_smoothing parameter" - ): + with pytest.raises(ValueError, match=r"label_smoothing parameter"): to_categorical(numpy.asarray([0, 1, 2, 3, 4]), label_smoothing=0.88) From 3dcd03d975f5cde59b8203310ff1ac07ba005304 Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Fri, 8 Jul 2022 11:07:52 +0200 Subject: [PATCH 46/78] Fix version string (#724) --- thinc/about.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/thinc/about.py b/thinc/about.py index 0394a50c7..9388517fe 100644 --- a/thinc/about.py +++ b/thinc/about.py @@ -1,2 +1,2 @@ -__version__ = "v8.1.0" +__version__ = "8.1.0" __release__ = True From 17846c45ed5dab31988a99a49e012d06bbfeb68a Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Fri, 8 Jul 2022 11:13:33 +0200 Subject: [PATCH 47/78] Extend to mypy<0.970 (#725) --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index fb5fa7b51..05d77e403 100644 --- a/requirements.txt +++ b/requirements.txt @@ -22,7 +22,7 @@ pytest-cov>=2.7.0,<2.8.0 coverage>=5.0.0,<6.0.0 mock>=2.0.0,<3.0.0 flake8>=3.5.0,<3.6.0 -mypy>=0.901,<0.960; platform_machine!='aarch64' +mypy>=0.901,<0.970; platform_machine!='aarch64' types-mock>=0.1.1 types-contextvars>=0.1.2; python_version < "3.7" types-dataclasses>=0.1.3; python_version < "3.7" From 2da00a39e3971ed63eeb497aa455e49a3a4bfccb Mon Sep 17 00:00:00 2001 From: Christian Clauss Date: Thu, 14 Jul 2022 00:40:08 +0200 Subject: [PATCH 48/78] Fix typo --- thinc/types.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/thinc/types.py b/thinc/types.py index 04b81946a..f44a64e80 100644 --- a/thinc/types.py +++ b/thinc/types.py @@ -316,7 +316,7 @@ class Floats1d(_Array1d, _Floats): @classmethod def __get_validators__(cls): - """Runtine validation for pydantic.""" + """Runtime validation for pydantic.""" yield lambda v: validate_array(v, ndim=1, dtype="f") def __iter__(self) -> Iterator[float]: ... From 40c129f6ee51d4060f1af6b29f903ce51fd89736 Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Mon, 18 Jul 2022 17:07:42 +0200 Subject: [PATCH 49/78] Update build constraints for arm64 and aarch64 wheels (#716) --- website/docs/install.md | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/website/docs/install.md b/website/docs/install.md index 70ff24fc3..14f5d7bd7 100644 --- a/website/docs/install.md +++ b/website/docs/install.md @@ -47,9 +47,8 @@ is_gpu = prefer_gpu() ### Using build constraints when compiling from source If you install Thinc from source or with `pip` for platforms where there are not -binary wheels on PyPI (currently any non-`x86_64` platforms, so commonly Linux -`aarch64` or OS X M1/`arm64`), you may need to use build constraints if any -package in your environment requires an older version of `numpy`. +binary wheels on PyPI, you may need to use build constraints if any package +in your environment requires an older version of `numpy`. 
If `numpy` gets downgraded from the most recent release at any point after you've compiled `thinc`, you might see an error that looks like this:
@@ -72,9 +71,9 @@ pip install thinc --no-cache-dir ```
Our build constraints currently specify the oldest supported `numpy` available -on PyPI for `x86_64`. Depending on your platform and environment, you may want -to customize the specific versions of `numpy`. For other platforms, you can have -a look at SciPy's +on PyPI for `x86_64` and `aarch64`. Depending on your platform and environment, +you may want to customize the specific versions of `numpy`. For other +platforms, you can have a look at SciPy's [`oldest-supported-numpy`](https://github.com/scipy/oldest-supported-numpy/blob/main/setup.cfg) package to see what the oldest recommended versions of `numpy` are.
From 5a4f868c0ce275523213be5d1c5681d6a84f54ce Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Dani=C3=ABl=20de=20Kok?= Date: Thu, 28 Jul 2022 13:25:21 +0200 Subject: [PATCH 50/78] Ops: replace FloatsType by constrained typevar (#720)
* Ops: replace FloatsType by constrained typevar
Ops used the `FloatsType`, which had `FloatsXd` as its bound. MyPy could not infer that code such as the following is correct,
``` def dish(self, X: FloatsType, inplace: bool = False) -> FloatsType: tmp = X * X # ... ```
because the inferred type is the union (or a subtype). If we instead constrain the type variable as follows:
``` FloatsType = TypeVar("FloatsType", Floats1d, Floats2d, Floats3d, Floats4d) ```
the type parameter will be instantiated with a single concrete type, solving such issues.
* Remove a bunch of casts and ignores that are not necessary anymore
--- thinc/backends/ops.py | 144 ++++++++++++++--------------- thinc/layers/sigmoid_activation.py | 16 ++-- thinc/types.py | 1 + 3 files changed, 80 insertions(+), 81 deletions(-)
diff --git a/thinc/backends/ops.py b/thinc/backends/ops.py index 6b2eb36a9..c9fb10aae 100644 --- a/thinc/backends/ops.py +++ b/thinc/backends/ops.py
@@ -1,6 +1,6 @@ import math -from typing import Optional, List, Tuple, Sequence, Union, cast, TypeVar +from typing import Optional, List, Tuple, Sequence, Type, Union, cast, TypeVar from typing import Iterator, overload import numpy import itertools
@@ -9,6 +9,7 @@ from ..types import Floats1d, Floats2d, Floats3d, Floats4d from ..types import Array1d, Array2d, Array3d, Array4d, ListXd from ..types import FloatsXd, Ints1d, Ints2d, Ints3d, Ints4d, IntsXd, _Floats +from ..types import FloatsXdT from ..types import DeviceTypes, Generator, Padded, Batchable, SizedGenerator from ..util import get_array_module, is_xp_array, to_numpy
@@ -16,7 +17,6 @@ ArrayT = TypeVar("ArrayT", bound=ArrayXd) FloatsT = TypeVar("FloatsT", bound=_Floats) -FloatsType = TypeVar("FloatsType", bound=FloatsXd) SQRT2PI = math.sqrt(2.0 / math.pi) INV_SQRT2 = 1.0 / math.sqrt(2.0) INV_SQRT_2PI = 1.0 / math.sqrt(2.0 * math.pi)
@@ -721,29 +721,29 @@ def as_contig(self, data: ArrayT, dtype: Optional[DTypes] = None) -> ArrayT: kwargs = {"dtype": dtype} if dtype is not None else {} return self.xp.ascontiguousarray(data, **kwargs) - def sigmoid(self, X: FloatsType, *, inplace: bool = False) -> FloatsType: + def sigmoid(self, X: FloatsXdT, *, inplace: bool = False) -> FloatsXdT: if inplace: # To prevent overflows and help with regularization/numerical stability X = self.xp.clip(X, -20.0, 20.0, out=X) self.xp.exp(-X, out=X) - X += 1.0 # type: ignore[assignment] - X **= -1.0 # type: ignore[assignment] - return cast(FloatsType, X) + X += 1.0 + X **= -1.0 + return X else: X = 
self.xp.clip(X, -20.0, 20.0) - return cast(FloatsType, 1.0 / (1.0 + self.xp.exp(-X))) + return 1.0 / (1.0 + self.xp.exp(-X)) def backprop_sigmoid( - self, dY: FloatsType, Y: FloatsType, *, inplace: bool = False - ) -> FloatsType: + self, dY: FloatsXdT, Y: FloatsXdT, *, inplace: bool = False + ) -> FloatsXdT: if inplace: self.dsigmoid(Y, inplace=True) - Y *= dY # type: ignore + Y *= dY return Y else: - return dY * self.dsigmoid(Y, inplace=inplace) # type: ignore + return dY * self.dsigmoid(Y, inplace=inplace) - def dsigmoid(self, Y: FloatsType, *, inplace: bool = False) -> FloatsType: + def dsigmoid(self, Y: FloatsXdT, *, inplace: bool = False) -> FloatsXdT: if inplace: Y *= 1 - Y return Y @@ -864,30 +864,30 @@ def backprop_relu( def clipped_linear( self, - X: FloatsType, + X: FloatsXdT, slope: float = 1.0, offset: float = 0.0, min_val: float = 0.0, max_val: float = 1.0, inplace: bool = False, - ) -> FloatsType: + ) -> FloatsXdT: if inplace: - X *= slope # type: ignore[assignment] - X += offset # type: ignore[assignment] - return cast(FloatsType, self.xp.clip(X, min_val, max_val, out=X)) - out = X * slope + offset # type: ignore[assignment] - return cast(FloatsType, self.xp.clip(out, min_val, max_val)) + X *= slope + X += offset + return self.xp.clip(X, min_val, max_val, out=X) + out = X * slope + offset + return self.xp.clip(out, min_val, max_val) def backprop_clipped_linear( self, - dY: FloatsType, - X: FloatsType, + dY: FloatsXdT, + X: FloatsXdT, slope: float = 1.0, offset: float = 0.0, min_val: float = 0.0, max_val: float = 1.0, inplace: bool = False, - ) -> FloatsType: + ) -> FloatsXdT: low = (min_val - offset) / slope high = (max_val - offset) / slope slope = self.xp.float64(slope).astype(X.dtype) @@ -898,60 +898,58 @@ def backprop_clipped_linear( return dY return dY * dX - def relu_k( - self, X: FloatsType, n: float = 6.0, inplace: bool = False - ) -> FloatsType: + def relu_k(self, X: FloatsXdT, n: float = 6.0, inplace: bool = False) -> FloatsXdT: return self.clipped_linear(X, max_val=n, inplace=inplace) def backprop_relu_k( - self, dY: FloatsType, X: FloatsType, n: float = 6.0, inplace: bool = False - ) -> FloatsType: + self, dY: FloatsXdT, X: FloatsXdT, n: float = 6.0, inplace: bool = False + ) -> FloatsXdT: return self.backprop_clipped_linear(dY, X, max_val=n, inplace=inplace) - def hard_sigmoid(self, X: FloatsType, inplace: bool = False) -> FloatsType: + def hard_sigmoid(self, X: FloatsXdT, inplace: bool = False) -> FloatsXdT: return self.clipped_linear(X, slope=0.2, offset=0.5, inplace=inplace) def backprop_hard_sigmoid( - self, dY: FloatsType, X: FloatsType, inplace: bool = False - ) -> FloatsType: + self, dY: FloatsXdT, X: FloatsXdT, inplace: bool = False + ) -> FloatsXdT: return self.backprop_clipped_linear(dY, X, slope=0.2, offset=0.5) - def hard_tanh(self, X: FloatsType, inplace: bool = False) -> FloatsType: + def hard_tanh(self, X: FloatsXdT, inplace: bool = False) -> FloatsXdT: return self.clipped_linear(X, min_val=-1.0, max_val=1.0, inplace=inplace) def backprop_hard_tanh( - self, dY: FloatsType, X: FloatsType, inplace: bool = False - ) -> FloatsType: + self, dY: FloatsXdT, X: FloatsXdT, inplace: bool = False + ) -> FloatsXdT: return self.backprop_clipped_linear(dY, X, min_val=-1.0, max_val=1.0) - def swish(self, X: FloatsType, inplace: bool = False) -> FloatsType: + def swish(self, X: FloatsXdT, inplace: bool = False) -> FloatsXdT: if inplace: - X *= self.sigmoid(X) # type: ignore[operator, assignment] - return cast(FloatsType, X) - out = X * self.sigmoid(X) # type: 
ignore[operator] - return cast(FloatsType, out) + X *= self.sigmoid(X) + return X + out = X * self.sigmoid(X) + return out def backprop_swish( - self, dY: FloatsType, X: FloatsType, Y: FloatsType, inplace: bool = False - ) -> FloatsType: - Y = Y + self.sigmoid(X) * (1 - Y) # type: ignore[operator] + self, dY: FloatsXdT, X: FloatsXdT, Y: FloatsXdT, inplace: bool = False + ) -> FloatsXdT: + Y = Y + self.sigmoid(X) * (1 - Y) if inplace: - dY *= Y # type: ignore[operator, assignment] - return cast(FloatsType, dY) - out = dY * Y # type: ignore[operator] - return cast(FloatsType, out) + dY *= Y + return dY + out = dY * Y + return out # Following https://www.scitepress.org/Papers/2019/74696/74696.pdf - def hard_swish(self, X: FloatsType, inplace: bool = False) -> FloatsType: + def hard_swish(self, X: FloatsXdT, inplace: bool = False) -> FloatsXdT: if inplace: - X *= self.hard_sigmoid(X) # type: ignore[operator, assignment] - return cast(FloatsType, X) - out = X * self.hard_sigmoid(X) # type: ignore[operator] - return cast(FloatsType, out) + X *= self.hard_sigmoid(X) + return X + out = X * self.hard_sigmoid(X) + return out def backprop_hard_swish( - self, dY: FloatsType, X: FloatsType, inplace: bool = False - ) -> FloatsType: + self, dY: FloatsXdT, X: FloatsXdT, inplace: bool = False + ) -> FloatsXdT: dX = X * 0.4 + 0.5 dX[X > 2.5] = 1.0 dX[X < -2.5] = 0 @@ -961,15 +959,15 @@ def backprop_hard_swish( return dY * dX # From https://arxiv.org/pdf/1905.02244v5.pdf - def hard_swish_mobilenet(self, X: FloatsType, inplace: bool = False) -> FloatsType: + def hard_swish_mobilenet(self, X: FloatsXdT, inplace: bool = False) -> FloatsXdT: if inplace: X *= self.relu_k(X + 3) / 6 return X return X * (self.relu_k(X + 3) / 6) def backprop_hard_swish_mobilenet( - self, dY: FloatsType, X: FloatsType, inplace: bool = False - ) -> FloatsType: + self, dY: FloatsXdT, X: FloatsXdT, inplace: bool = False + ) -> FloatsXdT: dX = (1 / 6) * (X * 2.0 + 3.0) dX[X > 3.0] = 1.0 dX[X < -3.0] = 0 @@ -980,7 +978,7 @@ def backprop_hard_swish_mobilenet( # Code snippet taken from: # https://www.johndcook.com/blog/2009/01/19/stand-alone-error-function-erf/ - def erf(self, X: FloatsType) -> FloatsType: + def erf(self, X: FloatsXdT) -> FloatsXdT: # save the sign of x sign = self.xp.sign(X) X = self.xp.abs(X) @@ -1000,12 +998,12 @@ def erf(self, X: FloatsType) -> FloatsType: out = out.astype(X.dtype) return out - def sechsq(self, X: FloatsType) -> FloatsType: + def sechsq(self, X: FloatsXdT) -> FloatsXdT: # Avoid overflow in cosh. Clipping at |20| has an error of 1.7e-17. 
X = self.xp.clip(X, -20.0, 20.0) return (1 / self.xp.cosh(X)) ** 2 - def gelu_approx(self, X: FloatsType, inplace: bool = False) -> FloatsType: + def gelu_approx(self, X: FloatsXdT, inplace: bool = False) -> FloatsXdT: tmp = 1.0 + self.xp.tanh(SQRT2PI * (X + 0.044715 * self.xp.power(X, 3))) tmp *= 0.5 tmp = tmp.astype(X.dtype) @@ -1018,9 +1016,9 @@ def gelu_approx(self, X: FloatsType, inplace: bool = False) -> FloatsType: return Y def backprop_gelu_approx( - self, dY: FloatsType, X: FloatsType, inplace: bool = False - ) -> FloatsType: - dX = self.alloc_f(X.shape) + self, dY: FloatsXdT, X: FloatsXdT, inplace: bool = False + ) -> FloatsXdT: + dX = cast(FloatsXdT, self.alloc_f(X.shape)) Xp3 = self.xp.power(X, 3) tmp = 0.5 * self.xp.tanh(0.0356774 * Xp3 + 0.797885 * X) tmp += (0.0535161 * Xp3 + 0.398942 * X) * self.sechsq( @@ -1033,27 +1031,27 @@ def backprop_gelu_approx( return dY return dY * dX - def gelu(self, X: FloatsType, inplace: bool = False) -> FloatsType: + def gelu(self, X: FloatsXdT, inplace: bool = False) -> FloatsXdT: # GELU(x) = x · Φ(x) cdf = gaussian_cdf(self, X) if inplace: - X *= cdf # type: ignore[operator, assignment] + X *= cdf return X - return X * cdf # type: ignore[operator, return-value] + return X * cdf def backprop_gelu( - self, dY: FloatsType, X: FloatsType, inplace: bool = False - ) -> FloatsType: + self, dY: FloatsXdT, X: FloatsXdT, inplace: bool = False + ) -> FloatsXdT: # GELU'(x) = Φ(x) + x · PDF(x) - dX = gaussian_cdf(self, X) + X * gaussian_pdf(self, X) # type: ignore[operator] + dX = gaussian_cdf(self, X) + X * gaussian_pdf(self, X) if inplace: dY *= dX return dY return dY * dX def mish( - self, X: FloatsType, threshold: float = 20.0, inplace: bool = False - ) -> FloatsType: + self, X: FloatsXdT, threshold: float = 20.0, inplace: bool = False + ) -> FloatsXdT: tmp = X * self.xp.tanh(self.xp.log(1.0 + self.xp.exp(X))) Y = self.xp.where(X >= threshold, X, tmp) if inplace: @@ -1064,11 +1062,11 @@ def mish( def backprop_mish( self, - dY: FloatsType, + dY: FloatsXdT, X: Floats2d, threshold: float = 20.0, inplace: bool = False, - ) -> FloatsType: + ) -> FloatsXdT: if dY.shape != X.shape: msg = f"arrays have incompatible shapes: {dY.shape} and {X.shape}" raise ValueError(msg) @@ -1614,12 +1612,12 @@ def dtanh(Y: ArrayT) -> ArrayT: return 1 - Y**2 -def gaussian_cdf(ops: Ops, X: FloatsType) -> FloatsType: +def gaussian_cdf(ops: Ops, X: FloatsXdT) -> FloatsXdT: """Gaussian CDF for distribution with mean 0 and stdev 1.""" return 0.5 * (1.0 + ops.erf(INV_SQRT2 * X)) -def gaussian_pdf(ops: Ops, X: FloatsType) -> FloatsType: +def gaussian_pdf(ops: Ops, X: FloatsXdT) -> FloatsXdT: """Gaussian PDF for distribution with mean 0 and stdev 1.""" return INV_SQRT_2PI * ops.xp.exp(-0.5 * X * X) diff --git a/thinc/layers/sigmoid_activation.py b/thinc/layers/sigmoid_activation.py index 8b3982aea..b87261075 100644 --- a/thinc/layers/sigmoid_activation.py +++ b/thinc/layers/sigmoid_activation.py @@ -2,23 +2,23 @@ from ..model import Model from ..config import registry -from ..types import FloatsXd - - -InT = TypeVar("InT", bound=FloatsXd) +from ..types import FloatsXdT @registry.layers("sigmoid_activation.v1") -def sigmoid_activation() -> Model[InT, InT]: +def sigmoid_activation() -> Model[FloatsXdT, FloatsXdT]: return Model("sigmoid_activation", forward) -def forward(model: Model[InT, InT], X: InT, is_train: bool) -> Tuple[InT, Callable]: +def forward( + model: Model[FloatsXdT, FloatsXdT], X: FloatsXdT, is_train: bool +) -> Tuple[FloatsXdT, Callable]: Y = model.ops.sigmoid(X, 
inplace=False) - def backprop(dY: InT) -> InT: + def backprop(dY: FloatsXdT) -> FloatsXdT: return cast( - InT, dY * model.ops.dsigmoid(Y, inplace=False) # type:ignore[operator] + FloatsXdT, + dY * model.ops.dsigmoid(Y, inplace=False), # type:ignore[operator] ) return Y, backprop
diff --git a/thinc/types.py b/thinc/types.py index f44a64e80..69a51bcc7 100644 --- a/thinc/types.py +++ b/thinc/types.py
@@ -46,6 +46,7 @@ ArrayT = TypeVar("ArrayT") SelfT = TypeVar("SelfT") Array1dT = TypeVar("Array1dT", bound="Array1d") +FloatsXdT = TypeVar("FloatsXdT", "Floats1d", "Floats2d", "Floats3d", "Floats4d") # These all behave the same as far as indexing is concerned Slicish = Union[slice, List[int], "ArrayXd"]
From 8e5c743aa91236b73ceb8e037a8b9e175ffbd17a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Dani=C3=ABl=20de=20Kok?= Date: Thu, 28 Jul 2022 14:47:47 +0200 Subject: [PATCH 51/78] Unroll `argmax` in `maxout` for small sizes of `P` (#702)
* Unroll `argmax` in `maxout` for small sizes of `P`
`maxout` uses the `argmax` function to determine the index of the maximum value of each `P` inputs. `argmax` uses a generic array loop, which impedes speculative execution and could also prevent unrolling of the outer `maxout` loop.
This change unrolls `argmax` with small values of `P` using a variadic template. This leads to a small performance improvement.
* Unmodernize struct initialization
--- setup.py | 2 +- thinc/backends/cpu_kernels.hh | 74 ++++++++++++++++++++++++++++++----- 2 files changed, 65 insertions(+), 11 deletions(-)
diff --git a/setup.py b/setup.py index 4e5d056cc..27873beeb 100644 --- a/setup.py +++ b/setup.py
@@ -25,7 +25,7 @@ ] COMPILE_OPTIONS = { "msvc": ["/Ox", "/EHsc"], - "other": ["-O3", "-Wno-strict-prototypes", "-Wno-unused-function"], + "other": ["-O3", "-Wno-strict-prototypes", "-Wno-unused-function", "-std=c++11"], } COMPILER_DIRECTIVES = { "language_level": -3,
diff --git a/thinc/backends/cpu_kernels.hh b/thinc/backends/cpu_kernels.hh index 1c1f5ac8f..95808eee1 100644 --- a/thinc/backends/cpu_kernels.hh +++ b/thinc/backends/cpu_kernels.hh
@@ -27,23 +27,58 @@ struct axpy { // All elementwise functions, such as most activations, work in-place. -template <typename A, typename L> -L argmax(A* arr, L len) + +template <typename T, typename L> +struct argmax_result { + T max; + L max_idx; +}; + +template <typename T, typename L> +argmax_result<T, L> argmax(T const *arr, L len) { - static_assert(std::is_floating_point<A>::value, + static_assert(std::is_floating_point<T>::value, "Array should be floating point"); static_assert(std::is_integral<L>::value, "Array length should be integral"); - L max = 0; + argmax_result<T, L> r { arr[0], 0 }; + for (L i = 1; i < len; ++i) { - if (arr[i] > arr[max]) { - max = i; + if (arr[i] > r.max) { + r.max = arr[i]; + r.max_idx = i; } } - return max; + return r; +} + +// The next two templates define argmax for a fixed number of elements. + +template <typename T> +argmax_result<T, int> argmax(T a) { + static_assert(std::is_floating_point<T>::value, "Argument should be floating point"); + argmax_result<T, int> acc { a, 0 }; + return acc; +} + +template <typename T, typename ...Args> +argmax_result<T, int> argmax(T a, Args... 
args) { + static_assert(std::is_floating_point<T>::value, "Arguments should be floating point"); + + auto acc = argmax(args...); + + if (acc.max > a) { + acc.max_idx += 1; + } else { + acc.max_idx = 0; + acc.max = a; + } + + return acc; } + template <typename A, typename L> void vec_add(A* X, const A* Y, A scale, L N) {
@@ -62,12 +97,31 @@ void cpu_maxout(A* best__bo, L* which__bo, const A* cands__bop, L B, L O, L P) "Array should be floating point"); static_assert(std::is_integral<L>::value, "Array length should be integral"); - for (int i = 0; i < B * O; ++i) { - which__bo[i] = argmax(cands__bop + i * P, P); - best__bo[i] = cands__bop[i * P + which__bo[i]]; + // For small inputs, we use an unrolled argmax. + if (P == 2) { + for (int i = 0; i < B * O; ++i) { + A const *input = cands__bop + i * P; + auto r = argmax(input[0], input[1]); + which__bo[i] = r.max_idx; + best__bo[i] = r.max; + } + } else if (P == 3) { + for (int i = 0; i < B * O; ++i) { + A const *input = cands__bop + i * P; + auto r = argmax(input[0], input[1], input[2]); + which__bo[i] = r.max_idx; + best__bo[i] = r.max; + } + } else { + for (int i = 0; i < B * O; ++i) { + auto r = argmax(cands__bop + i * P, P); + which__bo[i] = r.max_idx; + best__bo[i] = r.max; + } } } + template <typename A, typename L> void cpu_backprop_maxout(A* dX__bop, const A* dX__bo, const L* which__bo, L B, L O, L P)
From 42b73c9ce5a40184d2b3d270feb9d789558aabf8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Dani=C3=ABl=20de=20Kok?= Date: Wed, 3 Aug 2022 13:08:41 +0200 Subject: [PATCH 52/78] Change Docker image tag to thinc-ai (#732)
This is purely a cosmetic change, but less confusing than thinc-io :).
--- website/Dockerfile | 4 ++-- website/README.md | 6 +++--- 2 files changed, 5 insertions(+), 5 deletions(-)
diff --git a/website/Dockerfile b/website/Dockerfile index 747b1f344..b1965b17a 100644 --- a/website/Dockerfile +++ b/website/Dockerfile
@@ -1,6 +1,6 @@ FROM node:11.15.0 -WORKDIR /thinc-io +WORKDIR /thinc-ai RUN npm install -g gatsby-cli@2.7.4
@@ -13,4 +13,4 @@ RUN npm install # from where a user mounts files, so that they don't accidentally mount # their own node_modules from a different build # https://nodejs.org/api/modules.html#modules_loading_from_node_modules_folders -WORKDIR /thinc-io/website/ +WORKDIR /thinc-ai/website/
diff --git a/website/README.md b/website/README.md index 1e3b10ce5..f1c4ec5ba 100644 --- a/website/README.md +++ b/website/README.md
@@ -20,16 +20,16 @@ Rather than installing NPM locally, you can also build a Docker container with the prerequisite dependencies: ```bash -docker build -t thinc-io . +docker build -t thinc-ai . ``` Afterwards, the website can be built and run in the container: ```bash docker run --rm -it \ - -v $PWD:/thinc-io/website \ + -v $PWD:/thinc-ai/website \ -p 8000:8000 \ - thinc-io \ + thinc-ai \ gatsby develop -H 0.0.0.0 ```
From 69a280fc406ff8694797af95f496b1b14066c6fa Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Dani=C3=ABl=20de=20Kok?= Date: Wed, 3 Aug 2022 13:42:05 +0200 Subject: [PATCH 53/78] Add `with_signpost_interval` layer (#711)
* Add with_signpost_interval layer
This layer wraps a layer, adding macOS interval signposts for the forward and backward pass. These intervals can then be visualized in the macOS Instruments.app timeline.
* Fix reference in api-layers.md Co-authored-by: Madeesh Kannan * End message is optional since signpost 0.0.3 * with_signpost_interval: also wrap init callback * docs: we wrap init as well * Add documentation fixes Suggested by @svlandeg. 
Co-authored-by: Madeesh Kannan --- thinc/api.py | 1 + thinc/compat.py | 9 +++++ thinc/layers/__init__.py | 2 ++ thinc/layers/with_signpost_interval.py | 50 ++++++++++++++++++++++++++ website/docs/api-layers.md | 43 ++++++++++++++++++++-- 5 files changed, 103 insertions(+), 2 deletions(-) create mode 100644 thinc/layers/with_signpost_interval.py diff --git a/thinc/api.py b/thinc/api.py index 64ed11703..bdd001587 100644 --- a/thinc/api.py +++ b/thinc/api.py @@ -40,6 +40,7 @@ from .layers import with_reshape, with_getitem, strings2arrays, list2array from .layers import list2ragged, ragged2list, list2padded, padded2list, remap_ids from .layers import array_getitem, with_cpu, with_debug, with_nvtx_range +from .layers import with_signpost_interval from .layers import tuplify from .layers import reduce_first, reduce_last, reduce_max, reduce_mean, reduce_sum diff --git a/thinc/compat.py b/thinc/compat.py index d04ff97c3..2d8b40345 100644 --- a/thinc/compat.py +++ b/thinc/compat.py @@ -78,4 +78,13 @@ h5py = None +try: # pragma: no cover + import os_signpost + + has_os_signpost = True +except ImportError: + os_signpost = None + has_os_signpost = False + + has_gpu = has_cupy_gpu or has_torch_mps_gpu diff --git a/thinc/layers/__init__.py b/thinc/layers/__init__.py index b37e38a7a..4c22208fe 100644 --- a/thinc/layers/__init__.py +++ b/thinc/layers/__init__.py @@ -71,6 +71,7 @@ from .with_getitem import with_getitem from .with_debug import with_debug from .with_nvtx_range import with_nvtx_range +from .with_signpost_interval import with_signpost_interval __all__ = [ @@ -128,5 +129,6 @@ "with_flatten", "with_debug", "with_nvtx_range", + "with_signpost_interval", "remap_ids", ] diff --git a/thinc/layers/with_signpost_interval.py b/thinc/layers/with_signpost_interval.py new file mode 100644 index 000000000..9a468d896 --- /dev/null +++ b/thinc/layers/with_signpost_interval.py @@ -0,0 +1,50 @@ +from typing import Optional, Callable, Any, Tuple, TypeVar + +from ..compat import has_os_signpost, os_signpost +from ..model import Model + + +_ModelT = TypeVar("_ModelT", bound=Model) + + +def with_signpost_interval( + layer: _ModelT, + signposter: "os_signpost.Signposter", + name: Optional[str] = None, +) -> _ModelT: + """Wraps any layer and marks the init, forward and backprop phases using + signpost intervals for macOS Instruments profiling + + By default, the name of the layer is used as the name of the range, + followed by the name of the pass. 
+ """ + if not has_os_signpost: + raise ValueError( + "with_signpost_interval layer requires the 'os_signpost' package" + ) + + name = layer.name if name is None else name + + orig_forward = layer._func + orig_init = layer.init + + def forward(model: Model, X: Any, is_train: bool) -> Tuple[Any, Callable]: + with signposter.use_interval(f"{name} forward"): + layer_Y, layer_callback = orig_forward(model, X, is_train=is_train) + + def backprop(dY: Any) -> Any: + with signposter.use_interval(f"{name} backprop"): + return layer_callback(dY) + + return layer_Y, backprop + + def init(_model: Model, X: Any, Y: Any) -> Model: + if orig_init is not None: + with signposter.use_interval(f"{name} init"): + return orig_init(layer, X, Y) + else: + return layer + + layer.replace_callbacks(forward, init=init) + + return layer diff --git a/website/docs/api-layers.md b/website/docs/api-layers.md index 1c43a9d7a..dc1a2be2e 100644 --- a/website/docs/api-layers.md +++ b/website/docs/api-layers.md @@ -835,8 +835,8 @@ https://github.com/explosion/thinc/blob/master/thinc/layers/reduce_last.py Pooling layer that reduces the dimensions of the data by selecting the maximum -value for each feature. A `ValueError` is raised if any element in `lengths` -is zero. +value for each feature. A `ValueError` is raised if any element in `lengths` is +zero. | Argument | Type | Description | | ----------- | -------------------------------- | -------------------------- | @@ -1531,6 +1531,45 @@ model.initialize() https://github.com/explosion/thinc/blob/master/thinc/layers/with_nvtx_range.py ``` +### with_signpost_interval {#with_signpost_interval tag="function" new="8.1.1"} + + + +- **Input:** Any +- **Output:** Any + + + +Layer that wraps any layer and marks the init, forward and backprop passes as a +(macOS) signpost interval. This can be helpful when profiling the performance of +a layer using macOS +[Instruments.app](https://help.apple.com/instruments/mac/current/). Use of this +layer requires that the +[`os-signpost`](https://github.com/explosion/os-signpost) package is installed. + +```python +### Example +from os_signpost import Signposter +from thinc.api import Linear, with_signpost_interval + +signposter = Signposter("com.example.my_subsystem", + Signposter.Category.DynamicTracing) + +model = with_signpost_interval(Linear(2, 5), signposter) +model.initialize() +``` + +| Argument | Type | Description | +| ------------ | --------------------------------- | ------------------------------------------------------------------------------- | +| `layer` | Model | The layer to wrap. | +| `signposter` | os_signposter.Signposter | `Signposter` object to log the interval with. | +| `name` | Optional[str] | Optional name for the wrapped layer. Defaults to the name of the wrapped layer. | +| **RETURNS** | Model | The wrapped layer. | + +```python +https://github.com/explosion/thinc/blob/master/thinc/layers/with_signpost_interval.py +``` + --- ## Wrappers {#wrappers} From 18468551cec636676a56a9f34fd1861c902296f4 Mon Sep 17 00:00:00 2001 From: Madeesh Kannan Date: Thu, 4 Aug 2022 13:11:07 +0200 Subject: [PATCH 54/78] Docs: Fix/update `label_smoothing` description, run prettier (#733) --- website/docs/api-loss.md | 9 ++++----- website/docs/api-util.md | 12 +++++++----- 2 files changed, 11 insertions(+), 10 deletions(-) diff --git a/website/docs/api-loss.md b/website/docs/api-loss.md index 3720a78f5..5ccf84b58 100644 --- a/website/docs/api-loss.md +++ b/website/docs/api-loss.md @@ -70,9 +70,8 @@ are expected to be in the range of `[0, 1]`. 
They can both represent exclusive classes from multi-class cross-entropy (generally coming from a `softmax` layer) or could be classwise binary decisions for multi-label cross-entropy (`sigmoid` layer). The `truths` are most commonly provided as labels in `Ints1d`, -`List[int]` or `List[str]` format. -Alternatively, users can provide `truths` as a `Floats2d` for -example to encode label-confidences. +`List[int]` or `List[str]` format. Alternatively, users can provide `truths` as +a `Floats2d` for example to encode label-confidences. @@ -98,7 +97,7 @@ normalize = true | `normalize` | bool | Normalize and divide by number of examples given. | | `neg_prefix` | str | Prefix used to indicate that a label is negative e.g. "!sci-fi". | | `missing_value` | Union[str, int] | Specific label that indicates the value is missing and should not be considered for training/evaluation purposes, e.g. empty string `""` or `0`. | -| `label_smoothing` | float | Smoothing-coefficient for label-smoothing in range of [0, 0.5[. | +| `label_smoothing` | float | Smoothing-coefficient for label-smoothing. | ### SequenceCategoricalCrossentropy {#sequence_categorical_crossentropy tag="class"} @@ -138,7 +137,7 @@ normalize = true | `normalize` | bool | Normalize and divide by number of examples given. | | `neg_prefix` | str | Symbol that indicates that a label is negative e.g. "!sci-fi". | | `missing_value` | Union[str, int] | Symbol for "missing value" among the labels. | -| `label_smoothing` | float | Smoothing-coefficient for label-smoothing in range of [0, 0.5]. | +| `label_smoothing` | float | Smoothing-coefficient for label-smoothing. | ### L2Distance {#l2distance tag="class"} diff --git a/website/docs/api-util.md b/website/docs/api-util.md index 158da5446..add7c12e1 100644 --- a/website/docs/api-util.md +++ b/website/docs/api-util.md @@ -133,11 +133,13 @@ element). Converts a class vector (integers) to binary class matrix. Based on [`keras.utils.to_categorical`](https://keras.io/utils/). -| Argument | Type | Description | -| ----------- | ---------------------- | ---------------------------------------------------------------------------------------------- | -| `Y` | IntsXd | Class vector to be converted into a matrix (integers from `0` to `n_classes`). | -| `n_classes` | Optional[int] | Total number of classes. | -| **RETURNS** |  Floats2d | A binary matrix representation of the input. The axis representing the classes is placed last. | +| Argument | Type | Description | +| ----------------- | ---------------------- | ---------------------------------------------------------------------------------------------- | +| `Y` | IntsXd | Class vector to be converted into a matrix (integers from `0` to `n_classes`). | +| `n_classes` | Optional[int] | Total number of classes. | +| _keyword-only_ | | | +| `label_smoothing` | float | Smoothing-coefficient for label-smoothing. | +| **RETURNS** | Floats2d | A binary matrix representation of the input. The axis representing the classes is placed last. | ### xp2torch {#xp2torch tag="function"} From af0e3de47968af09d248dcfb379348b1ff1239a8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Dani=C3=ABl=20de=20Kok?= Date: Thu, 4 Aug 2022 13:11:21 +0200 Subject: [PATCH 55/78] Add Dish activation (#719) * Add Ops.(backprop_)dish and CUDA kernel Dish is a Swish/GELU-like activation function. 
Since it does not rely on elementary operations like `exp` or `erf`, it can generally be computed faster than Swish and GELU: https://twitter.com/danieldekok/status/1484898130441166853
* Make mypy happy Apparently, X * X does not typecheck (?!?). * test_compare_activations_to_torch: test with different dY Also fix the backprop_dish CUDA kernel, which would fail now (thanks @shadeMe). * test_compare_activations_to_torch: be slightly more (absolute) tolerant Or the Dish test would fail (possibly different accuracies for sqrt?). * doc fix * Update dish types to use `FloatsXdT` * docs: add version tag to `(backprop_)dish` * Add Dish Thinc layer * Add Dish layer docs Also update description as suggested by @kadarakos. * Fix dish description Co-authored-by: Madeesh Kannan Co-authored-by: Madeesh Kannan
--- thinc/api.py | 2 +- thinc/backends/_custom_kernels.cu | 31 +++++++++++++ thinc/backends/_custom_kernels.py | 48 +++++++++++++++++++ thinc/backends/cupy_ops.py | 12 +++++ thinc/backends/ops.py | 29 ++++++++++++ thinc/layers/__init__.py | 1 + thinc/layers/dish.py | 66 +++++++++++++++++++++++++++ thinc/tests/backends/test_ops.py | 29 +++++++++--- thinc/tests/layers/test_layers_api.py | 2 + website/docs/api-backends.md | 41 +++++++++++++++++ website/docs/api-layers.md | 33 ++++++++++++++ 11 files changed, 286 insertions(+), 8 deletions(-) create mode 100644 thinc/layers/dish.py
diff --git a/thinc/api.py b/thinc/api.py index bdd001587..8c5807347 100644 --- a/thinc/api.py +++ b/thinc/api.py
@@ -27,7 +27,7 @@ from .layers import CauchySimilarity, ParametricAttention, Logistic from .layers import resizable, sigmoid_activation, Sigmoid, SparseLinear from .layers import ClippedLinear, ReluK, HardTanh, HardSigmoid -from .layers import HardSwish, HardSwishMobilenet, Swish, Gelu +from .layers import Dish, HardSwish, HardSwishMobilenet, Swish, Gelu from .layers import PyTorchWrapper, PyTorchRNNWrapper, PyTorchLSTM from .layers import TensorFlowWrapper, keras_subclass, MXNetWrapper from .layers import PyTorchWrapper_v2, Softmax_v2
diff --git a/thinc/backends/_custom_kernels.cu b/thinc/backends/_custom_kernels.cu index a0017c0e4..9c9fece1e 100644 --- a/thinc/backends/_custom_kernels.cu +++ b/thinc/backends/_custom_kernels.cu
@@ -161,6 +161,20 @@ __global__ void clipped_linear(T* Y, const T* X, double slope, double offset, do } +template <typename T> +__global__ void dish(T* Y, const T* X, int N) +{ + int _loop_start = blockIdx.x * blockDim.x + threadIdx.x; + int _loop_stride = blockDim.x * gridDim.x; + + for (int i = _loop_start; i < N; i += _loop_stride) + { + T x = X[i]; + Y[i] = 0.5 * x * (x / sqrt(1 + x * x) + 1); + } +} + + template <typename T> __global__ void gelu(T* Y, const T* X, double threshold, int N) {
@@ -414,6 +428,23 @@ __global__ void backprop_hard_swish_mobilenet(T* dX, const T* dY, const T* X, in } +template <typename T> +__global__ void backprop_dish(T* dX, const T* dY, const T* X, int N) +{ + + int _loop_start = blockIdx.x * blockDim.x + threadIdx.x; + int _loop_stride = blockDim.x * gridDim.x; + + for (int i = _loop_start; i < N; i += _loop_stride) + { + T x = X[i]; + T x_sq = x * x; + T x_sq_plus_one = x_sq + 1.0; + dX[i] = dY[i] * (x/sqrt(x_sq_plus_one) - (0.5 * x * x_sq) + / pow(x_sq_plus_one, static_cast<T>(1.5)) + 0.5); + } +} + template <typename T> __global__ void backprop_gelu(T* dX, const T* dY, const T* X,
diff --git a/thinc/backends/_custom_kernels.py b/thinc/backends/_custom_kernels.py index d2d1ea133..859405495 100644 --- a/thinc/backends/_custom_kernels.py +++ b/thinc/backends/_custom_kernels.py
@@ -10,6 +10,8 @@ KERNELS_LIST = 
[ "backprop_clipped_linear<double>", "backprop_clipped_linear<float>", + "backprop_dish<double>", + "backprop_dish<float>", "backprop_gelu<double>", "backprop_gelu<float>", "backprop_hard_swish<double>", "backprop_hard_swish<float>",
@@ -32,6 +34,8 @@ "backprop_swish<float>", "clipped_linear<double>", "clipped_linear<float>", + "dish<double>", + "dish<float>", "gather_add<double>", "gather_add<float>", "gelu<double>",
@@ -78,6 +82,8 @@ def compile_mmh(src): clipped_linear_kernel_float = _get_kernel("clipped_linear<float>") clipped_linear_kernel_double = _get_kernel("clipped_linear<double>") +dish_kernel_float = _get_kernel("dish<float>") +dish_kernel_double = _get_kernel("dish<double>") gather_add_kernel_float = _get_kernel("gather_add<float>") gather_add_kernel_double = _get_kernel("gather_add<double>") gelu_kernel_float = _get_kernel("gelu<float>")
@@ -98,6 +104,8 @@ backprop_clipped_linear_kernel_double = _get_kernel("backprop_clipped_linear<double>") backprop_clipped_linear_kernel_float = _get_kernel("backprop_clipped_linear<float>") +backprop_dish_kernel_double = _get_kernel("backprop_dish<double>") +backprop_dish_kernel_float = _get_kernel("backprop_dish<float>") backprop_gelu_kernel_double = _get_kernel("backprop_gelu<double>") backprop_gelu_kernel_float = _get_kernel("backprop_gelu<float>") backprop_hard_swish_kernel_double = _get_kernel("backprop_hard_swish<double>")
@@ -199,6 +207,19 @@ def gather_add(table, indices, *, threads_per_block=128, num_blocks=128): return out +def dish(X, *, inplace=False, threads_per_block=128, num_blocks=128): + _is_float_array(X) + + out = X + if not inplace: + out = _alloc_like(X, zeros=False) + if X.dtype == "float32": + dish_kernel_float((num_blocks,), (threads_per_block,), (out, X, X.size)) + else: + dish_kernel_double((num_blocks,), (threads_per_block,), (out, X, X.size)) + return out + + def gelu(X, *, inplace=False, threshold=6.0, threads_per_block=128, num_blocks=128): _is_float_array(X)
@@ -483,6 +504,33 @@ def backprop_hard_swish_mobilenet( return out +def backprop_dish( + dY, + X, + *, + inplace: bool = False, + threads_per_block=128, + num_blocks=128, +): + _is_float_array(dY) + _is_float_array(X, shape=dY.shape) + + out = dY + if not inplace: + out = _alloc_like(dY, zeros=False) + + if dY.dtype == "float32": + backprop_dish_kernel_float( + (num_blocks,), (threads_per_block,), (out, dY, X, out.size) + ) + else: + backprop_dish_kernel_double( + (num_blocks,), (threads_per_block,), (out, dY, X, out.size) + ) + + return out + + def backprop_gelu( dY, X,
diff --git a/thinc/backends/cupy_ops.py b/thinc/backends/cupy_ops.py index 924bfe955..6d263c155 100644 --- a/thinc/backends/cupy_ops.py +++ b/thinc/backends/cupy_ops.py
@@ -36,6 +36,18 @@ def gather_add(self, table, indices): else: return super().gather_add(table, indices) + def dish(self, X, inplace=False): + if X.dtype in ("float32", "float64"): + return _custom_kernels.dish(X, inplace=inplace) + else: + return super().dish(X, inplace=inplace) + + def backprop_dish(self, dY, X, inplace=False): + if X.dtype == dY.dtype and X.dtype in ("float32", "float64"): + return _custom_kernels.backprop_dish(dY, X, inplace=inplace) + else: + return super().backprop_dish(dY, X, inplace=inplace) + def gelu(self, X, inplace=False): if X.dtype in ("float32", "float64"): return _custom_kernels.gelu(X, inplace=inplace, threshold=6.0)
diff --git a/thinc/backends/ops.py b/thinc/backends/ops.py index c9fb10aae..f0c05de42 100644 --- a/thinc/backends/ops.py +++ b/thinc/backends/ops.py
@@ -976,6 +976,35 @@ def backprop_hard_swish_mobilenet( return dY return dX * dY + def dish(self, X: FloatsXdT, inplace: bool = False) -> FloatsXdT: + tmp = self.xp.square(X) + tmp += 1.0 + self.xp.sqrt(tmp, out=tmp) + tmp = X / tmp + tmp += 1 + tmp *= 
0.5 + if inplace: + X *= tmp + return X + else: + return X * tmp + + def backprop_dish( + self, dY: FloatsXdT, X: FloatsXdT, inplace: bool = False + ) -> FloatsXdT: + x_sq = self.xp.square(X) + x_sq_plus_one = x_sq + 1.0 + deriv = X / self.xp.sqrt(x_sq_plus_one) + second = 0.5 * X * x_sq + second /= x_sq_plus_one**1.5 + deriv -= second + deriv += 0.5 + if inplace: + dY *= deriv + return dY + else: + return dY * deriv + # Code snippet taken from: # https://www.johndcook.com/blog/2009/01/19/stand-alone-error-function-erf/ def erf(self, X: FloatsXdT) -> FloatsXdT: diff --git a/thinc/layers/__init__.py b/thinc/layers/__init__.py index 4c22208fe..19fee1329 100644 --- a/thinc/layers/__init__.py +++ b/thinc/layers/__init__.py @@ -1,5 +1,6 @@ # Weights layers from .cauchysimilarity import CauchySimilarity +from .dish import Dish from .dropout import Dropout from .embed import Embed from .expand_window import expand_window diff --git a/thinc/layers/dish.py b/thinc/layers/dish.py new file mode 100644 index 000000000..b085946b3 --- /dev/null +++ b/thinc/layers/dish.py @@ -0,0 +1,66 @@ +from typing import Tuple, Optional, Callable, cast + +from ..config import registry +from ..model import Model +from .chain import chain +from .layernorm import LayerNorm +from .dropout import Dropout +from ..types import Floats1d, Floats2d +from ..util import partial, get_width +from ..initializers import he_normal_init, zero_init + + +@registry.layers("Dish.v1") +def Dish( + nO: Optional[int] = None, + nI: Optional[int] = None, + *, + init_W: Callable = he_normal_init, + init_b: Callable = zero_init, + dropout: Optional[float] = None, + normalize: bool = False, +) -> Model[Floats2d, Floats2d]: + model: Model[Floats2d, Floats2d] = Model( + "dish", + forward, + init=partial(init, init_W, init_b), + dims={"nO": nO, "nI": nI}, + params={"W": None, "b": None}, + ) + if normalize: + model = chain(model, LayerNorm(nI=nO)) + if dropout is not None: + model = chain(model, cast(Model[Floats2d, Floats2d], Dropout(dropout))) + return model + + +def forward( + model: Model[Floats2d, Floats2d], X: Floats2d, is_train: bool +) -> Tuple[Floats2d, Callable]: + W = cast(Floats2d, model.get_param("W")) + b = cast(Floats1d, model.get_param("b")) + Y_preact = model.ops.affine(X, W, b) + Y = model.ops.dish(Y_preact) + + def backprop(dY: Floats2d) -> Floats2d: + dY = model.ops.backprop_dish(dY, X, inplace=False) + model.inc_grad("b", dY.sum(axis=0)) + model.inc_grad("W", model.ops.gemm(dY, X, trans1=True)) + return model.ops.gemm(dY, W) + + return Y, backprop + + +def init( + init_W: Callable, + init_b: Callable, + model: Model[Floats2d, Floats2d], + X: Optional[Floats2d] = None, + Y: Optional[Floats2d] = None, +) -> None: + if X is not None: + model.set_dim("nI", get_width(X)) + if Y is not None: + model.set_dim("nO", get_width(Y)) + model.set_param("W", init_W(model.ops, (model.get_dim("nO"), model.get_dim("nI")))) + model.set_param("b", init_b(model.ops, (model.get_dim("nO"),))) diff --git a/thinc/tests/backends/test_ops.py b/thinc/tests/backends/test_ops.py index 7de864623..7a9bb1961 100644 --- a/thinc/tests/backends/test_ops.py +++ b/thinc/tests/backends/test_ops.py @@ -64,6 +64,9 @@ def torch_hard_swish_mobilenet(x): def torch_sigmoid(x): return torch.sigmoid(x) + def torch_dish(x): + return 0.5 * x * (x / (1 + x * x).sqrt() + 1) + # https://github.com/huggingface/transformers/blob/master/src/transformers/activations.py#L37 def torch_gelu_approx(x): return ( @@ -89,6 +92,7 @@ def torch_gelu(x): ("swish", torch_swish), ("hard_swish", 
torch_hard_swish), ("hard_swish_mobilenet", torch_hard_swish_mobilenet), + ("dish", torch_dish), ("gelu_approx", torch_gelu_approx), ("gelu", torch_gelu), ("sigmoid", torch_sigmoid), @@ -1043,6 +1047,7 @@ def test_mish(ops, X): "op", [ "backprop_clipped_linear", + "backprop_dish", "backprop_gelu", "backprop_gelu_approx", "backprop_hard_sigmoid", @@ -1160,6 +1165,16 @@ def test_gelu_approx(ops, X): assert not ops.xp.isnan(Y).any() +@pytest.mark.parametrize("ops", ALL_OPS) +@settings(max_examples=MAX_EXAMPLES, deadline=None) +@given(X=strategies.arrays_BI()) +def test_dish(ops, X): + X = ops.asarray(X) + Y = ops.dish(X) + assert Y.shape == X.shape + assert not ops.xp.isnan(Y).any() + + @pytest.mark.parametrize("ops", ALL_OPS) @settings(max_examples=MAX_EXAMPLES, deadline=None) @given(X=strategies.arrays_BI()) @@ -1350,8 +1365,8 @@ def test_ngrams(): @pytest.mark.parametrize("dtype", ["float32", "float64"]) @pytest.mark.parametrize("torch_func", TORCH_FUNCS) @settings(max_examples=MAX_EXAMPLES, deadline=None) -@given(x=strategies.floats(min_value=-30, max_value=30)) -def test_compare_activations_to_torch(ops, dtype, x, torch_func): +@given(x=strategies.floats(min_value=-30, max_value=30), dY=strategies.floats(min_value=-1, max_value=1)) +def test_compare_activations_to_torch(ops, dtype, x, dY, torch_func): import torch func_name, pytorch_func = torch_func @@ -1369,9 +1384,9 @@ def test_compare_activations_to_torch(ops, dtype, x, torch_func): y_think_inplace = forward(x_thinc, inplace=True) assert y_think_inplace is x_thinc assert ops.xp.isclose(y_thinc, y_think_inplace, atol=1e-06) - assert ops.xp.isclose(y_thinc, y.detach(), atol=1e-06) + assert ops.xp.isclose(y_thinc, y.detach(), atol=1e-05) x_thinc = ops.asarray([x], dtype=dtype) - dY_thinc = ops.asarray([1.0], dtype=dtype) + dY_thinc = ops.asarray([dY], dtype=dtype) dY_thinc_inplace = dY_thinc.copy() s = inspect.signature(backward) @@ -1386,7 +1401,7 @@ def test_compare_activations_to_torch(ops, dtype, x, torch_func): ) assert dx_thinc_inplace is dY_thinc_inplace assert ops.xp.isclose(dx_thinc, dx_thinc_inplace) - assert ops.xp.isclose(x_torch.grad.item(), float(dx_thinc), atol=1e-06) + assert ops.xp.isclose(x_torch.grad.item() * dY, float(dx_thinc), atol=1e-06) elif params == {"Y", "dY"}: dx_thinc = backward(dY_thinc, Y=y_thinc) assert dx_thinc.dtype == x_thinc.dtype @@ -1394,7 +1409,7 @@ def test_compare_activations_to_torch(ops, dtype, x, torch_func): dx_thinc, backward(dY=dY_thinc_inplace, Y=y_thinc, inplace=True), ) - assert ops.xp.isclose(x_torch.grad.item(), float(dx_thinc), atol=1e-06) + assert ops.xp.isclose(x_torch.grad.item() * dY, float(dx_thinc), atol=1e-06) elif params == {"dY", "X"}: dx_thinc = backward(dY_thinc, X=x_thinc) assert dx_thinc.dtype == x_thinc.dtype @@ -1402,7 +1417,7 @@ def test_compare_activations_to_torch(ops, dtype, x, torch_func): dx_thinc, backward(dY=dY_thinc_inplace, X=x_thinc, inplace=True) ) assert ops.xp.isclose( - x_torch.grad.item(), float(backward(dY_thinc, X=x_thinc)), atol=1e-06 + x_torch.grad.item() * dY, float(backward(dY_thinc, X=x_thinc)), atol=1e-06 ) else: raise NotImplementedError( diff --git a/thinc/tests/layers/test_layers_api.py b/thinc/tests/layers/test_layers_api.py index 5c922f074..3ebeb470a 100644 --- a/thinc/tests/layers/test_layers_api.py +++ b/thinc/tests/layers/test_layers_api.py @@ -57,6 +57,8 @@ def assert_data_match(Y, out_data): TEST_CASES_SUMMABLE = [ # Array to array + ("Dish.v1", {}, array2d, array2d), + ("Dish.v1", {"nO": 4, "nI": 4}, array2d, array2d), ("Dropout.v1", 
{}, array2d, array2d), ("LayerNorm.v1", {}, array2d, array2d), ("Linear.v1", {}, array2d, array2d),
diff --git a/website/docs/api-backends.md b/website/docs/api-backends.md index f2cdb03ce..c5a54cff8 100644 --- a/website/docs/api-backends.md +++ b/website/docs/api-backends.md
@@ -927,6 +927,47 @@ Backpropagate the Swish activation | `inplace` | bool | If `True`, the `dY` array is modified in place. | | **RETURNS** | FloatsXd | The gradient of the input. | +### Ops.dish {#dish tag="method" new="8.1.1"} + + +- **default:** +- **numpy:** +- **cupy:** + + +Dish or "Daniël's Swish-like activation" is an activation function with a non-monotonic shape similar to +[GELU](#gelu), [Swish](#swish) and [Mish](#mish). However, Dish does not rely on +elementary functions like `exp` or `erf`, making it much +[faster to compute](https://twitter.com/danieldekok/status/1484898130441166853) +in most cases. + +| Argument | Type | Description | +| ----------- | ----------------- | ------------------------------------------ | +| `X` | FloatsXd | The inputs. | +| `inplace` | bool | If `True`, the array is modified in place. | +| **RETURNS** | FloatsXd | The outputs. | + +### Ops.backprop_dish {#backprop_dish tag="method" new="8.1.1"} + + +- **default:** +- **numpy:** +- **cupy:** + + +Backpropagate the Dish activation. + +| Argument | Type | Description | +| ----------- | ----------------- | ----------------------------------------------- | +| `dY` | FloatsXd | Gradients of the output array. | +| `X` | FloatsXd | The inputs to the forward pass. | +| `inplace` | bool | If `True`, the `dY` array is modified in place. | +| **RETURNS** | FloatsXd | The gradient of the input. | + ### Ops.gelu {#gelu tag="method"}
diff --git a/website/docs/api-layers.md b/website/docs/api-layers.md index dc1a2be2e..b1e72f7d7 100644 --- a/website/docs/api-layers.md +++ b/website/docs/api-layers.md
@@ -44,6 +44,39 @@ Primarily used within [`siamese`](#siamese) neural networks. https://github.com/explosion/thinc/blob/master/thinc/layers/cauchysimilarity.py ``` +### Dish {#dish tag="function"} + + +- **Input:** Floats2d +- **Output:** Floats2d +- **Parameters:** W, + b + + +A dense layer with the Dish activation function. Dish or "Daniël's Swish-like +activation" is an activation function with a non-monotonic shape similar to +[GELU](#gelu), [Swish](#swish) and [Mish](#mish). However, Dish does not rely on +elementary functions like `exp` or `erf`, making it much +[faster to compute](https://twitter.com/danieldekok/status/1484898130441166853) +in most cases. + +| Argument | Type | Description | +| -------------- | ---------------------------------- | ------------------------------------------------------------------------------------------------------------------ | +| `nO` | Optional[int] | The size of the output vectors. | +| `nI` | Optional[int] | The size of the input vectors. | +| _keyword-only_ | | | +| `init_W` | Callable | A function to initialize the weights matrix. Defaults to [`he_normal_init`](/docs/api-initializers#he_normal_init) | +| `init_b` | Callable | A function to initialize the bias vector. Defaults to [`zero_init`](/docs/api-initializers#zero_init). | +| `dropout` | Optional[float] | Dropout rate to avoid overfitting. | +| `normalize` | bool | Whether or not to apply [layer normalization](#layernorm). Defaults to `False`. | +| **RETURNS** | Model[Floats2d, Floats2d] | The created dense layer. 
| + +```python +https://github.com/explosion/thinc/blob/master/thinc/layers/dish.py +``` + ### Dropout {#dropout tag="function"} From 7fcdd0f6d196dd997b162d7350d7e01418bec5c5 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" <41898282+github-actions[bot]@users.noreply.github.com> Date: Fri, 5 Aug 2022 10:40:54 +0200 Subject: [PATCH 56/78] Auto-format code with black (#737) Co-authored-by: explosion-bot --- thinc/tests/backends/test_ops.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/thinc/tests/backends/test_ops.py b/thinc/tests/backends/test_ops.py index 7a9bb1961..04ab7d231 100644 --- a/thinc/tests/backends/test_ops.py +++ b/thinc/tests/backends/test_ops.py @@ -1365,7 +1365,10 @@ def test_ngrams(): @pytest.mark.parametrize("dtype", ["float32", "float64"]) @pytest.mark.parametrize("torch_func", TORCH_FUNCS) @settings(max_examples=MAX_EXAMPLES, deadline=None) -@given(x=strategies.floats(min_value=-30, max_value=30), dY=strategies.floats(min_value=-1, max_value=1)) +@given( + x=strategies.floats(min_value=-30, max_value=30), + dY=strategies.floats(min_value=-1, max_value=1), +) def test_compare_activations_to_torch(ops, dtype, x, dY, torch_func): import torch From d95b5fc3cfa0f8b2a3605c7d83b52d27e3ec4a82 Mon Sep 17 00:00:00 2001 From: Madeesh Kannan Date: Fri, 5 Aug 2022 10:41:26 +0200 Subject: [PATCH 57/78] Increment `blis` version upper-bound to `0.10.0` (#736) --- pyproject.toml | 2 +- requirements.txt | 2 +- setup.cfg | 4 ++-- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index d3fb69b76..6fcca505e 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -5,7 +5,7 @@ requires = [ "murmurhash>=1.0.2,<1.1.0", "cymem>=2.0.2,<2.1.0", "preshed>=3.0.2,<3.1.0", - "blis>=0.7.8,<0.8.0", + "blis>=0.7.8,<0.10.0", "numpy>=1.15.0", ] build-backend = "setuptools.build_meta" diff --git a/requirements.txt b/requirements.txt index 05d77e403..a1378a3fc 100644 --- a/requirements.txt +++ b/requirements.txt @@ -2,7 +2,7 @@ murmurhash>=1.0.2,<1.1.0 cymem>=2.0.2,<2.1.0 preshed>=3.0.2,<3.1.0 -blis>=0.7.8,<0.8.0 +blis>=0.7.8,<0.10.0 srsly>=2.4.0,<3.0.0 wasabi>=0.8.1,<1.1.0 catalogue>=2.0.4,<2.1.0 diff --git a/setup.cfg b/setup.cfg index 0da060213..198823d14 100644 --- a/setup.cfg +++ b/setup.cfg @@ -35,10 +35,10 @@ setup_requires = cymem>=2.0.2,<2.1.0 preshed>=3.0.2,<3.1.0 murmurhash>=1.0.2,<1.1.0 - blis>=0.7.8,<0.8.0 + blis>=0.7.8,<0.10.0 install_requires = # Explosion-provided dependencies - blis>=0.7.8,<0.8.0 + blis>=0.7.8,<0.10.0 murmurhash>=1.0.2,<1.1.0 cymem>=2.0.2,<2.1.0 preshed>=3.0.2,<3.1.0 From 01eb6b7808af308983bfd71fd456710e50f6e08d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Dani=C3=ABl=20de=20Kok?= Date: Fri, 5 Aug 2022 13:46:12 +0200 Subject: [PATCH 58/78] asarrayDf: take `Sequence[float]`, not `Sequence[int]` (#739) --- thinc/backends/ops.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/thinc/backends/ops.py b/thinc/backends/ops.py index f0c05de42..29b4eca4f 100644 --- a/thinc/backends/ops.py +++ b/thinc/backends/ops.py @@ -626,7 +626,7 @@ def reshape(self, array: ArrayT, shape: Shape) -> ArrayT: def asarray4f( self, - data: Union[Floats4d, Sequence[int]], + data: Union[Floats4d, Sequence[float]], *, dtype: Optional[DTypes] = "float32", ) -> Floats4d: @@ -634,7 +634,7 @@ def asarray4f( def asarray3f( self, - data: Union[Floats3d, Sequence[int]], + data: Union[Floats3d, Sequence[float]], *, dtype: Optional[DTypes] = "float32", ) -> Floats3d: @@ -642,7 +642,7 @@ def asarray3f( def asarray2f( self, - 
data: Union[Floats2d, Sequence[int]], + data: Union[Floats2d, Sequence[float]], *, dtype: Optional[DTypes] = "float32", ) -> Floats2d: @@ -650,7 +650,7 @@ def asarray2f( def asarray1f( self, - data: Union[Floats1d, Sequence[int]], + data: Union[Floats1d, Sequence[float]], *, dtype: Optional[DTypes] = "float32", ) -> Floats1d: From a43635ea1eea414e2f7678b1e5c4f911f39af93e Mon Sep 17 00:00:00 2001 From: Raphael Mitsch Date: Fri, 26 Aug 2022 16:23:13 +0200 Subject: [PATCH 59/78] Use confection for configurations (#745) * Remove redundant tests. Add confection to requirement.txt and setup.cfg. Adjust cnfig.py. * Add reference to confection in website/docs/usage-config.md. * Update confection reference in docs. * Extend imports fro confection for backwards compatibility. --- requirements.txt | 1 + setup.cfg | 1 + thinc/config.py | 1038 +--------------------------- thinc/tests/test_config.py | 1240 ---------------------------------- website/docs/usage-config.md | 35 +- 5 files changed, 23 insertions(+), 2292 deletions(-) diff --git a/requirements.txt b/requirements.txt index a1378a3fc..6a04190da 100644 --- a/requirements.txt +++ b/requirements.txt @@ -34,3 +34,4 @@ nbformat>=5.0.4,<5.2.0 # Test to_disk/from_disk against pathlib.Path subclasses pathy>=0.3.5 black>=22.0,<23.0 +confection>=0.0.1,<1.0.0 \ No newline at end of file diff --git a/setup.cfg b/setup.cfg index 198823d14..2c147b995 100644 --- a/setup.cfg +++ b/setup.cfg @@ -45,6 +45,7 @@ install_requires = wasabi>=0.8.1,<1.1.0 srsly>=2.4.0,<3.0.0 catalogue>=2.0.4,<2.1.0 + confection>=0.0.1,<1.0.0 # Third-party dependencies setuptools numpy>=1.15.0 diff --git a/thinc/config.py b/thinc/config.py index 837f91b76..8c0e752c5 100644 --- a/thinc/config.py +++ b/thinc/config.py @@ -1,701 +1,10 @@ -from typing import Union, Dict, Any, Optional, List, Tuple, Callable, Type, Mapping -from typing import Iterable, Sequence, cast -from types import GeneratorType -from dataclasses import dataclass -from configparser import ConfigParser, ExtendedInterpolation, MAX_INTERPOLATION_DEPTH -from configparser import InterpolationMissingOptionError, InterpolationSyntaxError -from configparser import NoSectionError, NoOptionError, InterpolationDepthError -from configparser import ParsingError -from pathlib import Path -from pydantic import BaseModel, create_model, ValidationError, Extra -from pydantic.main import ModelMetaclass -from pydantic.fields import ModelField -from wasabi import table -import srsly import catalogue -import inspect -import io -import numpy -import copy -import re - +import confection +from confection import Config, ConfigValidationError, Promise, VARIABLE_RE from .types import Decorator -# Field used for positional arguments, e.g. [section.*.xyz]. The alias is -# required for the schema (shouldn't clash with user-defined arg names) -ARGS_FIELD = "*" -ARGS_FIELD_ALIAS = "VARIABLE_POSITIONAL_ARGS" -# Aliases for fields that would otherwise shadow pydantic attributes. Can be any -# string, so we're using name + space so it looks the same in error messages etc. -RESERVED_FIELDS = {"validate": "validate\u0020"} -# Internal prefix used to mark section references for custom interpolation -SECTION_PREFIX = "__SECTION__:" -# Values that shouldn't be loaded during interpolation because it'd cause -# even explicit string values to be incorrectly parsed as bools/None etc. 
-JSON_EXCEPTIONS = ("true", "false", "null") -# Regex to detect whether a value contains a variable -VARIABLE_RE = re.compile(r"\$\{[\w\.:]+\}") - - -class CustomInterpolation(ExtendedInterpolation): - def before_read(self, parser, section, option, value): - # If we're dealing with a quoted string as the interpolation value, - # make sure we load and unquote it so we don't end up with '"value"' - try: - json_value = srsly.json_loads(value) - if isinstance(json_value, str) and json_value not in JSON_EXCEPTIONS: - value = json_value - except Exception: - pass - return super().before_read(parser, section, option, value) - - def before_get(self, parser, section, option, value, defaults): - # Mostly copy-pasted from the built-in configparser implementation. - L = [] - self.interpolate(parser, option, L, value, section, defaults, 1) - return "".join(L) - - def interpolate(self, parser, option, accum, rest, section, map, depth): - # Mostly copy-pasted from the built-in configparser implementation. - # We need to overwrite this method so we can add special handling for - # block references :( All values produced here should be strings – - # we need to wait until the whole config is interpreted anyways so - # filling in incomplete values here is pointless. All we need is the - # section reference so we can fetch it later. - rawval = parser.get(section, option, raw=True, fallback=rest) - if depth > MAX_INTERPOLATION_DEPTH: - raise InterpolationDepthError(option, section, rawval) - while rest: - p = rest.find("$") - if p < 0: - accum.append(rest) - return - if p > 0: - accum.append(rest[:p]) - rest = rest[p:] - # p is no longer used - c = rest[1:2] - if c == "$": - accum.append("$") - rest = rest[2:] - elif c == "{": - # We want to treat both ${a:b} and ${a.b} the same - m = self._KEYCRE.match(rest) - if m is None: - err = f"bad interpolation variable reference {rest}" - raise InterpolationSyntaxError(option, section, err) - orig_var = m.group(1) - path = orig_var.replace(":", ".").rsplit(".", 1) - rest = rest[m.end() :] - sect = section - opt = option - try: - if len(path) == 1: - opt = parser.optionxform(path[0]) - if opt in map: - v = map[opt] - else: - # We have block reference, store it as a special key - section_name = parser[parser.optionxform(path[0])]._name - v = self._get_section_name(section_name) - elif len(path) == 2: - sect = path[0] - opt = parser.optionxform(path[1]) - fallback = "__FALLBACK__" - v = parser.get(sect, opt, raw=True, fallback=fallback) - # If a variable doesn't exist, try again and treat the - # reference as a section - if v == fallback: - v = self._get_section_name(parser[f"{sect}.{opt}"]._name) - else: - err = f"More than one ':' found: {rest}" - raise InterpolationSyntaxError(option, section, err) - except (KeyError, NoSectionError, NoOptionError): - raise InterpolationMissingOptionError( - option, section, rawval, orig_var - ) from None - if "$" in v: - new_map = dict(parser.items(sect, raw=True)) - self.interpolate(parser, opt, accum, v, sect, new_map, depth + 1) - else: - accum.append(v) - else: - err = "'$' must be followed by '$' or '{', " "found: %r" % (rest,) - raise InterpolationSyntaxError(option, section, err) - - def _get_section_name(self, name: str) -> str: - """Generate the name of a section. Note that we use a quoted string here - so we can use section references within lists and load the list as - JSON. Since section references can't be used within strings, we don't - need the quoted vs. unquoted distinction like we do for variables. 
- - Examples (assuming section = {"foo": 1}): - - value: ${section.foo} -> value: 1 - - value: "hello ${section.foo}" -> value: "hello 1" - - value: ${section} -> value: {"foo": 1} - - value: "${section}" -> value: {"foo": 1} - - value: "hello ${section}" -> invalid - """ - return f'"{SECTION_PREFIX}{name}"' - - -def get_configparser(interpolate: bool = True): - config = ConfigParser(interpolation=CustomInterpolation() if interpolate else None) - # Preserve case of keys: https://stackoverflow.com/a/1611877/6400719 - config.optionxform = str # type: ignore - return config - - -class Config(dict): - """This class holds the model and training configuration and can load and - save the TOML-style configuration format from/to a string, file or bytes. - The Config class is a subclass of dict and uses Python's ConfigParser - under the hood. - """ - - is_interpolated: bool - - def __init__( - self, - data: Optional[Union[Dict[str, Any], "ConfigParser", "Config"]] = None, - *, - is_interpolated: Optional[bool] = None, - section_order: Optional[List[str]] = None, - ) -> None: - """Initialize a new Config object with optional data.""" - dict.__init__(self) - if data is None: - data = {} - if not isinstance(data, (dict, Config, ConfigParser)): - raise ValueError( - f"Can't initialize Config with data. Expected dict, Config or " - f"ConfigParser but got: {type(data)}" - ) - # Whether the config has been interpolated. We can use this to check - # whether we need to interpolate again when it's resolved. We assume - # that a config is interpolated by default. - if is_interpolated is not None: - self.is_interpolated = is_interpolated - elif isinstance(data, Config): - self.is_interpolated = data.is_interpolated - else: - self.is_interpolated = True - if section_order is not None: - self.section_order = section_order - elif isinstance(data, Config): - self.section_order = data.section_order - else: - self.section_order = [] - # Update with data - self.update(self._sort(data)) - - def interpolate(self) -> "Config": - """Interpolate a config. Returns a copy of the object.""" - # This is currently the most effective way because we need our custom - # to_str logic to run in order to re-serialize the values so we can - # interpolate them again. ConfigParser.read_dict will just call str() - # on all values, which isn't enough. - return Config().from_str(self.to_str()) - - def interpret_config(self, config: "ConfigParser") -> None: - """Interpret a config, parse nested sections and parse the values - as JSON. Mostly used internally and modifies the config in place. - """ - self._validate_sections(config) - # Sort sections by depth, so that we can iterate breadth-first. This - # allows us to check that we're not expanding an undefined block. - get_depth = lambda item: len(item[0].split(".")) - for section, values in sorted(config.items(), key=get_depth): - if section == "DEFAULT": - # Skip [DEFAULT] section so it doesn't cause validation error - continue - parts = section.split(".") - node = self - for part in parts[:-1]: - if part == "*": - node = node.setdefault(part, {}) - elif part not in node: - err_title = f"Error parsing config section. Perhaps a section name is wrong?" 
- err = [{"loc": parts, "msg": f"Section '{part}' is not defined"}] - raise ConfigValidationError( - config=self, errors=err, title=err_title - ) - else: - node = node[part] - if not isinstance(node, dict): - # Happens if both value *and* subsection were defined for a key - err = [{"loc": parts, "msg": "found conflicting values"}] - err_cfg = f"{self}\n{({part: dict(values)})}" - raise ConfigValidationError(config=err_cfg, errors=err) - # Set the default section - node = node.setdefault(parts[-1], {}) - if not isinstance(node, dict): - # Happens if both value *and* subsection were defined for a key - err = [{"loc": parts, "msg": "found conflicting values"}] - err_cfg = f"{self}\n{({part: dict(values)})}" - raise ConfigValidationError(config=err_cfg, errors=err) - try: - keys_values = list(values.items()) - except InterpolationMissingOptionError as e: - raise ConfigValidationError(desc=f"{e}") from None - for key, value in keys_values: - config_v = config.get(section, key) - node[key] = self._interpret_value(config_v) - self.replace_section_refs(self) - - def replace_section_refs( - self, config: Union[Dict[str, Any], "Config"], parent: str = "" - ) -> None: - """Replace references to section blocks in the final config.""" - for key, value in config.items(): - key_parent = f"{parent}.{key}".strip(".") - if isinstance(value, dict): - self.replace_section_refs(value, parent=key_parent) - elif isinstance(value, list): - config[key] = [ - self._get_section_ref(v, parent=[parent, key]) for v in value - ] - else: - config[key] = self._get_section_ref(value, parent=[parent, key]) - - def _interpret_value(self, value: Any) -> Any: - """Interpret a single config value.""" - result = try_load_json(value) - # If value is a string and it contains a variable, use original value - # (not interpreted string, which could lead to double quotes: - # ${x.y} -> "${x.y}" -> "'${x.y}'"). Make sure to check it's a string, - # so we're not keeping lists as strings. - # NOTE: This currently can't handle uninterpolated values like [${x.y}]! - if isinstance(result, str) and VARIABLE_RE.search(value): - result = value - if isinstance(result, list): - return [self._interpret_value(v) for v in result] - return result - - def _get_section_ref(self, value: Any, *, parent: List[str] = []) -> Any: - """Get a single section reference.""" - if isinstance(value, str) and value.startswith(f'"{SECTION_PREFIX}'): - value = try_load_json(value) - if isinstance(value, str) and value.startswith(SECTION_PREFIX): - parts = value.replace(SECTION_PREFIX, "").split(".") - result = self - for item in parts: - try: - result = result[item] - except (KeyError, TypeError): # This should never happen - err_title = "Error parsing reference to config section" - err_msg = f"Section '{'.'.join(parts)}' is not defined" - err = [{"loc": parts, "msg": err_msg}] - raise ConfigValidationError( - config=self, errors=err, title=err_title - ) from None - return result - elif isinstance(value, str) and SECTION_PREFIX in value: - # String value references a section (either a dict or return - # value of promise). We can't allow this, since variables are - # always interpolated *before* configs are resolved. - err_desc = ( - "Can't reference whole sections or return values of function " - "blocks inside a string or list\n\nYou can change your variable to " - "reference a value instead. 
Keep in mind that it's not " - "possible to interpolate the return value of a registered " - "function, since variables are interpolated when the config " - "is loaded, and registered functions are resolved afterwards." - ) - err = [{"loc": parent, "msg": "uses section variable in string or list"}] - raise ConfigValidationError(errors=err, desc=err_desc) - return value - - def copy(self) -> "Config": - """Deepcopy the config.""" - try: - config = copy.deepcopy(self) - except Exception as e: - raise ValueError(f"Couldn't deep-copy config: {e}") from e - return Config( - config, - is_interpolated=self.is_interpolated, - section_order=self.section_order, - ) - - def merge( - self, updates: Union[Dict[str, Any], "Config"], remove_extra: bool = False - ) -> "Config": - """Deep merge the config with updates, using current as defaults.""" - defaults = self.copy() - updates = Config(updates).copy() - merged = deep_merge_configs(updates, defaults, remove_extra=remove_extra) - return Config( - merged, - is_interpolated=defaults.is_interpolated and updates.is_interpolated, - section_order=defaults.section_order, - ) - - def _sort( - self, data: Union["Config", "ConfigParser", Dict[str, Any]] - ) -> Dict[str, Any]: - """Sort sections using the currently defined sort order. Sort - sections by index on section order, if available, then alphabetic, and - account for subsections, which should always follow their parent. - """ - sort_map = {section: i for i, section in enumerate(self.section_order)} - sort_key = lambda x: ( - sort_map.get(x[0].split(".")[0], len(sort_map)), - _mask_positional_args(x[0]), - ) - return dict(sorted(data.items(), key=sort_key)) - - def _set_overrides(self, config: "ConfigParser", overrides: Dict[str, Any]) -> None: - """Set overrides in the ConfigParser before config is interpreted.""" - err_title = "Error parsing config overrides" - for key, value in overrides.items(): - err_msg = "not a section value that can be overwritten" - err = [{"loc": key.split("."), "msg": err_msg}] - if "." not in key: - raise ConfigValidationError(errors=err, title=err_title) - section, option = key.rsplit(".", 1) - # Check for section and accept if option not in config[section] - if section not in config: - raise ConfigValidationError(errors=err, title=err_title) - config.set(section, option, try_dump_json(value, overrides)) - - def _validate_sections(self, config: "ConfigParser") -> None: - # If the config defines top-level properties that are not sections (e.g. - # if config was constructed from dict), those values would be added as - # [DEFAULTS] and included in *every other section*. This is usually not - # what we want and it can lead to very confusing results. 
- default_section = config.defaults() - if default_section: - err_title = "Found config values without a top-level section" - err_msg = "not part of a section" - err = [{"loc": [k], "msg": err_msg} for k in default_section] - raise ConfigValidationError(errors=err, title=err_title) - - def from_str( - self, text: str, *, interpolate: bool = True, overrides: Dict[str, Any] = {} - ) -> "Config": - """Load the config from a string.""" - config = get_configparser(interpolate=interpolate) - if overrides: - config = get_configparser(interpolate=False) - try: - config.read_string(text) - except ParsingError as e: - desc = f"Make sure the sections and values are formatted correctly.\n\n{e}" - raise ConfigValidationError(desc=desc) from None - config._sections = self._sort(config._sections) - self._set_overrides(config, overrides) - self.clear() - self.interpret_config(config) - if overrides and interpolate: - # do the interpolation. Avoids recursion because the new call from_str call will have overrides as empty - self = self.interpolate() - self.is_interpolated = interpolate - return self - - def to_str(self, *, interpolate: bool = True) -> str: - """Write the config to a string.""" - flattened = get_configparser(interpolate=interpolate) - queue: List[Tuple[tuple, "Config"]] = [(tuple(), self)] - for path, node in queue: - section_name = ".".join(path) - is_kwarg = path and path[-1] != "*" - if is_kwarg and not flattened.has_section(section_name): - # Always create sections for non-'*' sections, not only if - # they have leaf entries, as we don't want to expand - # blocks that are undefined - flattened.add_section(section_name) - for key, value in node.items(): - if hasattr(value, "items"): - # Reference to a function with no arguments, serialize - # inline as a dict and don't create new section - if registry.is_promise(value) and len(value) == 1 and is_kwarg: - flattened.set(section_name, key, try_dump_json(value, node)) - else: - queue.append((path + (key,), value)) - else: - flattened.set(section_name, key, try_dump_json(value, node)) - # Order so subsection follow parent (not all sections, then all subs etc.) 
- flattened._sections = self._sort(flattened._sections) - self._validate_sections(flattened) - string_io = io.StringIO() - flattened.write(string_io) - return string_io.getvalue().strip() - - def to_bytes(self, *, interpolate: bool = True) -> bytes: - """Serialize the config to a byte string.""" - return self.to_str(interpolate=interpolate).encode("utf8") - - def from_bytes( - self, - bytes_data: bytes, - *, - interpolate: bool = True, - overrides: Dict[str, Any] = {}, - ) -> "Config": - """Load the config from a byte string.""" - return self.from_str( - bytes_data.decode("utf8"), interpolate=interpolate, overrides=overrides - ) - - def to_disk(self, path: Union[str, Path], *, interpolate: bool = True): - """Serialize the config to a file.""" - path = Path(path) if isinstance(path, str) else path - with path.open("w", encoding="utf8") as file_: - file_.write(self.to_str(interpolate=interpolate)) - - def from_disk( - self, - path: Union[str, Path], - *, - interpolate: bool = True, - overrides: Dict[str, Any] = {}, - ) -> "Config": - """Load config from a file.""" - path = Path(path) if isinstance(path, str) else path - with path.open("r", encoding="utf8") as file_: - text = file_.read() - return self.from_str(text, interpolate=interpolate, overrides=overrides) - - -def _mask_positional_args(name: str) -> List[Optional[str]]: - """Create a section name representation that masks names - of positional arguments to retain their order in sorts.""" - - stable_name = cast(List[Optional[str]], name.split(".")) - - # Remove names of sections that are a positional argument. - for i in range(1, len(stable_name)): - if stable_name[i - 1] == "*": - stable_name[i] = None - - return stable_name - - -def try_load_json(value: str) -> Any: - """Load a JSON string if possible, otherwise default to original value.""" - try: - return srsly.json_loads(value) - except Exception: - return value - - -def try_dump_json(value: Any, data: Union[Dict[str, dict], Config, str] = "") -> str: - """Dump a config value as JSON and output user-friendly error if it fails.""" - # Special case if we have a variable: it's already a string so don't dump - # to preserve ${x:y} vs. "${x:y}" - if isinstance(value, str) and VARIABLE_RE.search(value): - return value - if isinstance(value, str) and value.replace(".", "", 1).isdigit(): - # Work around values that are strings but numbers - value = f'"{value}"' - try: - return srsly.json_dumps(value) - except Exception as e: - err_msg = ( - f"Couldn't serialize config value of type {type(value)}: {e}. Make " - f"sure all values in your config are JSON-serializable. If you want " - f"to include Python objects, use a registered function that returns " - f"the object instead." 
- ) - raise ConfigValidationError(config=data, desc=err_msg) from e - - -def deep_merge_configs( - config: Union[Dict[str, Any], Config], - defaults: Union[Dict[str, Any], Config], - *, - remove_extra: bool = False, -) -> Union[Dict[str, Any], Config]: - """Deep merge two configs.""" - if remove_extra: - # Filter out values in the original config that are not in defaults - keys = list(config.keys()) - for key in keys: - if key not in defaults: - del config[key] - for key, value in defaults.items(): - if isinstance(value, dict): - node = config.setdefault(key, {}) - if not isinstance(node, dict): - continue - value_promises = [k for k in value if k.startswith("@")] - value_promise = value_promises[0] if value_promises else None - node_promises = [k for k in node if k.startswith("@")] if node else [] - node_promise = node_promises[0] if node_promises else None - # We only update the block from defaults if it refers to the same - # registered function - if ( - value_promise - and node_promise - and ( - value_promise in node - and node[value_promise] != value[value_promise] - ) - ): - continue - if node_promise and ( - node_promise not in value or node[node_promise] != value[node_promise] - ): - continue - defaults = deep_merge_configs(node, value, remove_extra=remove_extra) - elif key not in config: - config[key] = value - return config - - -class ConfigValidationError(ValueError): - def __init__( - self, - *, - config: Optional[Union[Config, Dict[str, Dict[str, Any]], str]] = None, - errors: Union[Sequence[Mapping[str, Any]], Iterable[Dict[str, Any]]] = tuple(), - title: Optional[str] = "Config validation error", - desc: Optional[str] = None, - parent: Optional[str] = None, - show_config: bool = True, - ) -> None: - """Custom error for validating configs. - - config (Union[Config, Dict[str, Dict[str, Any]], str]): The - config the validation error refers to. - errors (Union[Sequence[Mapping[str, Any]], Iterable[Dict[str, Any]]]): - A list of errors as dicts with keys "loc" (list of strings - describing the path of the value), "msg" (validation message - to show) and optional "type" (mostly internals). - Same format as produced by pydantic's validation error (e.errors()). - title (str): The error title. - desc (str): Optional error description, displayed below the title. - parent (str): Optional parent to use as prefix for all error locations. - For example, parent "element" will result in "element -> a -> b". - show_config (bool): Whether to print the whole config with the error. - - ATTRIBUTES: - config (Union[Config, Dict[str, Dict[str, Any]], str]): The config. - errors (Iterable[Dict[str, Any]]): The errors. - error_types (Set[str]): All "type" values defined in the errors, if - available. This is most relevant for the pydantic errors that define - types like "type_error.integer". This attribute makes it easy to - check if a config validation error includes errors of a certain - type, e.g. to log additional information or custom help messages. - title (str): The title. - desc (str): The description. - parent (str): The parent. - show_config (bool): Whether to show the config. - text (str): The formatted error text. 
- """ - self.config = config - self.errors = errors - self.title = title - self.desc = desc - self.parent = parent - self.show_config = show_config - self.error_types = set() - for error in self.errors: - err_type = error.get("type") - if err_type: - self.error_types.add(err_type) - self.text = self._format() - ValueError.__init__(self, self.text) - - @classmethod - def from_error( - cls, - err: "ConfigValidationError", - title: Optional[str] = None, - desc: Optional[str] = None, - parent: Optional[str] = None, - show_config: Optional[bool] = None, - ) -> "ConfigValidationError": - """Create a new ConfigValidationError based on an existing error, e.g. - to re-raise it with different settings. If no overrides are provided, - the values from the original error are used. - - err (ConfigValidationError): The original error. - title (str): Overwrite error title. - desc (str): Overwrite error description. - parent (str): Overwrite error parent. - show_config (bool): Overwrite whether to show config. - RETURNS (ConfigValidationError): The new error. - """ - return cls( - config=err.config, - errors=err.errors, - title=title if title is not None else err.title, - desc=desc if desc is not None else err.desc, - parent=parent if parent is not None else err.parent, - show_config=show_config if show_config is not None else err.show_config, - ) - - def _format(self) -> str: - """Format the error message.""" - loc_divider = "->" - data = [] - for error in self.errors: - err_loc = f" {loc_divider} ".join([str(p) for p in error.get("loc", [])]) - if self.parent: - err_loc = f"{self.parent} {loc_divider} {err_loc}" - data.append((err_loc, error.get("msg"))) - result = [] - if self.title: - result.append(self.title) - if self.desc: - result.append(self.desc) - if data: - result.append(table(data)) - if self.config and self.show_config: - result.append(f"{self.config}") - return "\n\n" + "\n".join(result) - - -def alias_generator(name: str) -> str: - """Generate field aliases in promise schema.""" - # Underscore fields are not allowed in model, so use alias - if name == ARGS_FIELD_ALIAS: - return ARGS_FIELD - # Auto-alias fields that shadow base model attributes - if name in RESERVED_FIELDS: - return RESERVED_FIELDS[name] - return name - - -def copy_model_field(field: ModelField, type_: Any) -> ModelField: - """Copy a model field and assign a new type, e.g. to accept an Any type - even though the original value is typed differently. 
- """ - return ModelField( - name=field.name, - type_=type_, - class_validators=field.class_validators, - model_config=field.model_config, - default=field.default, - default_factory=field.default_factory, - required=field.required, - ) - - -class EmptySchema(BaseModel): - class Config: - extra = "allow" - arbitrary_types_allowed = True - - -class _PromiseSchemaConfig: - extra = "forbid" - arbitrary_types_allowed = True - alias_generator = alias_generator - - -@dataclass -class Promise: - registry: str - name: str - args: List[str] - kwargs: Dict[str, Any] - - -class registry(object): +class registry(confection.registry): # fmt: off optimizers: Decorator = catalogue.create("thinc", "optimizers", entry_points=True) schedules: Decorator = catalogue.create("thinc", "schedules", entry_points=True) @@ -716,346 +25,5 @@ def create(cls, registry_name: str, entry_points: bool = False) -> None: ) setattr(cls, registry_name, reg) - @classmethod - def has(cls, registry_name: str, func_name: str) -> bool: - """Check whether a function is available in a registry.""" - if not hasattr(cls, registry_name): - return False - reg = getattr(cls, registry_name) - return func_name in reg - - @classmethod - def get(cls, registry_name: str, func_name: str) -> Callable: - """Get a registered function from a given registry.""" - if not hasattr(cls, registry_name): - raise ValueError(f"Unknown registry: '{registry_name}'") - reg = getattr(cls, registry_name) - func = reg.get(func_name) - if func is None: - raise ValueError(f"Could not find '{func_name}' in '{registry_name}'") - return func - - @classmethod - def resolve( - cls, - config: Union[Config, Dict[str, Dict[str, Any]]], - *, - schema: Type[BaseModel] = EmptySchema, - overrides: Dict[str, Any] = {}, - validate: bool = True, - ) -> Dict[str, Any]: - resolved, _ = cls._make( - config, schema=schema, overrides=overrides, validate=validate, resolve=True - ) - return resolved - - @classmethod - def fill( - cls, - config: Union[Config, Dict[str, Dict[str, Any]]], - *, - schema: Type[BaseModel] = EmptySchema, - overrides: Dict[str, Any] = {}, - validate: bool = True, - ): - _, filled = cls._make( - config, schema=schema, overrides=overrides, validate=validate, resolve=False - ) - return filled - - @classmethod - def _make( - cls, - config: Union[Config, Dict[str, Dict[str, Any]]], - *, - schema: Type[BaseModel] = EmptySchema, - overrides: Dict[str, Any] = {}, - resolve: bool = True, - validate: bool = True, - ) -> Tuple[Dict[str, Any], Config]: - """Unpack a config dictionary and create two versions of the config: - a resolved version with objects from the registry created recursively, - and a filled version with all references to registry functions left - intact, but filled with all values and defaults based on the type - annotations. If validate=True, the config will be validated against the - type annotations of the registered functions referenced in the config - (if available) and/or the schema (if available). - """ - # Valid: {"optimizer": {"@optimizers": "my_cool_optimizer", "rate": 1.0}} - # Invalid: {"@optimizers": "my_cool_optimizer", "rate": 1.0} - if cls.is_promise(config): - err_msg = "The top-level config object can't be a reference to a registered function." 
- raise ConfigValidationError(config=config, errors=[{"msg": err_msg}]) - # If a Config was loaded with interpolate=False, we assume it needs to - # be interpolated first, otherwise we take it at face value - is_interpolated = not isinstance(config, Config) or config.is_interpolated - section_order = config.section_order if isinstance(config, Config) else None - orig_config = config - if not is_interpolated: - config = Config(orig_config).interpolate() - filled, _, resolved = cls._fill( - config, schema, validate=validate, overrides=overrides, resolve=resolve - ) - filled = Config(filled, section_order=section_order) - # Check that overrides didn't include invalid properties not in config - if validate: - cls._validate_overrides(filled, overrides) - # Merge the original config back to preserve variables if we started - # with a config that wasn't interpolated. Here, we prefer variables to - # allow auto-filling a non-interpolated config without destroying - # variable references. - if not is_interpolated: - filled = filled.merge( - Config(orig_config, is_interpolated=False), remove_extra=True - ) - return dict(resolved), filled - - @classmethod - def _fill( - cls, - config: Union[Config, Dict[str, Dict[str, Any]]], - schema: Type[BaseModel] = EmptySchema, - *, - validate: bool = True, - resolve: bool = True, - parent: str = "", - overrides: Dict[str, Dict[str, Any]] = {}, - ) -> Tuple[ - Union[Dict[str, Any], Config], Union[Dict[str, Any], Config], Dict[str, Any] - ]: - """Build three representations of the config: - 1. All promises are preserved (just like config user would provide). - 2. Promises are replaced by their return values. This is the validation - copy and will be parsed by pydantic. It lets us include hacks to - work around problems (e.g. handling of generators). - 3. Final copy with promises replaced by their return values. - """ - filled: Dict[str, Any] = {} - validation: Dict[str, Any] = {} - final: Dict[str, Any] = {} - for key, value in config.items(): - # If the field name is reserved, we use its alias for validation - v_key = RESERVED_FIELDS.get(key, key) - key_parent = f"{parent}.{key}".strip(".") - if key_parent in overrides: - value = overrides[key_parent] - config[key] = value - if cls.is_promise(value): - if key in schema.__fields__ and not resolve: - # If we're not resolving the config, make sure that the field - # expecting the promise is typed Any so it doesn't fail - # validation if it doesn't receive the function return value - field = schema.__fields__[key] - schema.__fields__[key] = copy_model_field(field, Any) - promise_schema = cls.make_promise_schema(value, resolve=resolve) - filled[key], validation[v_key], final[key] = cls._fill( - value, - promise_schema, - validate=validate, - resolve=resolve, - parent=key_parent, - overrides=overrides, - ) - reg_name, func_name = cls.get_constructor(final[key]) - args, kwargs = cls.parse_args(final[key]) - if resolve: - # Call the function and populate the field value. We can't - # just create an instance of the type here, since this - # wouldn't work for generics / more complex custom types - getter = cls.get(reg_name, func_name) - # We don't want to try/except this and raise our own error - # here, because we want the traceback if the function fails. 
- getter_result = getter(*args, **kwargs) - else: - # We're not resolving and calling the function, so replace - # the getter_result with a Promise class - getter_result = Promise( - registry=reg_name, name=func_name, args=args, kwargs=kwargs - ) - validation[v_key] = getter_result - final[key] = getter_result - if isinstance(validation[v_key], GeneratorType): - # If value is a generator we can't validate type without - # consuming it (which doesn't work if it's infinite – see - # schedule for examples). So we skip it. - validation[v_key] = [] - elif hasattr(value, "items"): - field_type = EmptySchema - if key in schema.__fields__: - field = schema.__fields__[key] - field_type = field.type_ - if not isinstance(field.type_, ModelMetaclass): - # If we don't have a pydantic schema and just a type - field_type = EmptySchema - filled[key], validation[v_key], final[key] = cls._fill( - value, - field_type, - validate=validate, - resolve=resolve, - parent=key_parent, - overrides=overrides, - ) - if key == ARGS_FIELD and isinstance(validation[v_key], dict): - # If the value of variable positional args is a dict (e.g. - # created via config blocks), only use its values - validation[v_key] = list(validation[v_key].values()) - final[key] = list(final[key].values()) - else: - filled[key] = value - # Prevent pydantic from consuming generator if part of a union - validation[v_key] = ( - value if not isinstance(value, GeneratorType) else [] - ) - final[key] = value - # Now that we've filled in all of the promises, update with defaults - # from schema, and validate if validation is enabled - exclude = [] - if validate: - try: - result = schema.parse_obj(validation) - except ValidationError as e: - raise ConfigValidationError( - config=config, errors=e.errors(), parent=parent - ) from None - else: - # Same as parse_obj, but without validation - result = schema.construct(**validation) - # If our schema doesn't allow extra values, we need to filter them - # manually because .construct doesn't parse anything - if schema.Config.extra in (Extra.forbid, Extra.ignore): - fields = schema.__fields__.keys() - exclude = [k for k in result.__fields_set__ if k not in fields] - exclude_validation = set([ARGS_FIELD_ALIAS, *RESERVED_FIELDS.keys()]) - validation.update(result.dict(exclude=exclude_validation)) - filled, final = cls._update_from_parsed(validation, filled, final) - if exclude: - filled = {k: v for k, v in filled.items() if k not in exclude} - validation = {k: v for k, v in validation.items() if k not in exclude} - final = {k: v for k, v in final.items() if k not in exclude} - return filled, validation, final - - @classmethod - def _update_from_parsed( - cls, validation: Dict[str, Any], filled: Dict[str, Any], final: Dict[str, Any] - ): - """Update the final result with the parsed config like converted - values recursively. 
- """ - for key, value in validation.items(): - if key in RESERVED_FIELDS.values(): - continue # skip aliases for reserved fields - if key not in filled: - filled[key] = value - if key not in final: - final[key] = value - if isinstance(value, dict): - filled[key], final[key] = cls._update_from_parsed( - value, filled[key], final[key] - ) - # Update final config with parsed value if they're not equal (in - # value and in type) but not if it's a generator because we had to - # replace that to validate it correctly - elif key == ARGS_FIELD: - continue # don't substitute if list of positional args - elif isinstance(value, numpy.ndarray): # check numpy first, just in case - final[key] = value - elif ( - value != final[key] or not isinstance(type(value), type(final[key])) - ) and not isinstance(final[key], GeneratorType): - final[key] = value - return filled, final - - @classmethod - def _validate_overrides(cls, filled: Config, overrides: Dict[str, Any]): - """Validate overrides against a filled config to make sure there are - no references to properties that don't exist and weren't used.""" - error_msg = "Invalid override: config value doesn't exist" - errors = [] - for override_key in overrides.keys(): - if not cls._is_in_config(override_key, filled): - errors.append({"msg": error_msg, "loc": [override_key]}) - if errors: - raise ConfigValidationError(config=filled, errors=errors) - - @classmethod - def _is_in_config(cls, prop: str, config: Union[Dict[str, Any], Config]): - """Check whether a nested config property like "section.subsection.key" - is in a given config.""" - tree = prop.split(".") - obj = dict(config) - while tree: - key = tree.pop(0) - if isinstance(obj, dict) and key in obj: - obj = obj[key] - else: - return False - return True - - @classmethod - def is_promise(cls, obj: Any) -> bool: - """Check whether an object is a "promise", i.e. contains a reference - to a registered function (via a key starting with `"@"`. - """ - if not hasattr(obj, "keys"): - return False - id_keys = [k for k in obj.keys() if k.startswith("@")] - if len(id_keys): - return True - return False - - @classmethod - def get_constructor(cls, obj: Dict[str, Any]) -> Tuple[str, str]: - id_keys = [k for k in obj.keys() if k.startswith("@")] - if len(id_keys) != 1: - err_msg = f"A block can only contain one function registry reference. Got: {id_keys}" - raise ConfigValidationError(config=obj, errors=[{"msg": err_msg}]) - else: - key = id_keys[0] - value = obj[key] - return (key[1:], value) - - @classmethod - def parse_args(cls, obj: Dict[str, Any]) -> Tuple[List[Any], Dict[str, Any]]: - args = [] - kwargs = {} - for key, value in obj.items(): - if not key.startswith("@"): - if key == ARGS_FIELD: - args = value - elif key in RESERVED_FIELDS.values(): - continue - else: - kwargs[key] = value - return args, kwargs - - @classmethod - def make_promise_schema( - cls, obj: Dict[str, Any], *, resolve: bool = True - ) -> Type[BaseModel]: - """Create a schema for a promise dict (referencing a registry function) - by inspecting the function signature. 
- """ - reg_name, func_name = cls.get_constructor(obj) - if not resolve and not cls.has(reg_name, func_name): - return EmptySchema - func = cls.get(reg_name, func_name) - # Read the argument annotations and defaults from the function signature - id_keys = [k for k in obj.keys() if k.startswith("@")] - sig_args: Dict[str, Any] = {id_keys[0]: (str, ...)} - for param in inspect.signature(func).parameters.values(): - # If no annotation is specified assume it's anything - annotation = param.annotation if param.annotation != param.empty else Any - # If no default value is specified assume that it's required - default = param.default if param.default != param.empty else ... - # Handle spread arguments and use their annotation as Sequence[whatever] - if param.kind == param.VAR_POSITIONAL: - spread_annot = Sequence[annotation] # type: ignore - sig_args[ARGS_FIELD_ALIAS] = (spread_annot, default) - else: - name = RESERVED_FIELDS.get(param.name, param.name) - sig_args[name] = (annotation, default) - sig_args["__config__"] = _PromiseSchemaConfig - return create_model("ArgModel", **sig_args) - __all__ = ["Config", "registry", "ConfigValidationError"] diff --git a/thinc/tests/test_config.py b/thinc/tests/test_config.py index ddd05ca96..fcba87f4b 100644 --- a/thinc/tests/test_config.py +++ b/thinc/tests/test_config.py @@ -135,360 +135,6 @@ def catsie_v2(evil: StrictBool, cute: bool = True, cute_level: int = 1) -> str: worst_catsie = {"@cats": "catsie.v1", "evil": True, "cute": False} -def test_validate_simple_config(): - simple_config = {"hello": 1, "world": 2} - f, _, v = my_registry._fill(simple_config, HelloIntsSchema) - assert f == simple_config - assert v == simple_config - - -def test_invalidate_simple_config(): - invalid_config = {"hello": 1, "world": "hi!"} - with pytest.raises(ConfigValidationError) as exc_info: - my_registry._fill(invalid_config, HelloIntsSchema) - error = exc_info.value - assert len(error.errors) == 1 - assert "type_error.integer" in error.error_types - - -def test_invalidate_extra_args(): - invalid_config = {"hello": 1, "world": 2, "extra": 3} - with pytest.raises(ConfigValidationError): - my_registry._fill(invalid_config, HelloIntsSchema) - - -def test_fill_defaults_simple_config(): - valid_config = {"required": 1} - filled, _, v = my_registry._fill(valid_config, DefaultsSchema) - assert filled["required"] == 1 - assert filled["optional"] == "default value" - invalid_config = {"optional": "some value"} - with pytest.raises(ConfigValidationError): - my_registry._fill(invalid_config, DefaultsSchema) - - -def test_fill_recursive_config(): - valid_config = {"outer_req": 1, "level2_req": {"hello": 4, "world": 7}} - filled, _, validation = my_registry._fill(valid_config, ComplexSchema) - assert filled["outer_req"] == 1 - assert filled["outer_opt"] == "default value" - assert filled["level2_req"]["hello"] == 4 - assert filled["level2_req"]["world"] == 7 - assert filled["level2_opt"]["required"] == 1 - assert filled["level2_opt"]["optional"] == "default value" - - -def test_is_promise(): - assert my_registry.is_promise(good_catsie) - assert not my_registry.is_promise({"hello": "world"}) - assert not my_registry.is_promise(1) - invalid = {"@complex": "complex.v1", "rate": 1.0, "@cats": "catsie.v1"} - assert my_registry.is_promise(invalid) - - -def test_get_constructor(): - my_registry.get_constructor(good_catsie) == ("cats", "catsie.v1") - - -def test_parse_args(): - args, kwargs = my_registry.parse_args(bad_catsie) - assert args == [] - assert kwargs == {"evil": True, "cute": True} 
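
The tests deleted above pinned down the "promise" convention that now lives in confection: a config block whose key starts with `@` names a registered function, and `registry.resolve` replaces the block with that function's return value, validating the arguments against its signature. A minimal sketch of that behaviour against the `registry` still exported from `thinc.config` (editorial example, not part of this patch; the `my_optimizer.v1` registration is a made-up name):

    from thinc.config import registry

    @registry.optimizers("my_optimizer.v1")  # hypothetical name, for illustration only
    def make_my_optimizer(learn_rate: float = 0.001):
        return {"learn_rate": learn_rate}

    # Promises may only appear below the top level (a top-level promise raises
    # a ConfigValidationError), hence the enclosing "optimizer" section.
    cfg = {"optimizer": {"@optimizers": "my_optimizer.v1", "learn_rate": 0.01}}
    resolved = registry.resolve(cfg)
    assert resolved["optimizer"] == {"learn_rate": 0.01}
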
- - -def test_make_promise_schema(): - schema = my_registry.make_promise_schema(good_catsie) - assert "evil" in schema.__fields__ - assert "cute" in schema.__fields__ - - -def test_validate_promise(): - config = {"required": 1, "optional": good_catsie} - filled, _, validated = my_registry._fill(config, DefaultsSchema) - assert filled == config - assert validated == {"required": 1, "optional": "meow"} - - -def test_fill_validate_promise(): - config = {"required": 1, "optional": {"@cats": "catsie.v1", "evil": False}} - filled, _, validated = my_registry._fill(config, DefaultsSchema) - assert filled["optional"]["cute"] is True - - -def test_fill_invalidate_promise(): - config = {"required": 1, "optional": {"@cats": "catsie.v1", "evil": False}} - with pytest.raises(ConfigValidationError): - my_registry._fill(config, HelloIntsSchema) - config["optional"]["whiskers"] = True - with pytest.raises(ConfigValidationError): - my_registry._fill(config, DefaultsSchema) - - -def test_create_registry(): - with pytest.raises(ValueError): - my_registry.create("cats") - my_registry.create("dogs") - assert hasattr(my_registry, "dogs") - assert len(my_registry.dogs.get_all()) == 0 - my_registry.dogs.register("good_boy.v1", func=lambda x: x) - assert len(my_registry.dogs.get_all()) == 1 - with pytest.raises(ValueError): - my_registry.create("dogs") - - -def test_registry_methods(): - with pytest.raises(ValueError): - my_registry.get("dfkoofkds", "catsie.v1") - my_registry.cats.register("catsie.v123")(None) - with pytest.raises(ValueError): - my_registry.get("cats", "catsie.v123") - - -def test_resolve_no_schema(): - config = {"one": 1, "two": {"three": {"@cats": "catsie.v1", "evil": True}}} - result = my_registry.resolve({"cfg": config})["cfg"] - assert result["one"] == 1 - assert result["two"] == {"three": "scratch!"} - with pytest.raises(ConfigValidationError): - config = {"two": {"three": {"@cats": "catsie.v1", "evil": "true"}}} - my_registry.resolve(config) - - -def test_resolve_schema(): - class TestBaseSubSchema(BaseModel): - three: str - - class TestBaseSchema(BaseModel): - one: PositiveInt - two: TestBaseSubSchema - - class Config: - extra = "forbid" - - class TestSchema(BaseModel): - cfg: TestBaseSchema - - config = {"one": 1, "two": {"three": {"@cats": "catsie.v1", "evil": True}}} - my_registry.resolve({"cfg": config}, schema=TestSchema) - config = {"one": -1, "two": {"three": {"@cats": "catsie.v1", "evil": True}}} - with pytest.raises(ConfigValidationError): - # "one" is not a positive int - my_registry.resolve({"cfg": config}, schema=TestSchema) - config = {"one": 1, "two": {"four": {"@cats": "catsie.v1", "evil": True}}} - with pytest.raises(ConfigValidationError): - # "three" is required in subschema - my_registry.resolve({"cfg": config}, schema=TestSchema) - - -def test_resolve_schema_coerced(): - class TestBaseSchema(BaseModel): - test1: str - test2: bool - test3: float - - class TestSchema(BaseModel): - cfg: TestBaseSchema - - config = {"test1": 123, "test2": 1, "test3": 5} - filled = my_registry.fill({"cfg": config}, schema=TestSchema) - result = my_registry.resolve({"cfg": config}, schema=TestSchema) - assert result["cfg"] == {"test1": "123", "test2": True, "test3": 5.0} - # This only affects the resolved config, not the filled config - assert filled["cfg"] == config - - -def test_read_config(): - byte_string = EXAMPLE_CONFIG.encode("utf8") - cfg = Config().from_bytes(byte_string) - - assert cfg["optimizer"]["beta1"] == 0.9 - assert cfg["optimizer"]["learn_rate"]["initial_rate"] == 0.1 - 
assert cfg["pipeline"]["parser"]["factory"] == "parser" - assert cfg["pipeline"]["parser"]["model"]["tok2vec"]["width"] == 128 - - -def test_optimizer_config(): - cfg = Config().from_str(OPTIMIZER_CFG) - optimizer = my_registry.resolve(cfg, validate=True)["optimizer"] - assert optimizer.b1 == 0.9 - - -def test_config_to_str(): - cfg = Config().from_str(OPTIMIZER_CFG) - assert cfg.to_str().strip() == OPTIMIZER_CFG.strip() - cfg = Config({"optimizer": {"foo": "bar"}}).from_str(OPTIMIZER_CFG) - assert cfg.to_str().strip() == OPTIMIZER_CFG.strip() - - -def test_config_to_str_creates_intermediate_blocks(): - cfg = Config({"optimizer": {"foo": {"bar": 1}}}) - assert ( - cfg.to_str().strip() - == """ -[optimizer] - -[optimizer.foo] -bar = 1 - """.strip() - ) - - -def test_config_roundtrip_bytes(): - cfg = Config().from_str(OPTIMIZER_CFG) - cfg_bytes = cfg.to_bytes() - new_cfg = Config().from_bytes(cfg_bytes) - assert new_cfg.to_str().strip() == OPTIMIZER_CFG.strip() - - -def test_config_roundtrip_disk(): - cfg = Config().from_str(OPTIMIZER_CFG) - with make_tempdir() as path: - cfg_path = path / "config.cfg" - cfg.to_disk(cfg_path) - new_cfg = Config().from_disk(cfg_path) - assert new_cfg.to_str().strip() == OPTIMIZER_CFG.strip() - - -def test_config_roundtrip_disk_respects_path_subclasses(pathy_fixture): - cfg = Config().from_str(OPTIMIZER_CFG) - cfg_path = pathy_fixture / "config.cfg" - cfg.to_disk(cfg_path) - new_cfg = Config().from_disk(cfg_path) - assert new_cfg.to_str().strip() == OPTIMIZER_CFG.strip() - - -def test_config_to_str_invalid_defaults(): - """Test that an error is raised if a config contains top-level keys without - a section that would otherwise be interpreted as [DEFAULT] (which causes - the values to be included in *all* other sections). 
- """ - cfg = {"one": 1, "two": {"@cats": "catsie.v1", "evil": "hello"}} - with pytest.raises(ConfigValidationError): - Config(cfg).to_str() - config_str = "[DEFAULT]\none = 1" - with pytest.raises(ConfigValidationError): - Config().from_str(config_str) - - -def test_validation_custom_types(): - def complex_args( - rate: StrictFloat, - steps: PositiveInt = 10, # type: ignore - log_level: constr(regex="(DEBUG|INFO|WARNING|ERROR)") = "ERROR", - ): - return None - - my_registry.create("complex") - my_registry.complex("complex.v1")(complex_args) - cfg = {"@complex": "complex.v1", "rate": 1.0, "steps": 20, "log_level": "INFO"} - my_registry.resolve({"config": cfg}) - cfg = {"@complex": "complex.v1", "rate": 1.0, "steps": -1, "log_level": "INFO"} - with pytest.raises(ConfigValidationError): - # steps is not a positive int - my_registry.resolve({"config": cfg}) - cfg = {"@complex": "complex.v1", "rate": 1.0, "steps": 20, "log_level": "none"} - with pytest.raises(ConfigValidationError): - # log_level is not a string matching the regex - my_registry.resolve({"config": cfg}) - cfg = {"@complex": "complex.v1", "rate": 1.0, "steps": 20, "log_level": "INFO"} - with pytest.raises(ConfigValidationError): - # top-level object is promise - my_registry.resolve(cfg) - with pytest.raises(ConfigValidationError): - # top-level object is promise - my_registry.fill(cfg) - cfg = {"@complex": "complex.v1", "rate": 1.0, "@cats": "catsie.v1"} - with pytest.raises(ConfigValidationError): - # two constructors - my_registry.resolve({"config": cfg}) - - -def test_validation_no_validate(): - config = {"one": 1, "two": {"three": {"@cats": "catsie.v1", "evil": "false"}}} - result = my_registry.resolve({"cfg": config}, validate=False) - filled = my_registry.fill({"cfg": config}, validate=False) - assert result["cfg"]["one"] == 1 - assert result["cfg"]["two"] == {"three": "scratch!"} - assert filled["cfg"]["two"]["three"]["evil"] == "false" - assert filled["cfg"]["two"]["three"]["cute"] is True - - -def test_validation_fill_defaults(): - config = {"cfg": {"one": 1, "two": {"@cats": "catsie.v1", "evil": "hello"}}} - result = my_registry.fill(config, validate=False) - assert len(result["cfg"]["two"]) == 3 - with pytest.raises(ConfigValidationError): - # Required arg "evil" is not defined - my_registry.fill(config) - config = {"cfg": {"one": 1, "two": {"@cats": "catsie.v2", "evil": False}}} - # Fill in with new defaults - result = my_registry.fill(config) - assert len(result["cfg"]["two"]) == 4 - assert result["cfg"]["two"]["evil"] is False - assert result["cfg"]["two"]["cute"] is True - assert result["cfg"]["two"]["cute_level"] == 1 - - -def test_make_config_positional_args(): - @my_registry.cats("catsie.v567") - def catsie_567(*args: Optional[str], foo: str = "bar"): - assert args[0] == "^_^" - assert args[1] == "^(*.*)^" - assert foo == "baz" - return args[0] - - args = ["^_^", "^(*.*)^"] - cfg = {"config": {"@cats": "catsie.v567", "foo": "baz", "*": args}} - assert my_registry.resolve(cfg)["config"] == "^_^" - - -def test_make_config_positional_args_complex(): - @my_registry.cats("catsie.v890") - def catsie_890(*args: Optional[Union[StrictBool, PositiveInt]]): - assert args[0] == 123 - return args[0] - - cfg = {"config": {"@cats": "catsie.v890", "*": [123, True, 1, False]}} - assert my_registry.resolve(cfg)["config"] == 123 - cfg = {"config": {"@cats": "catsie.v890", "*": [123, "True"]}} - with pytest.raises(ConfigValidationError): - # "True" is not a valid boolean or positive int - my_registry.resolve(cfg) - - -def 
test_positional_args_to_from_string(): - cfg = """[a]\nb = 1\n* = ["foo","bar"]""" - assert Config().from_str(cfg).to_str() == cfg - cfg = """[a]\nb = 1\n\n[a.*.bar]\ntest = 2\n\n[a.*.foo]\ntest = 1""" - assert Config().from_str(cfg).to_str() == cfg - - @my_registry.cats("catsie.v666") - def catsie_666(*args, meow=False): - return args - - cfg = """[a]\n@cats = "catsie.v666"\n* = ["foo","bar"]""" - filled = my_registry.fill(Config().from_str(cfg)).to_str() - assert filled == """[a]\n@cats = "catsie.v666"\n* = ["foo","bar"]\nmeow = false""" - resolved = my_registry.resolve(Config().from_str(cfg)) - assert resolved == {"a": ("foo", "bar")} - cfg = """[a]\n@cats = "catsie.v666"\n\n[a.*.foo]\nx = 1""" - filled = my_registry.fill(Config().from_str(cfg)).to_str() - assert filled == """[a]\n@cats = "catsie.v666"\nmeow = false\n\n[a.*.foo]\nx = 1""" - resolved = my_registry.resolve(Config().from_str(cfg)) - assert resolved == {"a": ({"x": 1},)} - - @my_registry.cats("catsie.v777") - def catsie_777(y: int = 1): - return "meow" * y - - cfg = """[a]\n@cats = "catsie.v666"\n\n[a.*.foo]\n@cats = "catsie.v777\"""" - filled = my_registry.fill(Config().from_str(cfg)).to_str() - expected = """[a]\n@cats = "catsie.v666"\nmeow = false\n\n[a.*.foo]\n@cats = "catsie.v777"\ny = 1""" - assert filled == expected - cfg = """[a]\n@cats = "catsie.v666"\n\n[a.*.foo]\n@cats = "catsie.v777"\ny = 3""" - result = my_registry.resolve(Config().from_str(cfg)) - assert result == {"a": ("meowmeowmeow",)} - def test_make_config_positional_args_dicts(): cfg = { @@ -511,51 +157,6 @@ def test_make_config_positional_args_dicts(): model.finish_update(resolved["optimizer"]) -def test_validation_generators_iterable(): - @my_registry.optimizers("test_optimizer.v1") - def test_optimizer_v1(rate: float) -> None: - return None - - @my_registry.schedules("test_schedule.v1") - def test_schedule_v1(some_value: float = 1.0) -> Iterable[float]: - while True: - yield some_value - - config = {"optimizer": {"@optimizers": "test_optimizer.v1", "rate": 0.1}} - my_registry.resolve(config) - - -def test_validation_unset_type_hints(): - """Test that unset type hints are handled correctly (and treated as Any).""" - - @my_registry.optimizers("test_optimizer.v2") - def test_optimizer_v2(rate, steps: int = 10) -> None: - return None - - config = {"test": {"@optimizers": "test_optimizer.v2", "rate": 0.1, "steps": 20}} - my_registry.resolve(config) - - -def test_validation_bad_function(): - @my_registry.optimizers("bad.v1") - def bad() -> None: - raise ValueError("This is an error in the function") - return None - - @my_registry.optimizers("good.v1") - def good() -> None: - return None - - # Bad function - config = {"test": {"@optimizers": "bad.v1"}} - with pytest.raises(ValueError): - my_registry.resolve(config) - # Bad function call - config = {"test": {"@optimizers": "good.v1", "invalid_arg": 1}} - with pytest.raises(ConfigValidationError): - my_registry.resolve(config) - - def test_objects_from_config(): config = { "optimizer": { @@ -583,93 +184,6 @@ def decaying(base_rate: float, repeat: int) -> List[float]: assert optimizer.learn_rate == 0.001 -def test_partials_from_config(): - """Test that functions registered with partial applications are handled - correctly (e.g. 
initializers).""" - name = "uniform_init.v1" - cfg = {"test": {"@initializers": name, "lo": -0.2}} - func = my_registry.resolve(cfg)["test"] - assert hasattr(func, "__call__") - # The partial will still have lo as an arg, just with default - assert len(inspect.signature(func).parameters) == 4 - # Make sure returned partial function has correct value set - assert inspect.signature(func).parameters["lo"].default == -0.2 - # Actually call the function and verify - func(NumpyOps(), (2, 3)) - # Make sure validation still works - bad_cfg = {"test": {"@initializers": name, "lo": [0.5]}} - with pytest.raises(ConfigValidationError): - my_registry.resolve(bad_cfg) - bad_cfg = {"test": {"@initializers": name, "lo": -0.2, "other": 10}} - with pytest.raises(ConfigValidationError): - my_registry.resolve(bad_cfg) - - -def test_partials_from_config_nested(): - """Test that partial functions are passed correctly to other registered - functions that consume them (e.g. initializers -> layers).""" - - def test_initializer(a: int, b: int = 1) -> int: - return a * b - - @my_registry.initializers("test_initializer.v1") - def configure_test_initializer(b: int = 1) -> Callable[[int], int]: - return partial(test_initializer, b=b) - - @my_registry.layers("test_layer.v1") - def test_layer(init: Callable[[int], int], c: int = 1) -> Callable[[int], int]: - return lambda x: x + init(c) - - cfg = { - "@layers": "test_layer.v1", - "c": 5, - "init": {"@initializers": "test_initializer.v1", "b": 10}, - } - func = my_registry.resolve({"test": cfg})["test"] - assert func(1) == 51 - assert func(100) == 150 - - -def test_validate_generator(): - """Test that generator replacement for validation in config doesn't - actually replace the returned value.""" - - @my_registry.schedules("test_schedule.v2") - def test_schedule(): - while True: - yield 10 - - cfg = {"@schedules": "test_schedule.v2"} - result = my_registry.resolve({"test": cfg})["test"] - assert isinstance(result, GeneratorType) - - @my_registry.optimizers("test_optimizer.v2") - def test_optimizer2(rate: Generator) -> Generator: - return rate - - cfg = { - "@optimizers": "test_optimizer.v2", - "rate": {"@schedules": "test_schedule.v2"}, - } - result = my_registry.resolve({"test": cfg})["test"] - assert isinstance(result, GeneratorType) - - @my_registry.optimizers("test_optimizer.v3") - def test_optimizer3(schedules: Dict[str, Generator]) -> Generator: - return schedules["rate"] - - cfg = { - "@optimizers": "test_optimizer.v3", - "schedules": {"rate": {"@schedules": "test_schedule.v2"}}, - } - result = my_registry.resolve({"test": cfg})["test"] - assert isinstance(result, GeneratorType) - - @my_registry.optimizers("test_optimizer.v4") - def test_optimizer4(*schedules: Generator) -> Generator: - return schedules[0] - - def test_handle_generic_model_type(): """Test that validation can handle checks against arbitrary generic types in function argument annotations.""" @@ -685,760 +199,6 @@ def my_transform(model: Model[int, int]): assert model.name == "transformed_model" -@pytest.mark.parametrize( - "cfg", - [ - "[a]\nb = 1\nc = 2\n\n[a.c]\nd = 3", - "[a]\nb = 1\n\n[a.c]\nd = 2\n\n[a.c.d]\ne = 3", - ], -) -def test_handle_error_duplicate_keys(cfg): - """This would cause very cryptic error when interpreting config. 
- (TypeError: 'X' object does not support item assignment) - """ - with pytest.raises(ConfigValidationError): - Config().from_str(cfg) - - -@pytest.mark.parametrize( - "cfg,is_valid", - [("[a]\nb = 1\n\n[a.c]\nd = 3", True), ("[a]\nb = 1\n\n[A.c]\nd = 2", False)], -) -def test_cant_expand_undefined_block(cfg, is_valid): - """Test that you can't expand a block that hasn't been created yet. This - comes up when you typo a name, and if we allow expansion of undefined blocks, - it's very hard to create good errors for those typos. - """ - if is_valid: - Config().from_str(cfg) - else: - with pytest.raises(ConfigValidationError): - Config().from_str(cfg) - - -def test_fill_config_overrides(): - config = { - "cfg": { - "one": 1, - "two": {"three": {"@cats": "catsie.v1", "evil": True, "cute": False}}, - } - } - overrides = {"cfg.two.three.evil": False} - result = my_registry.fill(config, overrides=overrides, validate=True) - assert result["cfg"]["two"]["three"]["evil"] is False - # Test that promises can be overwritten as well - overrides = {"cfg.two.three": 3} - result = my_registry.fill(config, overrides=overrides, validate=True) - assert result["cfg"]["two"]["three"] == 3 - # Test that value can be overwritten with promises and that the result is - # interpreted and filled correctly - overrides = {"cfg": {"one": {"@cats": "catsie.v1", "evil": False}, "two": None}} - result = my_registry.fill(config, overrides=overrides) - assert result["cfg"]["two"] is None - assert result["cfg"]["one"]["@cats"] == "catsie.v1" - assert result["cfg"]["one"]["evil"] is False - assert result["cfg"]["one"]["cute"] is True - # Overwriting with wrong types should cause validation error - with pytest.raises(ConfigValidationError): - overrides = {"cfg.two.three.evil": 20} - my_registry.fill(config, overrides=overrides, validate=True) - # Overwriting with incomplete promises should cause validation error - with pytest.raises(ConfigValidationError): - overrides = {"cfg": {"one": {"@cats": "catsie.v1"}, "two": None}} - my_registry.fill(config, overrides=overrides) - # Overrides that don't match config should raise error - with pytest.raises(ConfigValidationError): - overrides = {"cfg.two.three.evil": False, "two.four": True} - my_registry.fill(config, overrides=overrides, validate=True) - with pytest.raises(ConfigValidationError): - overrides = {"cfg.five": False} - my_registry.fill(config, overrides=overrides, validate=True) - - -def test_resolve_overrides(): - config = { - "cfg": { - "one": 1, - "two": {"three": {"@cats": "catsie.v1", "evil": True, "cute": False}}, - } - } - overrides = {"cfg.two.three.evil": False} - result = my_registry.resolve(config, overrides=overrides, validate=True) - assert result["cfg"]["two"]["three"] == "meow" - # Test that promises can be overwritten as well - overrides = {"cfg.two.three": 3} - result = my_registry.resolve(config, overrides=overrides, validate=True) - assert result["cfg"]["two"]["three"] == 3 - # Test that value can be overwritten with promises - overrides = {"cfg": {"one": {"@cats": "catsie.v1", "evil": False}, "two": None}} - result = my_registry.resolve(config, overrides=overrides) - assert result["cfg"]["one"] == "meow" - assert result["cfg"]["two"] is None - # Overwriting with wrong types should cause validation error - with pytest.raises(ConfigValidationError): - overrides = {"cfg.two.three.evil": 20} - my_registry.resolve(config, overrides=overrides, validate=True) - # Overwriting with incomplete promises should cause validation error - with 
pytest.raises(ConfigValidationError): - overrides = {"cfg": {"one": {"@cats": "catsie.v1"}, "two": None}} - my_registry.resolve(config, overrides=overrides) - # Overrides that don't match config should raise error - with pytest.raises(ConfigValidationError): - overrides = {"cfg.two.three.evil": False, "cfg.two.four": True} - my_registry.resolve(config, overrides=overrides, validate=True) - with pytest.raises(ConfigValidationError): - overrides = {"cfg.five": False} - my_registry.resolve(config, overrides=overrides, validate=True) - - -@pytest.mark.parametrize( - "prop,expected", - [("a.b.c", True), ("a.b", True), ("a", True), ("a.e", True), ("a.b.c.d", False)], -) -def test_is_in_config(prop, expected): - config = {"a": {"b": {"c": 5, "d": 6}, "e": [1, 2]}} - assert my_registry._is_in_config(prop, config) is expected - - -def test_resolve_prefilled_values(): - class Language(object): - def __init__(self): - ... - - @my_registry.optimizers("prefilled.v1") - def prefilled(nlp: Language, value: int = 10): - return (nlp, value) - - # Passing an instance of Language here via the config is bad, since it - # won't serialize to a string, but we still test for it - config = {"test": {"@optimizers": "prefilled.v1", "nlp": Language(), "value": 50}} - resolved = my_registry.resolve(config, validate=True) - result = resolved["test"] - assert isinstance(result[0], Language) - assert result[1] == 50 - - -def test_fill_config_dict_return_type(): - """Test that a registered function returning a dict is handled correctly.""" - - @my_registry.cats.register("catsie_with_dict.v1") - def catsie_with_dict(evil: StrictBool) -> Dict[str, bool]: - return {"not_evil": not evil} - - config = {"test": {"@cats": "catsie_with_dict.v1", "evil": False}, "foo": 10} - result = my_registry.fill({"cfg": config}, validate=True)["cfg"]["test"] - assert result["evil"] is False - assert "not_evil" not in result - result = my_registry.resolve({"cfg": config}, validate=True)["cfg"]["test"] - assert result["not_evil"] is True - - -def test_deepcopy_config(): - config = Config({"a": 1, "b": {"c": 2, "d": 3}}) - copied = config.copy() - # Same values but not same object - assert config == copied - assert config is not copied - # Check for error if value can't be pickled/deepcopied - config = Config({"a": 1, "b": numpy}) - with pytest.raises(ValueError): - config.copy() - - -def test_config_to_str_simple_promises(): - """Test that references to function registries without arguments are - serialized inline as dict.""" - config_str = """[section]\nsubsection = {"@registry":"value"}""" - config = Config().from_str(config_str) - assert config["section"]["subsection"]["@registry"] == "value" - assert config.to_str() == config_str - - -def test_config_from_str_invalid_section(): - config_str = """[a]\nb = null\n\n[a.b]\nc = 1""" - with pytest.raises(ConfigValidationError): - Config().from_str(config_str) - - config_str = """[a]\nb = null\n\n[a.b.c]\nd = 1""" - with pytest.raises(ConfigValidationError): - Config().from_str(config_str) - - -def test_config_to_str_order(): - """Test that Config.to_str orders the sections.""" - config = {"a": {"b": {"c": 1, "d": 2}, "e": 3}, "f": {"g": {"h": {"i": 4, "j": 5}}}} - expected = ( - "[a]\ne = 3\n\n[a.b]\nc = 1\nd = 2\n\n[f]\n\n[f.g]\n\n[f.g.h]\ni = 4\nj = 5" - ) - config = Config(config) - assert config.to_str() == expected - - -@pytest.mark.parametrize("d", [".", ":"]) -def test_config_interpolation(d): - """Test that config values are interpolated correctly. 
The parametrized - value is the final divider (${a.b} vs. ${a:b}). Both should now work and be - valid. The double {{ }} in the config strings are required to prevent the - references from being interpreted as an actual f-string variable. - """ - c_str = """[a]\nfoo = "hello"\n\n[b]\nbar = ${foo}""" - with pytest.raises(ConfigValidationError): - Config().from_str(c_str) - c_str = f"""[a]\nfoo = "hello"\n\n[b]\nbar = ${{a{d}foo}}""" - assert Config().from_str(c_str)["b"]["bar"] == "hello" - c_str = f"""[a]\nfoo = "hello"\n\n[b]\nbar = ${{a{d}foo}}!""" - assert Config().from_str(c_str)["b"]["bar"] == "hello!" - c_str = f"""[a]\nfoo = "hello"\n\n[b]\nbar = "${{a{d}foo}}!\"""" - assert Config().from_str(c_str)["b"]["bar"] == "hello!" - c_str = f"""[a]\nfoo = 15\n\n[b]\nbar = ${{a{d}foo}}!""" - assert Config().from_str(c_str)["b"]["bar"] == "15!" - c_str = f"""[a]\nfoo = ["x", "y"]\n\n[b]\nbar = ${{a{d}foo}}""" - assert Config().from_str(c_str)["b"]["bar"] == ["x", "y"] - # Interpolation within the same section - c_str = f"""[a]\nfoo = "x"\nbar = ${{a{d}foo}}\nbaz = "${{a{d}foo}}y\"""" - assert Config().from_str(c_str)["a"]["bar"] == "x" - assert Config().from_str(c_str)["a"]["baz"] == "xy" - - -def test_config_interpolation_lists(): - # Test that lists are preserved correctly - c_str = """[a]\nb = 1\n\n[c]\nd = ["hello ${a.b}", "world"]""" - config = Config().from_str(c_str, interpolate=False) - assert config["c"]["d"] == ["hello ${a.b}", "world"] - config = config.interpolate() - assert config["c"]["d"] == ["hello 1", "world"] - c_str = """[a]\nb = 1\n\n[c]\nd = [${a.b}, "hello ${a.b}", "world"]""" - config = Config().from_str(c_str) - assert config["c"]["d"] == [1, "hello 1", "world"] - config = Config().from_str(c_str, interpolate=False) - # NOTE: This currently doesn't work, because we can't know how to JSON-load - # the uninterpolated list [${a.b}]. - # assert config["c"]["d"] == ["${a.b}", "hello ${a.b}", "world"] - # config = config.interpolate() - # assert config["c"]["d"] == [1, "hello 1", "world"] - c_str = """[a]\nb = 1\n\n[c]\nd = ["hello", ${a}]""" - config = Config().from_str(c_str) - assert config["c"]["d"] == ["hello", {"b": 1}] - c_str = """[a]\nb = 1\n\n[c]\nd = ["hello", "hello ${a}"]""" - with pytest.raises(ConfigValidationError): - Config().from_str(c_str) - config_str = """[a]\nb = 1\n\n[c]\nd = ["hello", {"x": ["hello ${a.b}"], "y": 2}]""" - config = Config().from_str(config_str) - assert config["c"]["d"] == ["hello", {"x": ["hello 1"], "y": 2}] - config_str = """[a]\nb = 1\n\n[c]\nd = ["hello", {"x": [${a.b}], "y": 2}]""" - with pytest.raises(ConfigValidationError): - Config().from_str(c_str) - - -@pytest.mark.parametrize("d", [".", ":"]) -def test_config_interpolation_sections(d): - """Test that config sections are interpolated correctly. The parametrized - value is the final divider (${a.b} vs. ${a:b}). Both should now work and be - valid. The double {{ }} in the config strings are required to prevent the - references from being interpreted as an actual f-string variable. 
- """ - # Simple block references - c_str = """[a]\nfoo = "hello"\nbar = "world"\n\n[b]\nc = ${a}""" - config = Config().from_str(c_str) - assert config["b"]["c"] == config["a"] - # References with non-string values - c_str = f"""[a]\nfoo = "hello"\n\n[a.x]\ny = ${{a{d}b}}\n\n[a.b]\nc = 1\nd = [10]""" - config = Config().from_str(c_str) - assert config["a"]["x"]["y"] == config["a"]["b"] - # Multiple references in the same string - c_str = f"""[a]\nx = "string"\ny = 10\n\n[b]\nz = "${{a{d}x}}/${{a{d}y}}\"""" - config = Config().from_str(c_str) - assert config["b"]["z"] == "string/10" - # Non-string references in string (converted to string) - c_str = f"""[a]\nx = ["hello", "world"]\n\n[b]\ny = "result: ${{a{d}x}}\"""" - config = Config().from_str(c_str) - assert config["b"]["y"] == 'result: ["hello", "world"]' - # References to sections referencing sections - c_str = """[a]\nfoo = "x"\n\n[b]\nbar = ${a}\n\n[c]\nbaz = ${b}""" - config = Config().from_str(c_str) - assert config["b"]["bar"] == config["a"] - assert config["c"]["baz"] == config["b"] - # References to section values referencing other sections - c_str = f"""[a]\nfoo = "x"\n\n[b]\nbar = ${{a}}\n\n[c]\nbaz = ${{b{d}bar}}""" - config = Config().from_str(c_str) - assert config["c"]["baz"] == config["b"]["bar"] - # References to sections with subsections - c_str = """[a]\nfoo = "x"\n\n[a.b]\nbar = 100\n\n[c]\nbaz = ${a}""" - config = Config().from_str(c_str) - assert config["c"]["baz"] == config["a"] - # Infinite recursion - c_str = """[a]\nfoo ="x"\n\n[a.b]\nbar = ${a}""" - config = Config().from_str(c_str) - assert config["a"]["b"]["bar"] == config["a"] - c_str = f"""[a]\nfoo = "x"\n\n[b]\nbar = ${{a}}\n\n[c]\nbaz = ${{b.bar{d}foo}}""" - # We can't reference not-yet interpolated subsections - with pytest.raises(ConfigValidationError): - Config().from_str(c_str) - # Generally invalid references - c_str = f"""[a]\nfoo = ${{b{d}bar}}""" - with pytest.raises(ConfigValidationError): - Config().from_str(c_str) - # We can't reference sections or promises within strings - c_str = """[a]\n\n[a.b]\nfoo = "x: ${c}"\n\n[c]\nbar = 1\nbaz = 2""" - with pytest.raises(ConfigValidationError): - Config().from_str(c_str) - - -def test_config_from_str_overrides(): - config_str = """[a]\nb = 1\n\n[a.c]\nd = 2\ne = 3\n\n[f]\ng = {"x": "y"}""" - # Basic value substitution - overrides = {"a.b": 10, "a.c.d": 20} - config = Config().from_str(config_str, overrides=overrides) - assert config["a"]["b"] == 10 - assert config["a"]["c"]["d"] == 20 - assert config["a"]["c"]["e"] == 3 - # Valid values that previously weren't in config - config = Config().from_str(config_str, overrides={"a.c.f": 100}) - assert config["a"]["c"]["d"] == 2 - assert config["a"]["c"]["e"] == 3 - assert config["a"]["c"]["f"] == 100 - # Invalid keys and sections - with pytest.raises(ConfigValidationError): - Config().from_str(config_str, overrides={"f": 10}) - # This currently isn't expected to work, because the dict in f.g is not - # interpreted as a section while the config is still just the configparser - with pytest.raises(ConfigValidationError): - Config().from_str(config_str, overrides={"f.g.x": "z"}) - # With variables (values) - config_str = """[a]\nb = 1\n\n[a.c]\nd = 2\ne = ${a:b}""" - config = Config().from_str(config_str, overrides={"a.b": 10}) - assert config["a"]["b"] == 10 - assert config["a"]["c"]["e"] == 10 - # With variables (sections) - config_str = """[a]\nb = 1\n\n[a.c]\nd = 2\n[e]\nf = ${a.c}""" - config = Config().from_str(config_str, overrides={"a.c.d": 20}) - assert 
config["a"]["c"]["d"] == 20 - assert config["e"]["f"] == {"d": 20} - - -def test_config_reserved_aliases(): - """Test that the auto-generated pydantic schemas auto-alias reserved - attributes like "validate" that would otherwise cause NameError.""" - - @my_registry.cats("catsie.with_alias") - def catsie_with_alias(validate: StrictBool = False): - return validate - - cfg = {"@cats": "catsie.with_alias", "validate": True} - resolved = my_registry.resolve({"test": cfg}) - filled = my_registry.fill({"test": cfg}) - assert resolved["test"] is True - assert filled["test"] == cfg - cfg = {"@cats": "catsie.with_alias", "validate": 20} - with pytest.raises(ConfigValidationError): - my_registry.resolve({"test": cfg}) - - -@pytest.mark.parametrize("d", [".", ":"]) -def test_config_no_interpolation(d): - """Test that interpolation is correctly preserved. The parametrized - value is the final divider (${a.b} vs. ${a:b}). Both should now work and be - valid. The double {{ }} in the config strings are required to prevent the - references from being interpreted as an actual f-string variable. - """ - c_str = f"""[a]\nb = 1\n\n[c]\nd = ${{a{d}b}}\ne = \"hello${{a{d}b}}"\nf = ${{a}}""" - config = Config().from_str(c_str, interpolate=False) - assert not config.is_interpolated - assert config["c"]["d"] == f"${{a{d}b}}" - assert config["c"]["e"] == f'"hello${{a{d}b}}"' - assert config["c"]["f"] == "${a}" - config2 = Config().from_str(config.to_str(), interpolate=True) - assert config2.is_interpolated - assert config2["c"]["d"] == 1 - assert config2["c"]["e"] == "hello1" - assert config2["c"]["f"] == {"b": 1} - config3 = config.interpolate() - assert config3.is_interpolated - assert config3["c"]["d"] == 1 - assert config3["c"]["e"] == "hello1" - assert config3["c"]["f"] == {"b": 1} - # Bad non-serializable value - cfg = {"x": {"y": numpy.asarray([[1, 2], [4, 5]], dtype="f"), "z": f"${{x{d}y}}"}} - with pytest.raises(ConfigValidationError): - Config(cfg).interpolate() - - -def test_config_no_interpolation_registry(): - config_str = """[a]\nbad = true\n[b]\n@cats = "catsie.v1"\nevil = ${a:bad}\n\n[c]\n d = ${b}""" - config = Config().from_str(config_str, interpolate=False) - assert not config.is_interpolated - assert config["b"]["evil"] == "${a:bad}" - assert config["c"]["d"] == "${b}" - filled = my_registry.fill(config) - resolved = my_registry.resolve(config) - assert resolved["b"] == "scratch!" - assert resolved["c"]["d"] == "scratch!" - assert filled["b"]["evil"] == "${a:bad}" - assert filled["b"]["cute"] is True - assert filled["c"]["d"] == "${b}" - interpolated = filled.interpolate() - assert interpolated.is_interpolated - assert interpolated["b"]["evil"] is True - assert interpolated["c"]["d"] == interpolated["b"] - config = Config().from_str(config_str, interpolate=True) - assert config.is_interpolated - filled = my_registry.fill(config) - resolved = my_registry.resolve(config) - assert resolved["b"] == "scratch!" - assert resolved["c"]["d"] == "scratch!" - assert filled["b"]["evil"] is True - assert filled["c"]["d"] == filled["b"] - # Resolving a non-interpolated filled config - config = Config().from_str(config_str, interpolate=False) - assert not config.is_interpolated - filled = my_registry.fill(config) - assert not filled.is_interpolated - assert filled["c"]["d"] == "${b}" - resolved = my_registry.resolve(filled) - assert resolved["c"]["d"] == "scratch!" 
- - -def test_config_deep_merge(): - config = {"a": "hello", "b": {"c": "d"}} - defaults = {"a": "world", "b": {"c": "e", "f": "g"}} - merged = Config(defaults).merge(config) - assert len(merged) == 2 - assert merged["a"] == "hello" - assert merged["b"] == {"c": "d", "f": "g"} - config = {"a": "hello", "b": {"@test": "x", "foo": 1}} - defaults = {"a": "world", "b": {"@test": "x", "foo": 100, "bar": 2}, "c": 100} - merged = Config(defaults).merge(config) - assert len(merged) == 3 - assert merged["a"] == "hello" - assert merged["b"] == {"@test": "x", "foo": 1, "bar": 2} - assert merged["c"] == 100 - config = {"a": "hello", "b": {"@test": "x", "foo": 1}, "c": 100} - defaults = {"a": "world", "b": {"@test": "y", "foo": 100, "bar": 2}} - merged = Config(defaults).merge(config) - assert len(merged) == 3 - assert merged["a"] == "hello" - assert merged["b"] == {"@test": "x", "foo": 1} - assert merged["c"] == 100 - # Test that leaving out the factory just adds to existing - config = {"a": "hello", "b": {"foo": 1}, "c": 100} - defaults = {"a": "world", "b": {"@test": "y", "foo": 100, "bar": 2}} - merged = Config(defaults).merge(config) - assert len(merged) == 3 - assert merged["a"] == "hello" - assert merged["b"] == {"@test": "y", "foo": 1, "bar": 2} - assert merged["c"] == 100 - # Test that switching to a different factory prevents the default from being added - config = {"a": "hello", "b": {"@foo": 1}, "c": 100} - defaults = {"a": "world", "b": {"@bar": "y"}} - merged = Config(defaults).merge(config) - assert len(merged) == 3 - assert merged["a"] == "hello" - assert merged["b"] == {"@foo": 1} - assert merged["c"] == 100 - config = {"a": "hello", "b": {"@foo": 1}, "c": 100} - defaults = {"a": "world", "b": "y"} - merged = Config(defaults).merge(config) - assert len(merged) == 3 - assert merged["a"] == "hello" - assert merged["b"] == {"@foo": 1} - assert merged["c"] == 100 - - -def test_config_deep_merge_variables(): - config_str = """[a]\nb= 1\nc = 2\n\n[d]\ne = ${a:b}""" - defaults_str = """[a]\nx = 100\n\n[d]\ny = 500""" - config = Config().from_str(config_str, interpolate=False) - defaults = Config().from_str(defaults_str) - merged = defaults.merge(config) - assert merged["a"] == {"b": 1, "c": 2, "x": 100} - assert merged["d"] == {"e": "${a:b}", "y": 500} - assert merged.interpolate()["d"] == {"e": 1, "y": 500} - # With variable in defaults: overwritten by new value - config = Config().from_str("""[a]\nb= 1\nc = 2""") - defaults = Config().from_str("""[a]\nb = 100\nc = ${a:b}""", interpolate=False) - merged = defaults.merge(config) - assert merged["a"]["c"] == 2 - - -def test_config_to_str_roundtrip(): - cfg = {"cfg": {"foo": False}} - config_str = Config(cfg).to_str() - assert config_str == "[cfg]\nfoo = false" - config = Config().from_str(config_str) - assert dict(config) == cfg - cfg = {"cfg": {"foo": "false"}} - config_str = Config(cfg).to_str() - assert config_str == '[cfg]\nfoo = "false"' - config = Config().from_str(config_str) - assert dict(config) == cfg - # Bad non-serializable value - cfg = {"cfg": {"x": numpy.asarray([[1, 2, 3, 4], [4, 5, 3, 4]], dtype="f")}} - config = Config(cfg) - with pytest.raises(ConfigValidationError): - config.to_str() - # Roundtrip with variables: preserve variables correctly (quoted/unquoted) - config_str = """[a]\nb = 1\n\n[c]\nd = ${a:b}\ne = \"hello${a:b}"\nf = "${a:b}\"""" - config = Config().from_str(config_str, interpolate=False) - assert config.to_str() == config_str - - -def test_config_is_interpolated(): - """Test that a config object correctly 
reports whether it's interpolated.""" - config_str = """[a]\nb = 1\n\n[c]\nd = ${a:b}\ne = \"hello${a:b}"\nf = ${a}""" - config = Config().from_str(config_str, interpolate=False) - assert not config.is_interpolated - config = config.merge(Config({"x": {"y": "z"}})) - assert not config.is_interpolated - config = Config(config) - assert not config.is_interpolated - config = config.interpolate() - assert config.is_interpolated - config = config.merge(Config().from_str(config_str, interpolate=False)) - assert not config.is_interpolated - - -@pytest.mark.parametrize( - "section_order,expected_str,expected_keys", - [ - # fmt: off - ([], "[a]\nb = 1\nc = 2\n\n[a.d]\ne = 3\n\n[a.f]\ng = 4\n\n[h]\ni = 5\n\n[j]\nk = 6", ["a", "h", "j"]), - (["j", "h", "a"], "[j]\nk = 6\n\n[h]\ni = 5\n\n[a]\nb = 1\nc = 2\n\n[a.d]\ne = 3\n\n[a.f]\ng = 4", ["j", "h", "a"]), - (["h"], "[h]\ni = 5\n\n[a]\nb = 1\nc = 2\n\n[a.d]\ne = 3\n\n[a.f]\ng = 4\n\n[j]\nk = 6", ["h", "a", "j"]) - # fmt: on - ], -) -def test_config_serialize_custom_sort(section_order, expected_str, expected_keys): - cfg = { - "j": {"k": 6}, - "a": {"b": 1, "d": {"e": 3}, "c": 2, "f": {"g": 4}}, - "h": {"i": 5}, - } - cfg_str = Config(cfg).to_str() - assert Config(cfg, section_order=section_order).to_str() == expected_str - keys = list(Config(section_order=section_order).from_str(cfg_str).keys()) - assert keys == expected_keys - keys = list(Config(cfg, section_order=section_order).keys()) - assert keys == expected_keys - - -def test_config_custom_sort_preserve(): - """Test that sort order is preserved when merging and copying configs, - or when configs are filled and resolved.""" - cfg = {"x": {}, "y": {}, "z": {}} - section_order = ["y", "z", "x"] - expected = "[y]\n\n[z]\n\n[x]" - config = Config(cfg, section_order=section_order) - assert config.to_str() == expected - config2 = config.copy() - assert config2.to_str() == expected - config3 = config.merge({"a": {}}) - assert config3.to_str() == f"{expected}\n\n[a]" - config4 = Config(config) - assert config4.to_str() == expected - config_str = """[a]\nb = 1\n[c]\n@cats = "catsie.v1"\nevil = true\n\n[t]\n x = 2""" - section_order = ["c", "a", "t"] - config5 = Config(section_order=section_order).from_str(config_str) - assert list(config5.keys()) == section_order - filled = my_registry.fill(config5) - assert filled.section_order == section_order - - -def test_config_pickle(): - config = Config({"foo": "bar"}, section_order=["foo", "bar", "baz"]) - data = pickle.dumps(config) - config_new = pickle.loads(data) - assert config_new == {"foo": "bar"} - assert config_new.section_order == ["foo", "bar", "baz"] - - -def test_config_fill_extra_fields(): - """Test that filling a config from a schema removes extra fields.""" - - class TestSchemaContent(BaseModel): - a: str - b: int - - class Config: - extra = "forbid" - - class TestSchema(BaseModel): - cfg: TestSchemaContent - - config = Config({"cfg": {"a": "1", "b": 2, "c": True}}) - with pytest.raises(ConfigValidationError): - my_registry.fill(config, schema=TestSchema) - filled = my_registry.fill(config, schema=TestSchema, validate=False)["cfg"] - assert filled == {"a": "1", "b": 2} - config2 = config.interpolate() - filled = my_registry.fill(config2, schema=TestSchema, validate=False)["cfg"] - assert filled == {"a": "1", "b": 2} - config3 = Config({"cfg": {"a": "1", "b": 2, "c": True}}, is_interpolated=False) - filled = my_registry.fill(config3, schema=TestSchema, validate=False)["cfg"] - assert filled == {"a": "1", "b": 2} - - class 
TestSchemaContent2(BaseModel): - a: str - b: int - - class Config: - extra = "allow" - - class TestSchema2(BaseModel): - cfg: TestSchemaContent2 - - filled = my_registry.fill(config, schema=TestSchema2, validate=False)["cfg"] - assert filled == {"a": "1", "b": 2, "c": True} - - -def test_config_validation_error_custom(): - class Schema(BaseModel): - hello: int - world: int - - config = {"hello": 1, "world": "hi!"} - with pytest.raises(ConfigValidationError) as exc_info: - my_registry._fill(config, Schema) - e1 = exc_info.value - assert e1.title == "Config validation error" - assert e1.desc is None - assert not e1.parent - assert e1.show_config is True - assert len(e1.errors) == 1 - assert e1.errors[0]["loc"] == ("world",) - assert e1.errors[0]["msg"] == "value is not a valid integer" - assert e1.errors[0]["type"] == "type_error.integer" - assert e1.error_types == set(["type_error.integer"]) - # Create a new error with overrides - title = "Custom error" - desc = "Some error description here" - e2 = ConfigValidationError.from_error(e1, title=title, desc=desc, show_config=False) - assert e2.errors == e1.errors - assert e2.error_types == e1.error_types - assert e2.title == title - assert e2.desc == desc - assert e2.show_config is False - assert e1.text != e2.text - - -def test_config_parsing_error(): - config_str = "[a]\nb c" - with pytest.raises(ConfigValidationError): - Config().from_str(config_str) - - -def test_config_fill_without_resolve(): - class BaseSchema(BaseModel): - catsie: int - - config = {"catsie": {"@cats": "catsie.v1", "evil": False}} - filled = my_registry.fill(config) - resolved = my_registry.resolve(config) - assert resolved["catsie"] == "meow" - assert filled["catsie"]["cute"] is True - with pytest.raises(ConfigValidationError): - my_registry.resolve(config, schema=BaseSchema) - filled2 = my_registry.fill(config, schema=BaseSchema) - assert filled2["catsie"]["cute"] is True - resolved = my_registry.resolve(filled2) - assert resolved["catsie"] == "meow" - # With unavailable function - class BaseSchema2(BaseModel): - catsie: Any - other: int = 12 - - config = {"catsie": {"@cats": "dog", "evil": False}} - filled3 = my_registry.fill(config, schema=BaseSchema2) - assert filled3["catsie"] == config["catsie"] - assert filled3["other"] == 12 - - -def test_config_dataclasses(): - @my_registry.cats("catsie.ragged") - def catsie_ragged(arg: Ragged): - return arg - - data = numpy.zeros((20, 4), dtype="f") - lengths = numpy.array([4, 2, 8, 1, 4], dtype="i") - ragged = Ragged(data, lengths) - config = {"cfg": {"@cats": "catsie.ragged", "arg": ragged}} - result = my_registry.resolve(config)["cfg"] - assert isinstance(result, Ragged) - assert list(result._get_starts_ends()) == [0, 4, 6, 14, 15, 19] - - -@pytest.mark.parametrize( - "greeting,value,expected", - [ - # simple substitution should go fine - [342, "${vars.a}", int], - ["342", "${vars.a}", str], - ["everyone", "${vars.a}", str], - ], -) -def test_config_interpolates(greeting, value, expected): - str_cfg = f""" - [project] - my_par = {value} - - [vars] - a = "something" - """ - overrides = {"vars.a": greeting} - cfg = Config().from_str(str_cfg, overrides=overrides) - assert type(cfg["project"]["my_par"]) == expected - - -@pytest.mark.parametrize( - "greeting,value,expected", - [ - # fmt: off - # simple substitution should go fine - ["hello 342", "${vars.a}", "hello 342"], - ["hello everyone", "${vars.a}", "hello everyone"], - ["hello tout le monde", "${vars.a}", "hello tout le monde"], - ["hello 42", "${vars.a}", "hello 42"], - # 
substituting an element in a list - ["hello 342", "[1, ${vars.a}, 3]", "hello 342"], - ["hello everyone", "[1, ${vars.a}, 3]", "hello everyone"], - ["hello tout le monde", "[1, ${vars.a}, 3]", "hello tout le monde"], - ["hello 42", "[1, ${vars.a}, 3]", "hello 42"], - # substituting part of a string - [342, "hello ${vars.a}", "hello 342"], - ["everyone", "hello ${vars.a}", "hello everyone"], - ["tout le monde", "hello ${vars.a}", "hello tout le monde"], - pytest.param("42", "hello ${vars.a}", "hello 42", marks=pytest.mark.xfail), - # substituting part of a implicit string inside a list - [342, "[1, hello ${vars.a}, 3]", "hello 342"], - ["everyone", "[1, hello ${vars.a}, 3]", "hello everyone"], - ["tout le monde", "[1, hello ${vars.a}, 3]", "hello tout le monde"], - pytest.param("42", "[1, hello ${vars.a}, 3]", "hello 42", marks=pytest.mark.xfail), - # substituting part of a explicit string inside a list - [342, "[1, 'hello ${vars.a}', '3']", "hello 342"], - ["everyone", "[1, 'hello ${vars.a}', '3']", "hello everyone"], - ["tout le monde", "[1, 'hello ${vars.a}', '3']", "hello tout le monde"], - pytest.param("42", "[1, 'hello ${vars.a}', '3']", "hello 42", marks=pytest.mark.xfail), - # more complicated example - [342, "[{'name':'x','script':['hello ${vars.a}']}]", "hello 342"], - ["everyone", "[{'name':'x','script':['hello ${vars.a}']}]", "hello everyone"], - ["tout le monde", "[{'name':'x','script':['hello ${vars.a}']}]", "hello tout le monde"], - pytest.param("42", "[{'name':'x','script':['hello ${vars.a}']}]", "hello 42", marks=pytest.mark.xfail), - # fmt: on - ], -) -def test_config_overrides(greeting, value, expected): - str_cfg = f""" - [project] - commands = {value} - - [vars] - a = "world" - """ - overrides = {"vars.a": greeting} - assert "${vars.a}" in str_cfg - cfg = Config().from_str(str_cfg, overrides=overrides) - assert expected in str(cfg) - - def test_arg_order_is_preserved(): str_cfg = """ [model] diff --git a/website/docs/usage-config.md b/website/docs/usage-config.md index abb6951e4..73a1638ac 100644 --- a/website/docs/usage-config.md +++ b/website/docs/usage-config.md @@ -12,15 +12,16 @@ And then once those settings are added, they become hard to remove later. Default values also become hard to change without breaking backwards compatibility. -To solve this problem, Thinc provides a config system that lets you easily -describe **arbitrary trees of objects**. The objects can be created via -**function calls you register** using a simple decorator syntax. You can even -version the functions you create, allowing you to make improvements without -breaking backwards compatibility. The most similar config system we're aware of -is [Gin](https://github.com/google/gin-config), which uses a similar syntax, and -also allows you to link the configuration system to functions in your code using -a decorator. Thinc's config system is simpler and emphasizes a different -workflow via a subset of Gin's functionality. +To solve this problem, Thinc leverages +[confection](https://github.com/explosion/confection) - a config system that +lets you easily describe **arbitrary trees of objects**. The objects can be +created via **function calls you register** using a simple decorator syntax. You +can even version the functions you create, allowing you to make improvements +without breaking backwards compatibility. 
The most similar config system we're +aware of is [Gin](https://github.com/google/gin-config), which uses a similar +syntax, and also allows you to link the configuration system to functions in +your code using a decorator. Thinc's config system is simpler and emphasizes a +different workflow via a subset of Gin's functionality. @@ -654,11 +655,11 @@ resolved = registry.resolve( The main motivation for Thinc's configuration system was to eliminate hidden defaults and ensure that config settings are passed around consistently. This also means that config files should always define **all available settings**. -The [`registry.fill`](/docs/api-config#registry-fill) method also -resolves the config, but it leaves references to registered functions intact and -doesn't replace them with their return values. If type annotations and/or a base -schema are available, they will be used to parse the config and fill in any -missing values and defaults to create an up-to-date "master config". +The [`registry.fill`](/docs/api-config#registry-fill) method also resolves the +config, but it leaves references to registered functions intact and doesn't +replace them with their return values. If type annotations and/or a base schema +are available, they will be used to parse the config and fill in any missing +values and defaults to create an up-to-date "master config". Let's say you've updated your schema and scripts to use two additional optional settings. These settings should also be reflected in your config files so they @@ -677,9 +678,9 @@ class TrainingSchema(BaseModel): max_epochs: StrictInt = 100 ``` -Calling [`registry.fill`](/docs/api-config#registry-fill) with your -existing config will produce an updated version of it including the new settings -and their defaults: +Calling [`registry.fill`](/docs/api-config#registry-fill) with your existing +config will produce an updated version of it including the new settings and +their defaults: From eda4c75771667aee114e640b9c1e4021a514a275 Mon Sep 17 00:00:00 2001 From: Madeesh Kannan Date: Mon, 29 Aug 2022 14:58:30 +0200 Subject: [PATCH 60/78] `PyTorchGradScaler`: Cache `_found_inf` on the CPU (#746) * `PyTorchGradScaler`: Cache `_found_inf` on the CPU This prevents unnecessary overhead from launching kernels on the GPU in hot backward passes. * Only pin `_found_inf` to the CPU * Always store `_found_inf` as a `bool` --- thinc/shims/pytorch_grad_scaler.py | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/thinc/shims/pytorch_grad_scaler.py b/thinc/shims/pytorch_grad_scaler.py index 999ed0047..8db11bcae 100644 --- a/thinc/shims/pytorch_grad_scaler.py +++ b/thinc/shims/pytorch_grad_scaler.py @@ -51,12 +51,11 @@ def __init__( self._backoff_factor = backoff_factor self._growth_interval = growth_interval - self._found_inf = torch.full((1,), 0.0) self._growth_tracker = torch.full((1,), 0, dtype=torch.int) self._scale = torch.full((1,), init_scale) + self._found_inf = False def to_(self, device): - self._found_inf = self._found_inf.to(device) self._growth_tracker = self._growth_tracker.to(device) self._scale = self._scale.to(device) @@ -132,7 +131,7 @@ def _tensors_per_device(self, tensors): @property def found_inf(self): - return bool(self._found_inf) != 0 + return self._found_inf def unscale(self, tensors): """Unscale the given tensors. 
Returns True if any of the gradients were infinite.""" @@ -152,9 +151,10 @@ def unscale(self, tensors): device_tensors, found_inf_device, inv_scale_device ) - self._found_inf += found_inf_device.to(self._found_inf.device) + if bool(found_inf_device != 0): + self._found_inf = True - return bool(self._found_inf != 0) + return self._found_inf def update(self): """ @@ -165,14 +165,17 @@ def update(self): if not self._enabled: return + found_inf_device = torch.full( + (1,), 1.0 if self._found_inf else 0.0, device=self._scale.device + ) torch._amp_update_scale_( self._scale, self._growth_tracker, - self._found_inf, + found_inf_device, self._growth_factor, self._backoff_factor, self._growth_interval, ) # Clear infinity found status - self._found_inf = torch.zeros_like(self._found_inf) + self._found_inf = False From a7bbc48bba679fd2b14442eebd19369885fa7305 Mon Sep 17 00:00:00 2001 From: kadarakos Date: Fri, 2 Sep 2022 14:07:21 +0200 Subject: [PATCH 61/78] More general remap_ids (#726) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * work with cupy arrays and 2d arrays * force mypy pass * addressing comments * return correct shape empty array * test remap_ids with Ints2d * Update thinc/layers/remap_ids.py Co-authored-by: Daniël de Kok * use numpy array * remove cupy import * mini fix * more strict typing * adjust test * Update thinc/layers/remap_ids.py Co-authored-by: Adriane Boyd * remove check * Update thinc/layers/remap_ids.py Co-authored-by: Adriane Boyd * address reviews * Update thinc/layers/remap_ids.py Co-authored-by: Adriane Boyd * simplify casting * Update thinc/layers/remap_ids.py Co-authored-by: Adriane Boyd * Update thinc/layers/remap_ids.py Co-authored-by: Adriane Boyd * remap_ids legacy * legacy * test version 1 and 2 * rename legacy to v1 * adding old test back * remap_ids docs update * Update website/docs/api-layers.md Co-authored-by: Adriane Boyd * Update website/docs/api-layers.md Co-authored-by: Adriane Boyd * make init/forward attribute setting more clear * Update website/docs/api-layers.md Co-authored-by: Adriane Boyd * Update website/docs/api-layers.md Co-authored-by: Adriane Boyd * Update website/docs/api-layers.md Co-authored-by: Adriane Boyd * prettier * update model type * prettier * Use new _v2 instead of renamed _v1 Co-authored-by: Daniël de Kok Co-authored-by: Adriane Boyd --- thinc/layers/__init__.py | 2 +- thinc/layers/remap_ids.py | 70 ++++++++++++++++++++++++--- thinc/tests/layers/test_layers_api.py | 3 +- website/docs/api-layers.md | 29 ++++++----- 4 files changed, 84 insertions(+), 20 deletions(-) diff --git a/thinc/layers/__init__.py b/thinc/layers/__init__.py index 19fee1329..3a81851b4 100644 --- a/thinc/layers/__init__.py +++ b/thinc/layers/__init__.py @@ -59,7 +59,7 @@ from .list2padded import list2padded from .ragged2list import ragged2list from .padded2list import padded2list -from .remap_ids import remap_ids +from .remap_ids import remap_ids, remap_ids_v2 from .strings2arrays import strings2arrays from .with_array import with_array from .with_array2d import with_array2d diff --git a/thinc/layers/remap_ids.py b/thinc/layers/remap_ids.py index f0f010acb..265b24a9d 100644 --- a/thinc/layers/remap_ids.py +++ b/thinc/layers/remap_ids.py @@ -1,18 +1,23 @@ -from typing import Tuple, Callable, Sequence, Dict, Any +from typing import Tuple, Callable, Sequence, cast +from typing import Dict, Union, Optional, Hashable, Any from ..model import Model from ..config import registry -from ..types import Ints2d, DTypes +from 
..types import Ints1d, Ints2d, DTypes +from ..util import is_xp_array, to_numpy -InT = Sequence[Any] +InT = Union[Sequence[Hashable], Ints1d, Ints2d] OutT = Ints2d +InT_v1 = Sequence[Any] +OutT_v1 = Ints2d + @registry.layers("remap_ids.v1") def remap_ids( mapping_table: Dict[Any, int] = {}, default: int = 0, dtype: DTypes = "i" -) -> Model[InT, OutT]: +) -> Model[InT_v1, OutT_v1]: """Remap string or integer inputs using a mapping table, usually as a preprocess before embeddings. The mapping table can be passed in on input, or updated after the layer has been created. The mapping table is stored in @@ -26,7 +31,7 @@ def remap_ids( def forward( - model: Model[InT, OutT], inputs: InT, is_train: bool + model: Model[InT_v1, OutT_v1], inputs: InT_v1, is_train: bool ) -> Tuple[OutT, Callable]: table = model.attrs["mapping_table"] default = model.attrs["default"] @@ -35,7 +40,60 @@ def forward( arr = model.ops.asarray2i(values, dtype=dtype) output = model.ops.reshape2i(arr, -1, 1) - def backprop(dY: OutT) -> InT: + def backprop(dY: OutT_v1) -> InT: return [] return output, backprop + + +@registry.layers("remap_ids.v2") +def remap_ids_v2( + mapping_table: Optional[Union[Dict[int, int], Dict[str, int]]] = None, + default: int = 0, + *, + column: Optional[int] = None +) -> Model[InT, OutT]: + """Remap string or integer inputs using a mapping table, + usually as a preprocessing step before embeddings. + The mapping table can be passed in on input, + or updated after the layer has been created. + The mapping table is stored in the "mapping_table" attribute. + Two dimensional arrays can be provided as input in which case + the 'column' chooses which column to process. This is useful + to work together with FeatureExtractor in spaCy. + """ + return Model( + "remap_ids", + forward_v2, + attrs={"mapping_table": mapping_table, "default": default, "column": column}, + ) + + +def forward_v2( + model: Model[InT, OutT], inputs: InT, is_train: bool +) -> Tuple[OutT, Callable]: + table = model.attrs["mapping_table"] + if table is None: + raise ValueError("'mapping table' not set") + default = model.attrs["default"] + column = model.attrs["column"] + if is_xp_array(inputs): + xp_input = True + if column is not None: + idx = to_numpy(cast(Ints2d, inputs)[:, column]) + else: + idx = to_numpy(inputs) + else: + xp_input = False + idx = inputs + values = [table.get(x, default) for x in idx] + arr = model.ops.asarray2i(values, dtype="i") + output = model.ops.reshape2i(arr, -1, 1) + + def backprop(dY: OutT) -> InT: + if xp_input: + return model.ops.xp.empty(dY.shape) # type: ignore + else: + return [] + + return output, backprop diff --git a/thinc/tests/layers/test_layers_api.py b/thinc/tests/layers/test_layers_api.py index 3ebeb470a..c6e40c3fa 100644 --- a/thinc/tests/layers/test_layers_api.py +++ b/thinc/tests/layers/test_layers_api.py @@ -128,7 +128,8 @@ def assert_data_match(Y, out_data): # ("CauchySimilarity.v1", {}, (array2d, array2d), array1d), ("ParametricAttention.v1", {}, ragged, ragged), ("SparseLinear.v1", {}, (numpy.asarray([1, 2, 3], dtype="uint64"), array1d, numpy.asarray([1, 1], dtype="i")), array2d), - ("remap_ids.v1", {"dtype": "f"}, ["a", 1, 5.0], array2dint) + ("remap_ids.v1", {"dtype": "f"}, ["a", 1, 5.0], array2dint), + ("remap_ids.v2", {"mapping_table": {}, "column": 1}, numpy.array([[1, 2, 3], [4, 5, 6]]).T, array2dint) # fmt: on ] diff --git a/website/docs/api-layers.md b/website/docs/api-layers.md index b1e72f7d7..e204c4c46 100644 --- a/website/docs/api-layers.md +++ b/website/docs/api-layers.md 
@@ -1267,22 +1267,27 @@ https://github.com/explosion/thinc/blob/master/thinc/layers/padded2list.py -- **Input:** Sequence[Any] +- **Input:** Union[Sequence[Hashable], Ints1d, Ints2d] - **Output:** Ints2d -Remap string or integer inputs using a mapping table, usually as a preprocess -before embeddings. The mapping table can be passed in on input, or updated after -the layer has been created. The mapping table is stored in the `"mapping_table"` -attribute. - -| Argument | Type | Description | -| --------------- | ------------------------------------- | ------------------------------------------------------------------------------------------------------------ | -| `mapping_table` | Dict[Any, int] | The mapping table to use. Can also be set after initialization by writing to `model.attrs["mapping_table"]`. | -| `default` | int | The default value if the input does not have an entry in the mapping table. | -| `dtype` | DTypes | The data type of the array. | -| **RETURNS** | Model[Sequence[Any], Ints2d] | The layer to compute the transformation. | +Remap a sequence of strings, integers or other hashable inputs using a mapping +table, usually as a preprocessing step before embeddings. The input can also be +a two dimensional integer array in which case the `column` attribute tells the +`remap_ids` layer which column of the array to map with the `mapping_table`. +Both attributes can be passed on initialization, but since the layer is designed +to retrieve them from `model.attrs` during `forward`, they can be set any time +before calling `forward`. This means that they can also be changed between +calls. Before calling `forward` the `mapping_table` has to be set and for 2D +inputs the `column` is also required. + +| Argument | Type | Description | +| --------------- | ----------------------------------------------------------------- | ------------------------------------------------------------------------------------------------------------ | +| `mapping_table` | Dict[Any, int] | The mapping table to use. Can also be set after initialization by writing to `model.attrs["mapping_table"]`. | +| `default` | int | The default value if the input does not have an entry in the mapping table. | +| `column` | int | The column to apply the mapper to in case of 2D input. | +| **RETURNS** | Model[Union[Sequence[Hashable], Ints1d, Ints2d], Ints2d] | The layer to compute the transformation. 
| ```python https://github.com/explosion/thinc/blob/master/thinc/layers/remap_ids.py From 102d65458700e3df258694ca9399049a05e7fed3 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" <41898282+github-actions[bot]@users.noreply.github.com> Date: Mon, 5 Sep 2022 10:23:57 +0200 Subject: [PATCH 62/78] Auto-format code with black (#753) Co-authored-by: explosion-bot --- thinc/tests/test_config.py | 1 - 1 file changed, 1 deletion(-) diff --git a/thinc/tests/test_config.py b/thinc/tests/test_config.py index fcba87f4b..0dceadfc4 100644 --- a/thinc/tests/test_config.py +++ b/thinc/tests/test_config.py @@ -135,7 +135,6 @@ def catsie_v2(evil: StrictBool, cute: bool = True, cute_level: int = 1) -> str: worst_catsie = {"@cats": "catsie.v1", "evil": True, "cute": False} - def test_make_config_positional_args_dicts(): cfg = { "hyper_params": {"n_hidden": 512, "dropout": 0.2, "learn_rate": 0.001}, From fba3bf02d6f7aacdfbba4850a9e1d07ce08fa14b Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Tue, 6 Sep 2022 13:31:43 +0200 Subject: [PATCH 63/78] Switch to macos-latest (#755) --- azure-pipelines.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/azure-pipelines.yml b/azure-pipelines.yml index 08d5ed93c..c81c62689 100644 --- a/azure-pipelines.yml +++ b/azure-pipelines.yml @@ -23,7 +23,7 @@ jobs: imageName: 'windows-2019' python.version: '3.6' Python37Mac: - imageName: 'macos-10.15' + imageName: 'macos-latest' python.version: '3.7' Python38Linux: imageName: 'ubuntu-latest' From fc323e177a6edd70de60b56b581addf2aa3e9371 Mon Sep 17 00:00:00 2001 From: Madeesh Kannan Date: Wed, 7 Sep 2022 18:00:08 +0200 Subject: [PATCH 64/78] `util`: Explicitly call `__dlpack__` built-in method in `xp2tensorflow` (#757) `tf.experimental.dlpack.from_dlpack` expects a `PyCapsule` object. --- thinc/util.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/thinc/util.py b/thinc/util.py index 7add0aaf7..43fb115d2 100644 --- a/thinc/util.py +++ b/thinc/util.py @@ -404,7 +404,8 @@ def xp2tensorflow( dlpack_tensor = xp_tensor.toDlpack() # type: ignore tf_tensor = tf.experimental.dlpack.from_dlpack(dlpack_tensor) elif hasattr(xp_tensor, "__dlpack__"): - tf_tensor = tf.experimental.dlpack.from_dlpack(xp_tensor) + dlpack_tensor = xp_tensor.__dlpack__() # type: ignore + tf_tensor = tf.experimental.dlpack.from_dlpack(dlpack_tensor) else: tf_tensor = tf.convert_to_tensor(xp_tensor) if as_variable: From 9836e9ee2e4f2fbf38947ec171ca12b883adaaec Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Dani=C3=ABl=20de=20Kok?= Date: Fri, 9 Sep 2022 11:45:49 +0200 Subject: [PATCH 65/78] Set version to 8.1.1 (#758) --- thinc/about.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/thinc/about.py b/thinc/about.py index 9388517fe..080f6f7a6 100644 --- a/thinc/about.py +++ b/thinc/about.py @@ -1,2 +1,2 @@ -__version__ = "8.1.0" +__version__ = "8.1.1" __release__ = True From 97a1a04346da9d5d7453e6f3ff1c28890757bb73 Mon Sep 17 00:00:00 2001 From: Raphael Mitsch Date: Fri, 9 Sep 2022 15:07:02 +0200 Subject: [PATCH 66/78] Remove references to FastAPI being an Explosion product (#761) * Remove references to FastAPI being an Explosion product. * Remove period at end of subheader. 
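To make the new `remap_ids.v2` layer from patch 61 above concrete, here is a minimal usage sketch. The import path follows the `thinc/layers/__init__.py` change in that patch; the mapping table and input values are invented for illustration:

```python
import numpy
from thinc.layers import remap_ids_v2

# Remap the IDs found in column 0 of a 2D array; IDs that are missing from
# the table fall back to `default`.
mapper = remap_ids_v2({10: 1, 20: 2, 30: 3}, default=0, column=0)
features = numpy.asarray([[10, 7], [20, 8], [99, 9]], dtype="uint64")
remapped = mapper.predict(features)
print(remapped.tolist())  # [[1], [2], [0]]: 99 is unseen, so it maps to 0
```

The same layer accepts a plain sequence of hashable keys (e.g. strings) when no `column` is given, which is what the `Union` input type in the docs above encodes.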
--- README.md | 2 +- website/src/pages/index.js | 8 ++------ 2 files changed, 3 insertions(+), 7 deletions(-) diff --git a/README.md b/README.md index b0c7f3937..488f5c9db 100644 --- a/README.md +++ b/README.md @@ -2,7 +2,7 @@ # Thinc: A refreshing functional take on deep learning, compatible with your favorite libraries -### From the makers of [spaCy](https://spacy.io), [Prodigy](https://prodi.gy) and [FastAPI](https://fastapi.tiangolo.com) +### From the makers of [spaCy](https://spacy.io) and [Prodigy](https://prodi.gy) [Thinc](https://thinc.ai) is a **lightweight deep learning library** that offers an elegant, type-checked, functional-programming API for **composing models**, with support diff --git a/website/src/pages/index.js b/website/src/pages/index.js index 037d9e3fc..2318fc8b2 100644 --- a/website/src/pages/index.js +++ b/website/src/pages/index.js @@ -20,14 +20,10 @@ export default () => ( from the makers of
                    <Link to="https://spacy.io" hidden>
                         spaCy
                     </Link>
-                    ,{' '}
-                    <Link to="https://prodi.gy" hidden>
-                        Prodigy
-                    </Link>{' '}
                     &{' '}
-                    <Link to="https://fastapi.tiangolo.com" hidden>
-                        FastAPI
+                    <Link to="https://prodi.gy" hidden>
+                        Prodigy
                     </Link>

From 139acbfa0ba1955142836403f53ecb8709ab40c3 Mon Sep 17 00:00:00 2001
From: Raphael Mitsch
Date: Fri, 9 Sep 2022 16:47:56 +0200
Subject: [PATCH 67/78] Update code example for Ragged (#756)

* Update code example for Ragged.

* Import from thinc.api.
---
 website/docs/usage-sequences.md | 15 +++++++--------
 1 file changed, 7 insertions(+), 8 deletions(-)

diff --git a/website/docs/usage-sequences.md b/website/docs/usage-sequences.md
index 47862ca50..72a4ed65c 100644
--- a/website/docs/usage-sequences.md
+++ b/website/docs/usage-sequences.md
@@ -146,9 +146,7 @@ of your network.
 
 ```python
 ### Example
-from thinc.api import Ragged
-
-from thinc.api import get_current_ops, Ragged, Linear
+from thinc.api import get_current_ops, Ragged, Linear, list2ragged
 
 ops = get_current_ops()
 sequences = [
@@ -156,14 +154,15 @@ sequences = [
     ops.alloc2f(2, 5) + 2,
     ops.alloc2f(4, 5) + 3,
 ]
-ragged = ops.list2ragged(sequences)
+list2ragged_model = list2ragged()
+ragged = list2ragged_model.predict(sequences)
 assert ragged.data.shape == (13, 5)
 # This will always be true:
 assert ragged.data.shape[0] == ragged.lengths.sum()
 # Data from sequence 0 is in the first 7 rows, followed by seqs 1 and 2
-assert ragged.data[:7] == 1
-assert ragged.data[7:2] == 2
-assert ragged.data[9:] == 3
+assert (ragged.data[:7] == 1).all()
+assert (ragged.data[7:9] == 2).all()
+assert (ragged.data[9:] == 3).all()
 # Indexing gets the batch item, and returns a Ragged object
 ragged[0].data.shape == (7, 5)
 # You can pass the data straight into dense layers
@@ -173,7 +172,7 @@ ragged_out = Ragged(output, ragged.lengths)
 # Internally, data is reshaped to 2d. The original shape is accessible at the
 # the dataXd property.
 sequences3d = [ops.alloc3f(5, 6, 7), ops.alloc3f(10, 6, 7)]
-ragged3d = ops.list2ragged(sequences3d)
+ragged3d = list2ragged_model.predict(sequences3d)
 ragged3d.data.shape == (15, 13)
 ragged3d.dataXd.shape == (15, 6, 7)
 ```

From 37958ee8a040b5d9f07e962483d27a1fc128f542 Mon Sep 17 00:00:00 2001
From: Will Frey
Date: Mon, 12 Sep 2022 02:56:54 -0400
Subject: [PATCH 68/78] Update setup.cfg (#748)

Register fix_random_seed as a pytest-randomly entry point.
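As a rough illustration of what this registration does (the protocol description is my reading of pytest-randomly, so treat the wiring as an assumption): the plugin resolves every `pytest_randomly.random_seeder` entry point and calls it with the seed chosen for the test run, which for Thinc effectively amounts to:

```python
# Hypothetical sketch of the call pytest-randomly makes through the entry
# point registered above; only fix_random_seed itself is real Thinc API.
from thinc.api import fix_random_seed

seed = 42  # stand-in for the seed pytest-randomly picks for the run
fix_random_seed(seed)  # seeds random and numpy.random (and torch/cupy if installed)
```

Note that the dotted path registered here is corrected to the `module:attribute` entry-point form (`thinc.api:fix_random_seed`) in patch 74 below.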
--- setup.cfg | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/setup.cfg b/setup.cfg index 2c147b995..0f8bd1b31 100644 --- a/setup.cfg +++ b/setup.cfg @@ -54,6 +54,10 @@ install_requires = dataclasses>=0.6,<1.0; python_version < "3.7" typing_extensions>=3.7.4.1,<4.2.0; python_version < "3.8" contextvars>=2.4,<3; python_version < "3.7" + +[options.entry_points] +pytest_randomly.random_seeder = + thinc = thinc.api.fix_random_seed [options.extras_require] cuda = From 562139e4b0579f8ce32f877ee4531aa8455e9bb7 Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Tue, 13 Sep 2022 09:05:23 +0200 Subject: [PATCH 69/78] Update cupy extras, quickstart (#740) * Update cupy extras, quickstart * Rename extra cuda-wheel to cuda-autodetect --- setup.cfg | 8 ++++++++ website/docs/_quickstart.json | 7 +------ 2 files changed, 9 insertions(+), 6 deletions(-) diff --git a/setup.cfg b/setup.cfg index 0f8bd1b31..28d4c5edb 100644 --- a/setup.cfg +++ b/setup.cfg @@ -88,6 +88,14 @@ cuda114 = cupy-cuda114>=5.0.0b4 cuda115 = cupy-cuda115>=5.0.0b4 +cuda116 = + cupy-cuda116>=5.0.0b4 +cuda117 = + cupy-cuda117>=5.0.0b4 +cuda11x = + cupy-cuda11x>=11.0.0 +cuda-autodetect = + cupy-wheel>=11.0.0 datasets = ml_datasets>=0.2.0,<0.3.0 torch = diff --git a/website/docs/_quickstart.json b/website/docs/_quickstart.json index b3f258070..bed6629a8 100644 --- a/website/docs/_quickstart.json +++ b/website/docs/_quickstart.json @@ -12,12 +12,7 @@ { "label": "9.2", "value": "cuda92" }, { "label": "10.0", "value": "cuda100" }, { "label": "10.1", "value": "cuda101" }, - { "label": "11.0", "value": "cuda110" }, - { "label": "11.1", "value": "cuda111" }, - { "label": "11.2", "value": "cuda112" }, - { "label": "11.3", "value": "cuda113" }, - { "label": "11.4", "value": "cuda114" }, - { "label": "11.5", "value": "cuda115" } + { "label": "10.2, 11.0+", "value": "cuda-autodetect" } ] }, { From 20ce7039e15a197680672b85caef6e423f144874 Mon Sep 17 00:00:00 2001 From: Sofie Van Landeghem Date: Thu, 15 Sep 2022 16:59:56 +0200 Subject: [PATCH 70/78] disable mypy run for Python 3.10 (#768) * disable mypy run for Python 3.10 * dot --- azure-pipelines.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/azure-pipelines.yml b/azure-pipelines.yml index c81c62689..f9096029c 100644 --- a/azure-pipelines.yml +++ b/azure-pipelines.yml @@ -63,6 +63,7 @@ jobs: - script: | python -m mypy thinc displayName: 'Run mypy' + condition: ne(variables['python.version'], '3.10') - task: DeleteFiles@1 inputs: From 4dffe21c4f6cf32ddcc61242a50e6e944d1db876 Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Fri, 16 Sep 2022 15:01:23 +0200 Subject: [PATCH 71/78] Reorder requirements in requirements.txt (#770) Move `confection` to the section with required explosion packages. 
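A usage note for the cupy extras reworked in patch 69 above: after installing one of them (for example `pip install "thinc[cuda-autodetect]"`, which pulls in `cupy-wheel`), GPU execution is still opt-in at runtime. A minimal sketch using public Thinc API that degrades gracefully on CPU-only machines:

```python
from thinc.api import get_current_ops, prefer_gpu

# prefer_gpu() activates CupyOps when cupy and a CUDA device are available;
# otherwise it returns False and NumpyOps stays active.
gpu_activated = prefer_gpu()
print("GPU activated:", gpu_activated)
print("Active ops:", get_current_ops().name)  # "cupy" or "numpy"
```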
--- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index 6a04190da..04613e6bd 100644 --- a/requirements.txt +++ b/requirements.txt @@ -6,6 +6,7 @@ blis>=0.7.8,<0.10.0 srsly>=2.4.0,<3.0.0 wasabi>=0.8.1,<1.1.0 catalogue>=2.0.4,<2.1.0 +confection>=0.0.1,<1.0.0 ml_datasets>=0.2.0,<0.3.0 # Third-party dependencies pydantic>=1.7.4,!=1.8,!=1.8.1,<1.10.0 @@ -34,4 +35,3 @@ nbformat>=5.0.4,<5.2.0 # Test to_disk/from_disk against pathlib.Path subclasses pathy>=0.3.5 black>=22.0,<23.0 -confection>=0.0.1,<1.0.0 \ No newline at end of file From cb6edbe21e9d1a53cb34eac13a48e8af0c5c4e45 Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Mon, 26 Sep 2022 15:04:20 +0200 Subject: [PATCH 72/78] Revert blis range to <0.8.0 (#772) Due to more reports of access violations in windows, reduce supported blis versions back to `<0.8.0`. --- pyproject.toml | 2 +- requirements.txt | 2 +- setup.cfg | 4 ++-- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 6fcca505e..d3fb69b76 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -5,7 +5,7 @@ requires = [ "murmurhash>=1.0.2,<1.1.0", "cymem>=2.0.2,<2.1.0", "preshed>=3.0.2,<3.1.0", - "blis>=0.7.8,<0.10.0", + "blis>=0.7.8,<0.8.0", "numpy>=1.15.0", ] build-backend = "setuptools.build_meta" diff --git a/requirements.txt b/requirements.txt index 04613e6bd..7d7ee1b90 100644 --- a/requirements.txt +++ b/requirements.txt @@ -2,7 +2,7 @@ murmurhash>=1.0.2,<1.1.0 cymem>=2.0.2,<2.1.0 preshed>=3.0.2,<3.1.0 -blis>=0.7.8,<0.10.0 +blis>=0.7.8,<0.8.0 srsly>=2.4.0,<3.0.0 wasabi>=0.8.1,<1.1.0 catalogue>=2.0.4,<2.1.0 diff --git a/setup.cfg b/setup.cfg index 28d4c5edb..17565e0f6 100644 --- a/setup.cfg +++ b/setup.cfg @@ -35,10 +35,10 @@ setup_requires = cymem>=2.0.2,<2.1.0 preshed>=3.0.2,<3.1.0 murmurhash>=1.0.2,<1.1.0 - blis>=0.7.8,<0.10.0 + blis>=0.7.8,<0.8.0 install_requires = # Explosion-provided dependencies - blis>=0.7.8,<0.10.0 + blis>=0.7.8,<0.8.0 murmurhash>=1.0.2,<1.1.0 cymem>=2.0.2,<2.1.0 preshed>=3.0.2,<3.1.0 From d9c40cf7189492fa56eeb59e63076b72dcc46d42 Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Mon, 26 Sep 2022 15:46:22 +0200 Subject: [PATCH 73/78] Set version to v8.1.2 (#773) --- thinc/about.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/thinc/about.py b/thinc/about.py index 080f6f7a6..7b1444a05 100644 --- a/thinc/about.py +++ b/thinc/about.py @@ -1,2 +1,2 @@ -__version__ = "8.1.1" +__version__ = "8.1.2" __release__ = True From 1eaeb2bcabe9feb3153b829950ac05eb13bc818e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Timoth=C3=A9e=20Mazzucotelli?= Date: Tue, 27 Sep 2022 17:21:34 +0200 Subject: [PATCH 74/78] Fix `fix_random_seed` entrypoint in setup.cfg (#775) --- setup.cfg | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.cfg b/setup.cfg index 17565e0f6..e1feed06d 100644 --- a/setup.cfg +++ b/setup.cfg @@ -57,7 +57,7 @@ install_requires = [options.entry_points] pytest_randomly.random_seeder = - thinc = thinc.api.fix_random_seed + thinc = thinc.api:fix_random_seed [options.extras_require] cuda = From 3a143d374161df0334fd29c714b8d93819c7bd0d Mon Sep 17 00:00:00 2001 From: Sofie Van Landeghem Date: Tue, 4 Oct 2022 14:25:32 +0200 Subject: [PATCH 75/78] Support both Python 3.6 and Pydantic 1.10 (#779) * support both Python 3.6 and Pydantic 1.10 * Simplify according to Adriane's suggestion Co-authored-by: Adriane Boyd Co-authored-by: Adriane Boyd --- requirements.txt | 2 +- setup.cfg | 2 +- 2 files changed, 2 insertions(+), 
2 deletions(-) diff --git a/requirements.txt b/requirements.txt index 7d7ee1b90..a94633e75 100644 --- a/requirements.txt +++ b/requirements.txt @@ -9,7 +9,7 @@ catalogue>=2.0.4,<2.1.0 confection>=0.0.1,<1.0.0 ml_datasets>=0.2.0,<0.3.0 # Third-party dependencies -pydantic>=1.7.4,!=1.8,!=1.8.1,<1.10.0 +pydantic>=1.7.4,!=1.8,!=1.8.1,<1.11.0 numpy>=1.15.0 # Backports of modern Python features dataclasses>=0.6,<1.0; python_version < "3.7" diff --git a/setup.cfg b/setup.cfg index e1feed06d..a0347c55e 100644 --- a/setup.cfg +++ b/setup.cfg @@ -49,7 +49,7 @@ install_requires = # Third-party dependencies setuptools numpy>=1.15.0 - pydantic>=1.7.4,!=1.8,!=1.8.1,<1.10.0 + pydantic>=1.7.4,!=1.8,!=1.8.1,<1.11.0 # Backports of modern Python features dataclasses>=0.6,<1.0; python_version < "3.7" typing_extensions>=3.7.4.1,<4.2.0; python_version < "3.8" From 2e12baaa15472d6225b01e9d3f8127e90d3c0580 Mon Sep 17 00:00:00 2001 From: Sofie Van Landeghem Date: Fri, 7 Oct 2022 08:10:33 +0200 Subject: [PATCH 76/78] update to latest mypy and exclude Python 3.6 (#776) * update to latest mypy and exclude Python 3.6 * fix typing of ops.alloc * fix ArrayT usage in types.py --- azure-pipelines.yml | 2 +- requirements.txt | 2 +- thinc/backends/ops.py | 26 ++++++++++++------------ thinc/initializers.py | 2 +- thinc/layers/hashembed.py | 2 +- thinc/tests/layers/test_mxnet_wrapper.py | 2 +- thinc/types.py | 4 ++-- 7 files changed, 20 insertions(+), 20 deletions(-) diff --git a/azure-pipelines.yml b/azure-pipelines.yml index f9096029c..8c80569cf 100644 --- a/azure-pipelines.yml +++ b/azure-pipelines.yml @@ -63,7 +63,7 @@ jobs: - script: | python -m mypy thinc displayName: 'Run mypy' - condition: ne(variables['python.version'], '3.10') + condition: ne(variables['python.version'], '3.6') - task: DeleteFiles@1 inputs: diff --git a/requirements.txt b/requirements.txt index a94633e75..3f9ab983b 100644 --- a/requirements.txt +++ b/requirements.txt @@ -23,7 +23,7 @@ pytest-cov>=2.7.0,<2.8.0 coverage>=5.0.0,<6.0.0 mock>=2.0.0,<3.0.0 flake8>=3.5.0,<3.6.0 -mypy>=0.901,<0.970; platform_machine!='aarch64' +mypy>=0.980,<0.990; platform_machine != "aarch64" and python_version >= "3.7" types-mock>=0.1.1 types-contextvars>=0.1.2; python_version < "3.7" types-dataclasses>=0.1.3; python_version < "3.7" diff --git a/thinc/backends/ops.py b/thinc/backends/ops.py index 29b4eca4f..eb9945e27 100644 --- a/thinc/backends/ops.py +++ b/thinc/backends/ops.py @@ -373,7 +373,7 @@ def pad( # noqa: F811 # array sizes. length = (length + (round_to - 1)) // round_to * round_to final_shape = (len(seqs), length) + seqs[0].shape[1:] - output: Array3d = self.alloc(final_shape, dtype=seqs[0].dtype) + output: Array3d = cast(Array3d, self.alloc(final_shape, dtype=seqs[0].dtype)) for i, arr in enumerate(seqs): # It's difficult to convince this that the dtypes will match. 
output[i, : arr.shape[0]] = arr # type: ignore[assignment, call-overload] @@ -451,7 +451,7 @@ def get_dropout_mask(self, shape: Shape, drop: Optional[float]) -> FloatsXd: if drop is None or drop <= 0: return self.xp.ones(shape, dtype="f") elif drop >= 1.0: - return self.alloc(shape) + return self.alloc_f(shape) coinflips = self.xp.random.uniform(0.0, 1.0, shape) mask = (coinflips >= drop) / (1.0 - drop) return cast(FloatsXd, self.asarray(mask, dtype="float32")) @@ -463,7 +463,7 @@ def alloc1f( dtype: Optional[DTypesFloat] = "float32", zeros: bool = True, ) -> Floats1d: - return self.alloc((d0,), dtype=dtype, zeros=zeros) + return cast(Floats1d, self.alloc((d0,), dtype=dtype, zeros=zeros)) def alloc2f( self, @@ -473,7 +473,7 @@ def alloc2f( dtype: Optional[DTypesFloat] = "float32", zeros: bool = True, ) -> Floats2d: - return self.alloc((d0, d1), dtype=dtype, zeros=zeros) + return cast(Floats2d, self.alloc((d0, d1), dtype=dtype, zeros=zeros)) def alloc3f( self, @@ -484,7 +484,7 @@ def alloc3f( dtype: Optional[DTypesFloat] = "float32", zeros: bool = True, ) -> Floats3d: - return self.alloc((d0, d1, d2), dtype=dtype, zeros=zeros) + return cast(Floats3d, self.alloc((d0, d1, d2), dtype=dtype, zeros=zeros)) def alloc4f( self, @@ -496,7 +496,7 @@ def alloc4f( dtype: Optional[DTypesFloat] = "float32", zeros: bool = True, ) -> Floats4d: - return self.alloc((d0, d1, d2, d3), dtype=dtype, zeros=zeros) + return cast(Floats4d, self.alloc((d0, d1, d2, d3), dtype=dtype, zeros=zeros)) def alloc_f( self, @@ -505,7 +505,7 @@ def alloc_f( dtype: Optional[DTypesFloat] = "float32", zeros: bool = True, ) -> FloatsXd: - return self.alloc(shape, dtype=dtype, zeros=zeros) + return cast(FloatsXd, self.alloc(shape, dtype=dtype, zeros=zeros)) def alloc1i( self, @@ -514,7 +514,7 @@ def alloc1i( dtype: Optional[DTypesInt] = "int32", zeros: bool = True, ) -> Ints1d: - return self.alloc((d0,), dtype=dtype, zeros=zeros) + return cast(Ints1d, self.alloc((d0,), dtype=dtype, zeros=zeros)) def alloc2i( self, @@ -524,7 +524,7 @@ def alloc2i( dtype: Optional[DTypesInt] = "int32", zeros: bool = True, ) -> Ints2d: - return self.alloc((d0, d1), dtype=dtype, zeros=zeros) + return cast(Ints2d, self.alloc((d0, d1), dtype=dtype, zeros=zeros)) def alloc3i( self, @@ -535,7 +535,7 @@ def alloc3i( dtype: Optional[DTypesInt] = "int32", zeros: bool = True, ) -> Ints3d: - return self.alloc((d0, d1, d2), dtype=dtype, zeros=zeros) + return cast(Ints3d, self.alloc((d0, d1, d2), dtype=dtype, zeros=zeros)) def alloc4i( self, @@ -547,7 +547,7 @@ def alloc4i( dtype: Optional[DTypesInt] = "int32", zeros: bool = True, ) -> Ints4d: - return self.alloc((d0, d1, d2, d3), dtype=dtype, zeros=zeros) + return cast(Ints4d, self.alloc((d0, d1, d2, d3), dtype=dtype, zeros=zeros)) def alloc_i( self, @@ -556,7 +556,7 @@ def alloc_i( dtype: Optional[DTypesInt] = "int32", zeros: bool = True, ) -> IntsXd: - return self.alloc(shape, dtype=dtype, zeros=zeros) + return cast(IntsXd, self.alloc(shape, dtype=dtype, zeros=zeros)) def alloc( self, @@ -564,7 +564,7 @@ def alloc( *, dtype: Optional[DTypes] = "float32", zeros: bool = True, - ) -> ArrayT: + ) -> ArrayXd: """Allocate an array of a certain shape.""" if isinstance(shape, int): shape = (shape,) diff --git a/thinc/initializers.py b/thinc/initializers.py index 4842f4f08..1333911a3 100644 --- a/thinc/initializers.py +++ b/thinc/initializers.py @@ -75,7 +75,7 @@ def configure_glorot_uniform_init() -> Callable[[Shape], FloatsXd]: def zero_init(ops: Ops, shape: Shape) -> FloatsXd: - return ops.alloc(shape) + return 
ops.alloc_f(shape) @registry.initializers("zero_init.v1") diff --git a/thinc/layers/hashembed.py b/thinc/layers/hashembed.py index 677049cc2..e2bdc2e68 100644 --- a/thinc/layers/hashembed.py +++ b/thinc/layers/hashembed.py @@ -62,7 +62,7 @@ def forward( nV = vectors.shape[0] nO = vectors.shape[1] if len(ids) == 0: - output: Floats2d = model.ops.alloc((0, nO), dtype=vectors.dtype) + output: Floats2d = model.ops.alloc2f(0, nO, dtype=vectors.dtype) else: ids = model.ops.as_contig(ids, dtype="uint64") nN = ids.shape[0] diff --git a/thinc/tests/layers/test_mxnet_wrapper.py b/thinc/tests/layers/test_mxnet_wrapper.py index 69b0d5253..b954a8ec5 100644 --- a/thinc/tests/layers/test_mxnet_wrapper.py +++ b/thinc/tests/layers/test_mxnet_wrapper.py @@ -34,7 +34,7 @@ def answer() -> int: @pytest.fixture def X(input_size: int) -> Array2d: ops: Ops = get_current_ops() - return ops.alloc(shape=(1, input_size)) + return cast(Array2d, ops.alloc(shape=(1, input_size))) @pytest.fixture diff --git a/thinc/types.py b/thinc/types.py index 69a51bcc7..629a79e82 100644 --- a/thinc/types.py +++ b/thinc/types.py @@ -162,7 +162,7 @@ def __bytes__(self) -> bytes: ... def __str__(self) -> str: ... def __repr__(self) -> str: ... def __copy__(self, order: str = ...): ... - def __deepcopy__(self, memo: dict) -> ArrayT: ... + def __deepcopy__(self: SelfT, memo: dict) -> SelfT: ... def __lt__(self, other): ... def __le__(self, other): ... def __eq__(self, other): ... @@ -224,7 +224,7 @@ def clip(self, a_min: Any, a_max: Any, out: Optional[ArrayT]) -> ArrayT: ... def max(self, axis: int = -1, out: Optional[ArrayT] = None) -> ArrayT: ... # def mean(self, axis: int = -1, dtype: Optional[DTypes] = None, out: Optional[SelfT] = None, keepdims: bool = False) -> "Array": ... def min(self, axis: int = -1, out: Optional[ArrayT] = None) -> ArrayT: ... - def nonzero(self) -> ArrayT: ... + def nonzero(self: SelfT) -> SelfT: ... def prod(self, axis: int = -1, dtype: Optional[DTypes] = None, out: Optional[ArrayT] = None, keepdims: bool = False) -> ArrayT: ... def round(self, decimals: int = 0, out: Optional[ArrayT] = None) -> ArrayT: ... # def sum(self, axis: int = -1, dtype: Optional[DTypes] = None, out: Optional[ArrayT] = None, keepdims: bool = False) -> ArrayT: ... 
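To illustrate the `ops.alloc` typing change above (public API only; the variable names are illustrative): the sized helpers such as `alloc2f` now narrow the return type themselves, while the generic `alloc` is annotated as returning `ArrayXd`, so call sites that want a more specific array type add the `cast`, mirroring what the patch does inside `Ops`:

```python
from typing import cast

from thinc.api import get_current_ops
from thinc.types import Floats2d

ops = get_current_ops()
a = ops.alloc2f(2, 3)  # statically typed as Floats2d, no cast needed
# The generic alloc returns ArrayXd, so narrowing is explicit at the call site:
b = cast(Floats2d, ops.alloc((2, 3), dtype="float32"))
assert a.shape == b.shape == (2, 3)
```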
From 36b691ff50b2ba980d90e21c0a839e50c32d380c Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Fri, 7 Oct 2022 08:52:18 +0200 Subject: [PATCH 77/78] Set version to v8.1.3 (#781) --- thinc/about.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/thinc/about.py b/thinc/about.py index 7b1444a05..35cdb6adb 100644 --- a/thinc/about.py +++ b/thinc/about.py @@ -1,2 +1,2 @@ -__version__ = "8.1.2" +__version__ = "8.1.3" __release__ = True From 07b7a09b7027c23078316c401200421f05d79d87 Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Mon, 10 Oct 2022 10:35:11 +0200 Subject: [PATCH 78/78] Update CI around conflicting extras requirements (#783) * Update torch install, update package requirements after installing extra deps * Only reinstall requirements * Run test suite twice * Check package requirements after extras * Update thinc-apple-ops test for current macos jobs * Move notebook extras * Skip mypy in tests with extras * Use torch<1.12.0 * Try to figure out numpy version (non)requirements * More numpy version tests * Adjust for all --- azure-pipelines.yml | 26 +++++++++++++++++++------- 1 file changed, 19 insertions(+), 7 deletions(-) diff --git a/azure-pipelines.yml b/azure-pipelines.yml index 8c80569cf..3789606a4 100644 --- a/azure-pipelines.yml +++ b/azure-pipelines.yml @@ -83,25 +83,37 @@ jobs: - script: | pip install -r requirements.txt - pip install "protobuf~=3.20.0" "tensorflow~=2.5.0" - pip install "mxnet; sys_platform != 'win32'" - pip install "torch==1.9.0+cpu" -f https://download.pytorch.org/whl/torch_stable.html pip install ipykernel pydot graphviz python -m ipykernel install --name thinc-notebook-tests --user - displayName: 'Install test dependencies' + python -m pytest --pyargs thinc --cov=thinc --cov-report=term + displayName: 'Run tests without extras' + + - script: | + pip install "protobuf~=3.20.0" "tensorflow~=2.5.0" + pip install "mxnet; sys_platform != 'win32'" + pip install torch --extra-index-url https://download.pytorch.org/whl/cpu + # torch does not have a direct numpy requirement but is compiled against + # a newer version than the oldest supported numpy for windows and + # python 3.10; this version of numpy would not work with + # tensorflow~=2.5.0 as specified above, but there is no release for + # python 3.10 anyway + pip install "numpy~=1.23.0; python_version=='3.10' and sys_platform=='win32'" + pip install -r requirements.txt + pip uninstall -y mypy + displayName: 'Install extras for testing' - script: | python -m pytest --pyargs thinc --cov=thinc --cov-report=term - displayName: 'Run tests' + displayName: 'Run tests with extras' - script: | pip uninstall -y tensorflow pip install thinc-apple-ops python -m pytest --pyargs thinc_apple_ops displayName: 'Run tests for thinc-apple-ops' - condition: and(startsWith(variables['imageName'], 'macos'), eq(variables['python.version'], '3.9')) + condition: and(startsWith(variables['imageName'], 'macos'), eq(variables['python.version'], '3.10')) - script: | python -m pytest --pyargs thinc displayName: 'Run tests with thinc-apple-ops' - condition: and(startsWith(variables['imageName'], 'macos'), eq(variables['python.version'], '3.9')) + condition: and(startsWith(variables['imageName'], 'macos'), eq(variables['python.version'], '3.10'))