Add TextCatReduce.v1 (explosion#13181)

* Add TextCatReduce.v1 This is a textcat classifier that pools the vectors generated by a tok2vec implementation and then applies a classifier to the pooled representation. Three reductions are supported for pooling: first, max, and mean. When multiple reductions are enabled, the reductions are concatenated before providing them to the classification layer. This model is a generalization of the TextCatCNN model, which only supports mean reductions and is a bit of a misnomer, because it can also be used with transformers. This change also reimplements TextCatCNN.v2 using the new TextCatReduce.v1 layer. * Doc fixes Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com> * Fully specify `TextCatCNN` <-> `TextCatReduce` equivalence * Move TextCatCNN docs to legacy, in prep for moving to spacy-legacy * Add back a test for TextCatCNN.v2 * Replace TextCatCNN in pipe configurations and templates * Add an infobox to the `TextCatReduce` section with an `TextCatCNN` anchor * Add last reduction (`use_reduce_last`) * Remove non-working TextCatCNN Netlify redirect * Revert layer changes for the quickstart * Revert one more quickstart change * Remove unused import * Fix docstring * Fix setting name in error message --------- Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com> Co-authored-by: Adriane Boyd <adrianeboyd@gmail.com>
jordankanter · May 21, 2024 · a96dfbf · a96dfbf
1 parent 3c76d52
commit a96dfbf
Show file tree

Hide file tree

Showing 3 changed files with 7 additions and 85 deletions.
diff --git a/spacy/errors.py b/spacy/errors.py
@@ -976,6 +976,9 @@ class Errors(metaclass=ErrorsWithCodes):
     E1055 = ("The 'replace_listener' callback expects {num_params} parameters, "
              "but only callbacks with one or three parameters are supported")
     E1056 = ("The `TextCatBOW` architecture expects a length of at least 1, was {length}.")
+    E1057 = ("The `TextCatReduce` architecture must be used with at least one "
+             "reduction. Please enable one of `use_reduce_first`, "
+             "`use_reduce_last`, `use_reduce_max` or `use_reduce_mean`.")
 
     # v4 error strings
     E4000 = ("Expected a Doc as input, but got: '{type}'")

diff --git a/spacy/tests/pipeline/test_textcat.py b/spacy/tests/pipeline/test_textcat.py
@@ -502,9 +502,9 @@ def test_resize(name, textcat_config):
         ("textcat", {"@architectures": "spacy.TextCatBOW.v3", "exclusive_classes": True, "no_output_layer": True, "ngram_size": 3}),
         ("textcat_multilabel", {"@architectures": "spacy.TextCatBOW.v3", "exclusive_classes": False, "no_output_layer": False, "ngram_size": 3}),
         ("textcat_multilabel", {"@architectures": "spacy.TextCatBOW.v3", "exclusive_classes": False, "no_output_layer": True, "ngram_size": 3}),
-        # CNN
-        ("textcat", {"@architectures": "spacy.TextCatCNN.v2", "tok2vec": DEFAULT_TOK2VEC_MODEL, "exclusive_classes": True}),
-        ("textcat_multilabel", {"@architectures": "spacy.TextCatCNN.v2", "tok2vec": DEFAULT_TOK2VEC_MODEL, "exclusive_classes": False}),
+        # REDUCE
+        ("textcat", {"@architectures": "spacy.TextCatReduce.v1", "tok2vec": DEFAULT_TOK2VEC_MODEL, "exclusive_classes": True, "use_reduce_first": True, "use_reduce_last": True, "use_reduce_max": True, "use_reduce_mean": True}),
+        ("textcat_multilabel", {"@architectures": "spacy.TextCatReduce.v1", "tok2vec": DEFAULT_TOK2VEC_MODEL, "exclusive_classes": False, "use_reduce_first": True, "use_reduce_last": True, "use_reduce_max": True, "use_reduce_mean": True}),
     ],
 )
 # fmt: on
@@ -752,12 +752,9 @@ def test_overfitting_IO_multi():
         # ENSEMBLE V2
         ("textcat_multilabel", TRAIN_DATA_MULTI_LABEL, {"@architectures": "spacy.TextCatEnsemble.v2", "tok2vec": DEFAULT_TOK2VEC_MODEL, "linear_model": {"@architectures": "spacy.TextCatBOW.v3", "exclusive_classes": False, "ngram_size": 1, "no_output_layer": False}}),
         ("textcat", TRAIN_DATA_SINGLE_LABEL, {"@architectures": "spacy.TextCatEnsemble.v2", "tok2vec": DEFAULT_TOK2VEC_MODEL, "linear_model": {"@architectures": "spacy.TextCatBOW.v3", "exclusive_classes": True, "ngram_size": 5, "no_output_layer": False}}),
-        # CNN V2
+        # CNN V2 (legacy)
         ("textcat", TRAIN_DATA_SINGLE_LABEL, {"@architectures": "spacy.TextCatCNN.v2", "tok2vec": DEFAULT_TOK2VEC_MODEL, "exclusive_classes": True}),
         ("textcat_multilabel", TRAIN_DATA_MULTI_LABEL, {"@architectures": "spacy.TextCatCNN.v2", "tok2vec": DEFAULT_TOK2VEC_MODEL, "exclusive_classes": False}),
-        # PARAMETRIC ATTENTION V1
-        ("textcat", TRAIN_DATA_SINGLE_LABEL, {"@architectures": "spacy.TextCatParametricAttention.v1", "tok2vec": DEFAULT_TOK2VEC_MODEL, "exclusive_classes": True}),
-        ("textcat_multilabel", TRAIN_DATA_MULTI_LABEL, {"@architectures": "spacy.TextCatParametricAttention.v1", "tok2vec": DEFAULT_TOK2VEC_MODEL, "exclusive_classes": False}),
         # REDUCE V1
         ("textcat", TRAIN_DATA_SINGLE_LABEL, {"@architectures": "spacy.TextCatReduce.v1", "tok2vec": DEFAULT_TOK2VEC_MODEL, "exclusive_classes": True, "use_reduce_first": True, "use_reduce_last": True, "use_reduce_max": True, "use_reduce_mean": True}),
         ("textcat_multilabel", TRAIN_DATA_MULTI_LABEL, {"@architectures": "spacy.TextCatReduce.v1", "tok2vec": DEFAULT_TOK2VEC_MODEL, "exclusive_classes": False, "use_reduce_first": True, "use_reduce_last": True, "use_reduce_max": True, "use_reduce_mean": True}),

diff --git a/website/docs/api/architectures.mdx b/website/docs/api/architectures.mdx
@@ -1020,46 +1020,6 @@ but used an internal `tok2vec` instead of taking it as argument:
 
 ### spacy.TextCatBOW.v3 {id="TextCatBOW"}
 
-> #### Example Config
->
-> ```ini
-> [model]
-> @architectures = "spacy.TextCatCNN.v2"
-> exclusive_classes = false
-> nO = null
->
-> [model.tok2vec]
-> @architectures = "spacy.HashEmbedCNN.v2"
-> pretrained_vectors = null
-> width = 96
-> depth = 4
-> embed_size = 2000
-> window_size = 1
-> maxout_pieces = 3
-> subword_features = true
-> ```
-
-A neural network model where token vectors are calculated using a CNN. The
-vectors are mean pooled and used as features in a feed-forward network. This
-architecture is usually less accurate than the ensemble, but runs faster.
-
-| Name                | Description                                                                                                                                                                                    |
-| ------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
-| `exclusive_classes` | Whether or not categories are mutually exclusive. ~~bool~~                                                                                                                                     |
-| `tok2vec`           | The [`tok2vec`](#tok2vec) layer of the model. ~~Model~~                                                                                                                                        |
-| `nO`                | Output dimension, determined by the number of different labels. If not set, the [`TextCategorizer`](/api/textcategorizer) component will set it when `initialize` is called. ~~Optional[int]~~ |
-| **CREATES**         | The model using the architecture. ~~Model[List[Doc], Floats2d]~~                                                                                                                               |
-
-<Accordion title="spacy.TextCatCNN.v1 definition" spaced>
-
-[TextCatCNN.v1](/api/legacy#TextCatCNN_v1) had the exact same signature, but was
-not yet resizable. Since v2, new labels can be added to this component, even
-after training.
-
-</Accordion>
-
-### spacy.TextCatBOW.v3 {id="TextCatBOW"}
-
 > #### Example Config
 >
 > ```ini
@@ -1096,44 +1056,6 @@ the others, but may not be as accurate, especially if texts are short.
 
 </Accordion>
 
-### spacy.TextCatParametricAttention.v1 {id="TextCatParametricAttention"}
-
-> #### Example Config
->
-> ```ini
-> [model]
-> @architectures = "spacy.TextCatParametricAttention.v1"
-> exclusive_classes = true
-> nO = null
->
-> [model.tok2vec]
-> @architectures = "spacy.Tok2Vec.v2"
->
-> [model.tok2vec.embed]
-> @architectures = "spacy.MultiHashEmbed.v2"
-> width = 64
-> rows = [2000, 2000, 1000, 1000, 1000, 1000]
-> attrs = ["ORTH", "LOWER", "PREFIX", "SUFFIX", "SHAPE", "ID"]
-> include_static_vectors = false
->
-> [model.tok2vec.encode]
-> @architectures = "spacy.MaxoutWindowEncoder.v2"
-> width = ${model.tok2vec.embed.width}
-> window_size = 1
-> maxout_pieces = 3
-> depth = 2
-> ```
-
-A neural network model that is built upon Tok2Vec and uses parametric attention
-to attend to tokens that are relevant to text classification.
-
-| Name                | Description                                                                                                                                                                                    |
-| ------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
-| `tok2vec`           | The `tok2vec` layer to build the neural network upon. ~~Model[List[Doc], List[Floats2d]]~~                                                                                                     |
-| `exclusive_classes` | Whether or not categories are mutually exclusive. ~~bool~~                                                                                                                                     |
-| `nO`                | Output dimension, determined by the number of different labels. If not set, the [`TextCategorizer`](/api/textcategorizer) component will set it when `initialize` is called. ~~Optional[int]~~ |
-| **CREATES**         | The model using the architecture. ~~Model[List[Doc], Floats2d]~~                                                                                                                               |
-
 ### spacy.TextCatReduce.v1 {id="TextCatReduce"}
 
 > #### Example Config