Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add TextCatReduce.v1 #13181

Merged
merged 14 commits into from
Dec 21, 2023
Merged
3 changes: 3 additions & 0 deletions spacy/errors.py
Original file line number Diff line number Diff line change
Expand Up @@ -984,6 +984,9 @@ class Errors(metaclass=ErrorsWithCodes):
E1055 = ("The 'replace_listener' callback expects {num_params} parameters, "
"but only callbacks with one or three parameters are supported")
E1056 = ("The `TextCatBOW` architecture expects a length of at least 1, was {length}.")
E1057 = ("The `TextCatReduce` architecture must be used with at least one "
"reduction. Please enable one of `use_reduce_first`, "
"`use_reduce_last`, `use_reduce_max` or `use_reduce_mean`.")


# Deprecated model shortcuts, only used in errors and warnings
Expand Down
122 changes: 89 additions & 33 deletions spacy/ml/models/textcat.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,9 @@
clone,
concatenate,
list2ragged,
reduce_first,
reduce_last,
reduce_max,
reduce_mean,
reduce_sum,
residual,
Expand Down Expand Up @@ -49,39 +52,15 @@ def build_simple_cnn_text_classifier(
outputs sum to 1. If exclusive_classes=False, a logistic non-linearity
is applied instead, so that outputs are in the range [0, 1].
"""
fill_defaults = {"b": 0, "W": 0}
with Model.define_operators({">>": chain}):
cnn = tok2vec >> list2ragged() >> reduce_mean()
nI = tok2vec.maybe_get_dim("nO")
if exclusive_classes:
output_layer = Softmax(nO=nO, nI=nI)
fill_defaults["b"] = NEG_VALUE
resizable_layer: Model = resizable(
output_layer,
resize_layer=partial(
resize_linear_weighted, fill_defaults=fill_defaults
),
)
model = cnn >> resizable_layer
else:
output_layer = Linear(nO=nO, nI=nI)
resizable_layer = resizable(
output_layer,
resize_layer=partial(
resize_linear_weighted, fill_defaults=fill_defaults
),
)
model = cnn >> resizable_layer >> Logistic()
model.set_ref("output_layer", output_layer)
model.attrs["resize_output"] = partial(
resize_and_set_ref,
resizable_layer=resizable_layer,
)
model.set_ref("tok2vec", tok2vec)
if nO is not None:
model.set_dim("nO", cast(int, nO))
model.attrs["multi_label"] = not exclusive_classes
return model
return build_reduce_text_classifier(
tok2vec=tok2vec,
exclusive_classes=exclusive_classes,
use_reduce_first=False,
use_reduce_last=False,
use_reduce_max=False,
use_reduce_mean=True,
nO=nO,
)


def resize_and_set_ref(model, new_nO, resizable_layer):
Expand Down Expand Up @@ -230,3 +209,80 @@ def build_text_classifier_lowdata(
model = model >> Dropout(dropout)
model = model >> Logistic()
return model


@registry.architectures("spacy.TextCatReduce.v1")
def build_reduce_text_classifier(
    tok2vec: Model,
    exclusive_classes: bool,
    use_reduce_first: bool,
    use_reduce_last: bool,
    use_reduce_max: bool,
    use_reduce_mean: bool,
    nO: Optional[int] = None,
) -> Model[List[Doc], Floats2d]:
    """Build a text classifier on top of pooled `Doc` representations.

    The token representations produced by `tok2vec` are pooled per `Doc` with
    every enabled reduction. When more than one reduction is enabled, their
    outputs are concatenated before being passed to the output layer.

    tok2vec (Model): the tok2vec layer to pool over.
    exclusive_classes (bool): Whether or not classes are mutually exclusive.
        If True a `Softmax` output layer is used, otherwise a `Linear` layer
        followed by `Logistic`.
    use_reduce_first (bool): Pool by using the hidden representation of the
        first token of a `Doc`.
    use_reduce_last (bool): Pool by using the hidden representation of the
        last token of a `Doc`.
    use_reduce_max (bool): Pool by taking the maximum values of the hidden
        representations of a `Doc`.
    use_reduce_mean (bool): Pool by taking the mean of all hidden
        representations of a `Doc`.
    nO (Optional[int]): Number of classes.
    RETURNS (Model[List[Doc], Floats2d]): The classification model.
    RAISES (ValueError): With `Errors.E1057` if no reduction is enabled.
    """
    # Flag/factory pairs, listed in the order in which the reductions are
    # handed to `concatenate`: first, last, max, mean.
    reduction_switches = (
        (use_reduce_first, reduce_first),
        (use_reduce_last, reduce_last),
        (use_reduce_max, reduce_max),
        (use_reduce_mean, reduce_mean),
    )
    reductions = [
        make_reduction()
        for enabled, make_reduction in reduction_switches
        if enabled
    ]
    if not reductions:
        raise ValueError(Errors.E1057)

    with Model.define_operators({">>": chain}):
        pooled = tok2vec >> list2ragged() >> concatenate(*reductions)

        # Input width of the output layer is the tok2vec width multiplied by
        # the number of reductions; left unset when tok2vec has no known nO.
        tok2vec_width = tok2vec.maybe_get_dim("nO")
        if tok2vec_width is None:
            nI = None
        else:
            nI = tok2vec_width * len(reductions)

        # The two variants differ only in the output layer type, the fill
        # value used for "b" on resize, and the trailing Logistic layer.
        if exclusive_classes:
            output_layer = Softmax(nO=nO, nI=nI)
            bias_fill = NEG_VALUE
        else:
            output_layer = Linear(nO=nO, nI=nI)
            bias_fill = 0
        resizable_layer: Model = resizable(
            output_layer,
            resize_layer=partial(
                resize_linear_weighted,
                fill_defaults={"b": bias_fill, "W": 0},
            ),
        )

        model = pooled >> resizable_layer
        if not exclusive_classes:
            model = model >> Logistic()

        model.set_ref("output_layer", output_layer)
        model.attrs["resize_output"] = partial(
            resize_and_set_ref,
            resizable_layer=resizable_layer,
        )

    model.set_ref("tok2vec", tok2vec)
    if nO is not None:
        model.set_dim("nO", cast(int, nO))
    model.attrs["multi_label"] = not exclusive_classes
    return model
6 changes: 5 additions & 1 deletion spacy/pipeline/textcat.py
Original file line number Diff line number Diff line change
Expand Up @@ -55,8 +55,12 @@

single_label_cnn_config = """
[model]
@architectures = "spacy.TextCatCNN.v2"
@architectures = "spacy.TextCatReduce.v1"
exclusive_classes = true
use_reduce_first = false
use_reduce_last = false
use_reduce_max = false
use_reduce_mean = true

[model.tok2vec]
@architectures = "spacy.HashEmbedCNN.v2"
Expand Down
6 changes: 5 additions & 1 deletion spacy/pipeline/textcat_multilabel.py
Original file line number Diff line number Diff line change
Expand Up @@ -53,8 +53,12 @@

multi_label_cnn_config = """
[model]
@architectures = "spacy.TextCatCNN.v2"
@architectures = "spacy.TextCatReduce.v1"
exclusive_classes = false
use_reduce_first = false
use_reduce_last = false
use_reduce_max = false
use_reduce_mean = true

[model.tok2vec]
@architectures = "spacy.HashEmbedCNN.v2"
Expand Down
15 changes: 9 additions & 6 deletions spacy/tests/pipeline/test_textcat.py
Original file line number Diff line number Diff line change
Expand Up @@ -457,8 +457,8 @@ def test_no_resize(name, textcat_config):
("textcat_multilabel", {"@architectures": "spacy.TextCatBOW.v3", "exclusive_classes": False, "no_output_layer": False, "ngram_size": 3}),
("textcat_multilabel", {"@architectures": "spacy.TextCatBOW.v3", "exclusive_classes": False, "no_output_layer": True, "ngram_size": 3}),
# CNN
("textcat", {"@architectures": "spacy.TextCatCNN.v2", "tok2vec": DEFAULT_TOK2VEC_MODEL, "exclusive_classes": True}),
("textcat_multilabel", {"@architectures": "spacy.TextCatCNN.v2", "tok2vec": DEFAULT_TOK2VEC_MODEL, "exclusive_classes": False}),
("textcat", {"@architectures": "spacy.TextCatReduce.v1", "tok2vec": DEFAULT_TOK2VEC_MODEL, "exclusive_classes": True, "use_reduce_first": True, "use_reduce_last": True, "use_reduce_max": True, "use_reduce_mean": True}),
("textcat_multilabel", {"@architectures": "spacy.TextCatReduce.v1", "tok2vec": DEFAULT_TOK2VEC_MODEL, "exclusive_classes": False, "use_reduce_first": True, "use_reduce_last": True, "use_reduce_max": True, "use_reduce_mean": True}),
],
)
# fmt: on
Expand All @@ -485,9 +485,9 @@ def test_resize(name, textcat_config):
("textcat", {"@architectures": "spacy.TextCatBOW.v3", "exclusive_classes": True, "no_output_layer": True, "ngram_size": 3}),
("textcat_multilabel", {"@architectures": "spacy.TextCatBOW.v3", "exclusive_classes": False, "no_output_layer": False, "ngram_size": 3}),
("textcat_multilabel", {"@architectures": "spacy.TextCatBOW.v3", "exclusive_classes": False, "no_output_layer": True, "ngram_size": 3}),
# CNN
("textcat", {"@architectures": "spacy.TextCatCNN.v2", "tok2vec": DEFAULT_TOK2VEC_MODEL, "exclusive_classes": True}),
("textcat_multilabel", {"@architectures": "spacy.TextCatCNN.v2", "tok2vec": DEFAULT_TOK2VEC_MODEL, "exclusive_classes": False}),
# REDUCE
("textcat", {"@architectures": "spacy.TextCatReduce.v1", "tok2vec": DEFAULT_TOK2VEC_MODEL, "exclusive_classes": True, "use_reduce_first": True, "use_reduce_last": True, "use_reduce_max": True, "use_reduce_mean": True}),
("textcat_multilabel", {"@architectures": "spacy.TextCatReduce.v1", "tok2vec": DEFAULT_TOK2VEC_MODEL, "exclusive_classes": False, "use_reduce_first": True, "use_reduce_last": True, "use_reduce_max": True, "use_reduce_mean": True}),
],
)
# fmt: on
Expand Down Expand Up @@ -701,9 +701,12 @@ def test_overfitting_IO_multi():
# ENSEMBLE V2
("textcat_multilabel", TRAIN_DATA_MULTI_LABEL, {"@architectures": "spacy.TextCatEnsemble.v2", "tok2vec": DEFAULT_TOK2VEC_MODEL, "linear_model": {"@architectures": "spacy.TextCatBOW.v3", "exclusive_classes": False, "ngram_size": 1, "no_output_layer": False}}),
("textcat", TRAIN_DATA_SINGLE_LABEL, {"@architectures": "spacy.TextCatEnsemble.v2", "tok2vec": DEFAULT_TOK2VEC_MODEL, "linear_model": {"@architectures": "spacy.TextCatBOW.v3", "exclusive_classes": True, "ngram_size": 5, "no_output_layer": False}}),
# CNN V2
# CNN V2 (legacy)
("textcat", TRAIN_DATA_SINGLE_LABEL, {"@architectures": "spacy.TextCatCNN.v2", "tok2vec": DEFAULT_TOK2VEC_MODEL, "exclusive_classes": True}),
("textcat_multilabel", TRAIN_DATA_MULTI_LABEL, {"@architectures": "spacy.TextCatCNN.v2", "tok2vec": DEFAULT_TOK2VEC_MODEL, "exclusive_classes": False}),
# REDUCE V1
("textcat", TRAIN_DATA_SINGLE_LABEL, {"@architectures": "spacy.TextCatReduce.v1", "tok2vec": DEFAULT_TOK2VEC_MODEL, "exclusive_classes": True, "use_reduce_first": True, "use_reduce_last": True, "use_reduce_max": True, "use_reduce_mean": True}),
("textcat_multilabel", TRAIN_DATA_MULTI_LABEL, {"@architectures": "spacy.TextCatReduce.v1", "tok2vec": DEFAULT_TOK2VEC_MODEL, "exclusive_classes": False, "use_reduce_first": True, "use_reduce_last": True, "use_reduce_max": True, "use_reduce_mean": True}),
],
)
# fmt: on
Expand Down
15 changes: 15 additions & 0 deletions spacy/tests/test_models.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@
build_Tok2Vec_model,
)
from spacy.ml.staticvectors import StaticVectors
from spacy.util import registry


def get_textcat_bow_kwargs():
Expand Down Expand Up @@ -284,3 +285,17 @@ def test_spancat_model_forward_backward(nO=5):
Y, backprop = model((docs, spans), is_train=True)
assert Y.shape == (spans.dataXd.shape[0], nO)
backprop(Y)


def test_textcat_reduce_invalid_args():
    """`spacy.TextCatReduce.v1` must raise when every reduction is disabled."""
    build_textcat_reduce = registry.architectures.get("spacy.TextCatReduce.v1")
    tok2vec = make_test_tok2vec()
    # All four pooling switches off: the architecture has nothing to pool with.
    all_reductions_disabled = {
        "use_reduce_first": False,
        "use_reduce_last": False,
        "use_reduce_max": False,
        "use_reduce_mean": False,
    }
    with pytest.raises(ValueError, match=r"must be used with at least one reduction"):
        build_textcat_reduce(
            tok2vec=tok2vec,
            exclusive_classes=False,
            **all_reductions_disabled,
        )
88 changes: 48 additions & 40 deletions website/docs/api/architectures.mdx
Original file line number Diff line number Diff line change
Expand Up @@ -1018,46 +1018,6 @@ but used an internal `tok2vec` instead of taking it as argument:

</Accordion>

### spacy.TextCatCNN.v2 {id="TextCatCNN"}

> #### Example Config
>
> ```ini
> [model]
> @architectures = "spacy.TextCatCNN.v2"
> exclusive_classes = false
> nO = null
>
> [model.tok2vec]
> @architectures = "spacy.HashEmbedCNN.v2"
> pretrained_vectors = null
> width = 96
> depth = 4
> embed_size = 2000
> window_size = 1
> maxout_pieces = 3
> subword_features = true
> ```

A neural network model where token vectors are calculated using a CNN. The
vectors are mean pooled and used as features in a feed-forward network. This
architecture is usually less accurate than the ensemble, but runs faster.

| Name | Description |
| ------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `exclusive_classes` | Whether or not categories are mutually exclusive. ~~bool~~ |
| `tok2vec` | The [`tok2vec`](#tok2vec) layer of the model. ~~Model~~ |
| `nO` | Output dimension, determined by the number of different labels. If not set, the [`TextCategorizer`](/api/textcategorizer) component will set it when `initialize` is called. ~~Optional[int]~~ |
| **CREATES** | The model using the architecture. ~~Model[List[Doc], Floats2d]~~ |

<Accordion title="spacy.TextCatCNN.v1 definition" spaced>

[TextCatCNN.v1](/api/legacy#TextCatCNN_v1) had the exact same signature, but was
not yet resizable. Since v2, new labels can be added to this component, even
after training.

</Accordion>

### spacy.TextCatBOW.v3 {id="TextCatBOW"}

> #### Example Config
Expand Down Expand Up @@ -1096,6 +1056,54 @@ the others, but may not be as accurate, especially if texts are short.

</Accordion>

### spacy.TextCatReduce.v1 {id="TextCatReduce"}

> #### Example Config
>
> ```ini
> [model]
> @architectures = "spacy.TextCatReduce.v1"
> exclusive_classes = false
> use_reduce_first = false
> use_reduce_last = false
> use_reduce_max = false
> use_reduce_mean = true
> nO = null
>
> [model.tok2vec]
> @architectures = "spacy.HashEmbedCNN.v2"
> pretrained_vectors = null
> width = 96
> depth = 4
> embed_size = 2000
> window_size = 1
> maxout_pieces = 3
> subword_features = true
> ```

A classifier that pools token hidden representations of each `Doc` using first,
last, max or mean reduction and then applies a classification layer. Reductions
are concatenated when multiple reductions are used.

<Infobox variant="warning" title="Relation to TextCatCNN" id="TextCatCNN">

`TextCatReduce` is a generalization of the older
[`TextCatCNN`](/api/legacy#TextCatCNN_v2) model. `TextCatCNN` always uses a mean
reduction, whereas `TextCatReduce` also supports first/last/max reductions.

</Infobox>

| Name | Description |
| ------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `exclusive_classes` | Whether or not categories are mutually exclusive. ~~bool~~ |
| `tok2vec` | The [`tok2vec`](#tok2vec) layer of the model. ~~Model~~ |
| `use_reduce_first` | Pool by using the hidden representation of the first token of a `Doc`. ~~bool~~ |
| `use_reduce_last` | Pool by using the hidden representation of the last token of a `Doc`. ~~bool~~ |
| `use_reduce_max` | Pool by taking the maximum values of the hidden representations of a `Doc`. ~~bool~~ |
| `use_reduce_mean` | Pool by taking the mean of all hidden representations of a `Doc`. ~~bool~~ |
| `nO` | Output dimension, determined by the number of different labels. If not set, the [`TextCategorizer`](/api/textcategorizer) component will set it when `initialize` is called. ~~Optional[int]~~ |
| **CREATES** | The model using the architecture. ~~Model[List[Doc], Floats2d]~~ |

## Span classification architectures {id="spancat",source="spacy/ml/models/spancat.py"}

### spacy.SpanCategorizer.v1 {id="SpanCategorizer"}
Expand Down
Loading