From 98ed529a678bd300f67af85e5c618753b8150a41 Mon Sep 17 00:00:00 2001 From: Abhijith Neil Abraham Date: Wed, 24 Nov 2021 21:06:10 +0530 Subject: [PATCH 01/23] Sentence Embedder API using sentence transformers --- flash/core/utilities/providers.py | 1 + flash/text/embeddings/__init__.py | 1 + flash/text/embeddings/backbones.py | 14 +++ flash/text/embeddings/data.py | 1 + flash/text/embeddings/model.py | 137 +++++++++++++++++++++++++++++ 5 files changed, 154 insertions(+) create mode 100644 flash/text/embeddings/__init__.py create mode 100644 flash/text/embeddings/backbones.py create mode 100644 flash/text/embeddings/data.py create mode 100644 flash/text/embeddings/model.py diff --git a/flash/core/utilities/providers.py b/flash/core/utilities/providers.py index 4c2af721a9..422b019992 100644 --- a/flash/core/utilities/providers.py +++ b/flash/core/utilities/providers.py @@ -42,6 +42,7 @@ def __str__(self): _LEARN2LEARN = Provider("learnables/learn2learn", "https://github.com/learnables/learn2learn") _PYSTICHE = Provider("pystiche/pystiche", "https://github.com/pystiche/pystiche") _HUGGINGFACE = Provider("Hugging Face/transformers", "https://github.com/huggingface/transformers") +_SENTENCE_TRANSFORMERS = Provider("UKPLab/sentence-transformers", "https://github.com/UKPLab/sentence-transformers") _FAIRSEQ = Provider("PyTorch/fairseq", "https://github.com/pytorch/fairseq") _OPEN3D_ML = Provider("Intelligent Systems Lab Org/Open3D-ML", "https://github.com/isl-org/Open3D-ML") _PYTORCHVIDEO = Provider("Facebook Research/PyTorchVideo", "https://github.com/facebookresearch/pytorchvideo") diff --git a/flash/text/embeddings/__init__.py b/flash/text/embeddings/__init__.py new file mode 100644 index 0000000000..c49d81d2ae --- /dev/null +++ b/flash/text/embeddings/__init__.py @@ -0,0 +1 @@ +from flash.text.embeddings.model import SentenceEmbedder diff --git a/flash/text/embeddings/backbones.py b/flash/text/embeddings/backbones.py new file mode 100644 index 0000000000..7ecc98adcc --- /dev/null +++ b/flash/text/embeddings/backbones.py @@ -0,0 +1,14 @@ +from flash.core.registry import ExternalRegistry, FlashRegistry +from flash.core.utilities.imports import _TEXT_AVAILABLE +from flash.core.utilities.providers import _SENTENCE_TRANSFORMERS + +SENTENCE_TRANSFORMERS_BACKBONE = FlashRegistry("backbones") + +if _TEXT_AVAILABLE: + from sentence_transformers import SentenceTransformer + + SENTENCE_TRANSFORMERS_BACKBONE += ExternalRegistry( + SentenceTransformer, + "backbones", + _SENTENCE_TRANSFORMERS, + ) diff --git a/flash/text/embeddings/data.py b/flash/text/embeddings/data.py new file mode 100644 index 0000000000..16a20a3dac --- /dev/null +++ b/flash/text/embeddings/data.py @@ -0,0 +1 @@ +from typing import Callable, Dict, Optional, Union diff --git a/flash/text/embeddings/model.py b/flash/text/embeddings/model.py new file mode 100644 index 0000000000..be05a77ebb --- /dev/null +++ b/flash/text/embeddings/model.py @@ -0,0 +1,137 @@ +# Copyright The PyTorch Lightning team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import os
+import warnings
+from typing import Any, Callable, Dict, List, Mapping, Optional, Sequence, Tuple, Type, Union
+
+import numpy as np
+import torch
+import torchmetrics
+from pytorch_lightning import Callback
+from pytorch_lightning.utilities import rank_zero_info
+from sentence_transformers import SentenceTransformer
+from torch import nn, Tensor
+from torch.optim.lr_scheduler import _LRScheduler
+from torchmetrics import Metric
+
+from flash.core.data.data_source import DefaultDataKeys
+from flash.core.data.process import Serializer
+from flash.core.finetuning import FlashBaseFinetuning
+from flash.core.model import Task
+from flash.core.registry import FlashRegistry
+from flash.text.embeddings.backbones import SENTENCE_TRANSFORMERS_BACKBONE
+
+
+class SentenceEmbedder(Task):
+    """The ``SentenceEmbedder`` is a :class:`~flash.Task` for generating sentence embeddings, as well as training and
+    validation. For more details, see `embeddings`.
+
+    You can change the backbone to any sentence embedding model from `UKPLab/sentence-transformers
+    <https://github.com/UKPLab/sentence-transformers>`_ using the ``backbone``
+    argument.
+
+    .. note:: When changing the backbone, make sure you pass in the same backbone to the :class:`~flash.Task` and the
+        :class:`~flash.core.data.data_module.DataModule` object! Since this is a Sentence Transformers task, make sure you
+        use a Sentence Transformers model.
+
+    Args:
+        backbone: backbone model to use for the task.
+        loss_fn: Loss function for training.
+        optimizer: Optimizer to use for training, defaults to `torch.optim.Adam`.
+        optimizer_kwargs: Additional kwargs to use when creating the optimizer (if not passed as an instance).
+        scheduler: The scheduler or scheduler class to use.
+        scheduler_kwargs: Additional kwargs to use when creating the scheduler (if not passed as an instance).
+        metrics: Metrics to compute for training and evaluation. Defaults to calculating the ROUGE metric.
+            Changing this argument currently has no effect.
+ learning_rate: Learning rate to use for training, defaults to `3e-4` + enable_ort: Enable Torch ONNX Runtime Optimization: https://onnxruntime.ai/docs/#onnx-runtime-for-training + """ + + required_extras: str = "text" + + backbones: FlashRegistry = SENTENCE_TRANSFORMERS_BACKBONE + + def __init__( + self, + backbone: str = "all-MiniLM-L6-v2", + enable_ort: bool = False, + ): + + os.environ["TOKENIZERS_PARALLELISM"] = "TRUE" + # disable HF thousand warnings + warnings.simplefilter("ignore") + # set os environ variable for multiprocesses + os.environ["PYTHONWARNINGS"] = "ignore" + super().__init__( + ) + self.model = self.backbones.get(backbone)() + + def generate_embeddings( + self, + sentences: Union[str, List[str]], + batch_size: int = 32, + show_progress_bar: bool = None, + output_value: str = "sentence_embedding", + convert_to_numpy: bool = True, + convert_to_tensor: bool = False, + device: str = None, + normalize_embeddings: bool = False, + ) -> Union[List[Tensor], np.ndarray, Tensor]: + + return self.model.encode( + sentences=sentences, + batch_size=batch_size, + show_progress_bar=show_progress_bar, + output_value=output_value, + convert_to_numpy=convert_to_numpy, + convert_to_tensor=convert_to_tensor, + device=device, + normalize_embeddings=normalize_embeddings, + ) + + @property + def backbone(self): + return self.model.base_model + + def training_step(self, batch: Any, batch_idx: int) -> Any: + """For the training step, we just extract the :attr:`~flash.core.data.data_source.DefaultDataKeys.INPUT` and + :attr:`~flash.core.data.data_source.DefaultDataKeys.TARGET` keys from the input and forward them to the + :meth:`~flash.core.model.Task.training_step`.""" + batch = (batch[DefaultDataKeys.INPUT], batch[DefaultDataKeys.TARGET]) + return super().training_step(batch, batch_idx) + + def validation_step(self, batch: Any, batch_idx: int) -> Any: + """For the validation step, we just extract the :attr:`~flash.core.data.data_source.DefaultDataKeys.INPUT` and + :attr:`~flash.core.data.data_source.DefaultDataKeys.TARGET` keys from the input and forward them to the + :meth:`~flash.core.model.Task.validation_step`.""" + batch = (batch[DefaultDataKeys.INPUT], batch[DefaultDataKeys.TARGET]) + return super().validation_step(batch, batch_idx) + + def test_step(self, batch: Any, batch_idx: int) -> Any: + """For the test step, we just extract the :attr:`~flash.core.data.data_source.DefaultDataKeys.INPUT` and + :attr:`~flash.core.data.data_source.DefaultDataKeys.TARGET` keys from the input and forward them to the + :meth:`~flash.core.model.Task.test_step`.""" + batch = (batch[DefaultDataKeys.INPUT], batch[DefaultDataKeys.TARGET]) + return super().test_step(batch, batch_idx) + + def predict_step(self, batch: Any, batch_idx: int, dataloader_idx: int = 0) -> Any: + """For the predict step, we just extract the :attr:`~flash.core.data.data_source.DefaultDataKeys.INPUT` key + from the input and forward it to the :meth:`~flash.core.model.Task.predict_step`.""" + batch = batch[DefaultDataKeys.INPUT] + return super().predict_step(batch, batch_idx, dataloader_idx=dataloader_idx) + + def forward(self, x) -> torch.Tensor: + """First call the backbone, then the model head.""" + x = self.backbone(x) + return self.head(x) From 83fbf1efafe416c7c6061b4da6400b08ef43eba2 Mon Sep 17 00:00:00 2001 From: Abhijith Neil Abraham Date: Wed, 24 Nov 2021 21:08:05 +0530 Subject: [PATCH 02/23] remove train, test and pred step --- flash/text/embeddings/model.py | 27 --------------------------- 1 file changed, 27 deletions(-) diff 
--git a/flash/text/embeddings/model.py b/flash/text/embeddings/model.py index be05a77ebb..828a62087f 100644 --- a/flash/text/embeddings/model.py +++ b/flash/text/embeddings/model.py @@ -104,33 +104,6 @@ def generate_embeddings( def backbone(self): return self.model.base_model - def training_step(self, batch: Any, batch_idx: int) -> Any: - """For the training step, we just extract the :attr:`~flash.core.data.data_source.DefaultDataKeys.INPUT` and - :attr:`~flash.core.data.data_source.DefaultDataKeys.TARGET` keys from the input and forward them to the - :meth:`~flash.core.model.Task.training_step`.""" - batch = (batch[DefaultDataKeys.INPUT], batch[DefaultDataKeys.TARGET]) - return super().training_step(batch, batch_idx) - - def validation_step(self, batch: Any, batch_idx: int) -> Any: - """For the validation step, we just extract the :attr:`~flash.core.data.data_source.DefaultDataKeys.INPUT` and - :attr:`~flash.core.data.data_source.DefaultDataKeys.TARGET` keys from the input and forward them to the - :meth:`~flash.core.model.Task.validation_step`.""" - batch = (batch[DefaultDataKeys.INPUT], batch[DefaultDataKeys.TARGET]) - return super().validation_step(batch, batch_idx) - - def test_step(self, batch: Any, batch_idx: int) -> Any: - """For the test step, we just extract the :attr:`~flash.core.data.data_source.DefaultDataKeys.INPUT` and - :attr:`~flash.core.data.data_source.DefaultDataKeys.TARGET` keys from the input and forward them to the - :meth:`~flash.core.model.Task.test_step`.""" - batch = (batch[DefaultDataKeys.INPUT], batch[DefaultDataKeys.TARGET]) - return super().test_step(batch, batch_idx) - - def predict_step(self, batch: Any, batch_idx: int, dataloader_idx: int = 0) -> Any: - """For the predict step, we just extract the :attr:`~flash.core.data.data_source.DefaultDataKeys.INPUT` key - from the input and forward it to the :meth:`~flash.core.model.Task.predict_step`.""" - batch = batch[DefaultDataKeys.INPUT] - return super().predict_step(batch, batch_idx, dataloader_idx=dataloader_idx) - def forward(self, x) -> torch.Tensor: """First call the backbone, then the model head.""" x = self.backbone(x) From 4ad3abb2ccd1e44d852a0dc1232c4a655ca7b886 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Wed, 24 Nov 2021 15:39:33 +0000 Subject: [PATCH 03/23] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- flash/text/embeddings/model.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/flash/text/embeddings/model.py b/flash/text/embeddings/model.py index 828a62087f..9b5d22cdf4 100644 --- a/flash/text/embeddings/model.py +++ b/flash/text/embeddings/model.py @@ -73,8 +73,7 @@ def __init__( warnings.simplefilter("ignore") # set os environ variable for multiprocesses os.environ["PYTHONWARNINGS"] = "ignore" - super().__init__( - ) + super().__init__() self.model = self.backbones.get(backbone)() def generate_embeddings( From 8cd939567247e987bae267335a07988557862e85 Mon Sep 17 00:00:00 2001 From: Abhijith Neil Abraham Date: Sun, 5 Dec 2021 17:06:23 +0530 Subject: [PATCH 04/23] sentence embedders with forward step and predict step --- flash/text/embeddings/backbones.py | 20 ++- flash/text/embeddings/data.py | 221 ++++++++++++++++++++++++++++- flash/text/embeddings/model.py | 161 ++++++++++++++++----- requirements.txt | 1 + 4 files changed, 363 insertions(+), 40 deletions(-) diff --git a/flash/text/embeddings/backbones.py b/flash/text/embeddings/backbones.py index 
7ecc98adcc..002b1c69d3 100644 --- a/flash/text/embeddings/backbones.py +++ b/flash/text/embeddings/backbones.py @@ -1,14 +1,24 @@ from flash.core.registry import ExternalRegistry, FlashRegistry from flash.core.utilities.imports import _TEXT_AVAILABLE -from flash.core.utilities.providers import _SENTENCE_TRANSFORMERS +from flash.core.utilities.providers import _HUGGINGFACE SENTENCE_TRANSFORMERS_BACKBONE = FlashRegistry("backbones") if _TEXT_AVAILABLE: - from sentence_transformers import SentenceTransformer + from transformers import AutoModel, AutoTokenizer, AutoConfig - SENTENCE_TRANSFORMERS_BACKBONE += ExternalRegistry( - SentenceTransformer, + AUTOMODEL_BACKBONE = ExternalRegistry( + AutoModel.from_pretrained, "backbones", - _SENTENCE_TRANSFORMERS, + _HUGGINGFACE, ) + AUTOTOKENIZER_BACKBONE= ExternalRegistry( + AutoTokenizer.from_pretrained, + "backbones", + _HUGGINGFACE, + ) + AUTOCONFIG_BACKBONE=ExternalRegistry( + AutoConfig.from_pretrained, + "backbones", + _HUGGINGFACE, + ) diff --git a/flash/text/embeddings/data.py b/flash/text/embeddings/data.py index 16a20a3dac..84232df9e0 100644 --- a/flash/text/embeddings/data.py +++ b/flash/text/embeddings/data.py @@ -1 +1,220 @@ -from typing import Callable, Dict, Optional, Union +# Copyright The PyTorch Lightning team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+from functools import partial +from typing import Any, Callable, Dict, List, Mapping, Optional, Sequence, Tuple, Union + +import torch +from pandas.core.frame import DataFrame + +import flash +from flash.core.data.auto_dataset import AutoDataset +from flash.core.data.io.input import DataKeys, Input, LabelsState +from flash.core.utilities.imports import _TEXT_AVAILABLE, requires + +if _TEXT_AVAILABLE: + from datasets import Dataset, load_dataset + from transformers import AutoTokenizer + + + +class TextInput(Input): + @requires("text") + def __init__(self, backbone: str, max_length: int = 128): + super().__init__() + + self.backbone = backbone + self.tokenizer = AutoTokenizer.from_pretrained(backbone, use_fast=True) + self.max_length = max_length + + def _tokenize_fn( + self, + ex: Union[Dict[str, str], str], + input: Optional[str] = None, + ) -> Callable: + """This function is used to tokenize sentences using the provided tokenizer.""" + return self.tokenizer(ex[input], max_length=self.max_length, truncation=True, padding="max_length") + + @staticmethod + def _transform_label(label_to_class_mapping: Dict[str, int], target: str, ex: Dict[str, Union[int, str]]): + ex[target] = label_to_class_mapping[ex[target]] + return ex + + @staticmethod + def _multilabel_target(targets: List[str], element: Dict[str, Any]) -> Dict[str, Any]: + targets = [element.pop(target) for target in targets] + element[DataKeys.TARGET] = targets + return element + + def _to_hf_dataset(self, data) -> Sequence[Mapping[str, Any]]: + """account for flash CI testing context.""" + hf_dataset, *other = self.to_hf_dataset(data) + + if flash._IS_TESTING and not torch.cuda.is_available(): + # NOTE: must subset in this way to return a Dataset + hf_dataset = hf_dataset.select(range(20)) + + return (hf_dataset, *other) + + def load_data( + self, + data: Tuple[str, Union[str, List[str]], Union[str, List[str]]], + dataset: Optional[Any] = None, + ) -> Sequence[Mapping[str, Any]]: + """Loads data into HuggingFace datasets.Dataset.""" + + hf_dataset, input, *other = self._to_hf_dataset(data) + + if not self.predicting: + target: Union[str, List[str]] = other.pop() + if isinstance(target, List): + # multi-target + dataset.multi_label = True + hf_dataset = hf_dataset.map(partial(self._multilabel_target, target)) # NOTE: renames target column + dataset.num_classes = len(target) + self.set_state(LabelsState(target)) + else: + dataset.multi_label = False + if self.training: + labels = list(sorted(list(set(hf_dataset[target])))) + dataset.num_classes = len(labels) + self.set_state(LabelsState(labels)) + + labels = self.get_state(LabelsState) + + # convert labels to ids (note: the target column get overwritten) + if labels is not None: + labels = labels.labels + label_to_class_mapping = {v: k for k, v in enumerate(labels)} + hf_dataset = hf_dataset.map(partial(self._transform_label, label_to_class_mapping, target)) + + # rename label column + hf_dataset = hf_dataset.rename_column(target, DataKeys.TARGET) + + # remove extra columns + extra_columns = set(hf_dataset.column_names) - {input, DataKeys.TARGET} + hf_dataset = hf_dataset.remove_columns(extra_columns) + + # tokenize + hf_dataset = hf_dataset.map(partial(self._tokenize_fn, input=input), batched=True, remove_columns=[input]) + + # set format + hf_dataset.set_format("torch") + + return hf_dataset + + def predict_load_data(self, data: Any, dataset: AutoDataset): + return self.load_data(data, dataset) + + def __getstate__(self): # TODO: Find out why this is being pickled + state = 
self.__dict__.copy() + state.pop("tokenizer") + return state + + def __setstate__(self, state): + self.__dict__.update(state) + self.tokenizer = AutoTokenizer.from_pretrained(self.backbone, use_fast=True) + + +class TextCSVInput(TextInput): + def to_hf_dataset(self, data: Tuple[str, str, str]) -> Tuple[Sequence[Mapping[str, Any]], str, str]: + file, *other = data + dataset_dict = load_dataset("csv", data_files={"train": str(file)}) + return (dataset_dict["train"], *other) + + +class TextJSONInput(TextInput): + def to_hf_dataset(self, data: Tuple[str, str, str, str]) -> Tuple[Sequence[Mapping[str, Any]], str, str]: + file, *other, field = data + dataset_dict = load_dataset("json", data_files={"train": str(file)}, field=field) + return (dataset_dict["train"], *other) + + +class TextDataFrameInput(TextInput): + def to_hf_dataset(self, data: Tuple[DataFrame, str, str]) -> Tuple[Sequence[Mapping[str, Any]], str, str]: + df, *other = data + hf_dataset = Dataset.from_pandas(df) + return (hf_dataset, *other) + + +class TextParquetInput(TextInput): + def to_hf_dataset(self, data: Tuple[str, str, str]) -> Tuple[Sequence[Mapping[str, Any]], str, str]: + file, *other = data + hf_dataset = Dataset.from_parquet(str(file)) + return (hf_dataset, *other) + + +class TextHuggingFaceDatasetInput(TextInput): + def to_hf_dataset(self, data: Tuple[str, str, str]) -> Tuple[Sequence[Mapping[str, Any]], str, str]: + hf_dataset, *other = data + return (hf_dataset, *other) + + +class TextListInput(TextInput): + def to_hf_dataset( + self, data: Union[Tuple[List[str], List[str]], List[str]] + ) -> Tuple[Sequence[Mapping[str, Any]], Optional[List[str]]]: + + if isinstance(data, tuple): + input_list, target_list = data + # NOTE: here we already deal with multilabels + # NOTE: here we already rename to correct column names + hf_dataset = Dataset.from_dict({DataKeys.INPUT: input_list, DataKeys.TARGET: target_list}) + return hf_dataset, target_list + + # predicting + hf_dataset = Dataset.from_dict({DataKeys.INPUT: data}) + + return (hf_dataset,) + + def load_data( + self, + data: Tuple[List[str], Union[List[Any], List[List[Any]]]], + dataset: Optional[Any] = None, + ) -> Sequence[Mapping[str, Any]]: + + hf_dataset, *other = self._to_hf_dataset(data) + + if not self.predicting: + target_list = other.pop() + if isinstance(target_list[0], List): + # multi-target_list + dataset.multi_label = True + dataset.num_classes = len(target_list[0]) + self.set_state(LabelsState(target_list)) + else: + dataset.multi_label = False + if self.training: + labels = list(sorted(list(set(hf_dataset[DataKeys.TARGET])))) + dataset.num_classes = len(labels) + self.set_state(LabelsState(labels)) + + labels = self.get_state(LabelsState) + + # convert labels to ids + if labels is not None: + labels = labels.labels + label_to_class_mapping = {v: k for k, v in enumerate(labels)} + # happens in-place and keeps the target column name + hf_dataset = hf_dataset.map(partial(self._transform_label, label_to_class_mapping, DataKeys.TARGET)) + + # tokenize + hf_dataset = hf_dataset.map(partial(self._tokenize_fn, input=DataKeys.INPUT), batched=True) + + # set format + hf_dataset = hf_dataset.remove_columns([DataKeys.INPUT]) # just leave the numerical columns + hf_dataset.set_format("torch") + + return hf_dataset + + diff --git a/flash/text/embeddings/model.py b/flash/text/embeddings/model.py index 9b5d22cdf4..090a581d90 100644 --- a/flash/text/embeddings/model.py +++ b/flash/text/embeddings/model.py @@ -14,24 +14,24 @@ import os import warnings from typing import 
Any, Callable, Dict, List, Mapping, Optional, Sequence, Tuple, Type, Union - +import logging import numpy as np import torch import torchmetrics from pytorch_lightning import Callback from pytorch_lightning.utilities import rank_zero_info -from sentence_transformers import SentenceTransformer from torch import nn, Tensor from torch.optim.lr_scheduler import _LRScheduler from torchmetrics import Metric - +from tqdm.autonotebook import trange from flash.core.data.data_source import DefaultDataKeys from flash.core.data.process import Serializer from flash.core.finetuning import FlashBaseFinetuning from flash.core.model import Task from flash.core.registry import FlashRegistry -from flash.text.embeddings.backbones import SENTENCE_TRANSFORMERS_BACKBONE +from flash.text.embeddings.backbones import AUTOMODEL_BACKBONE,AUTOTOKENIZER_BACKBONE,AUTOCONFIG_BACKBONE +logger = logging.getLogger(__name__) class SentenceEmbedder(Task): """The ``SentenceEmbedder`` is a :class:`~flash.Task` for generating sentence embeddings, training and @@ -60,50 +60,143 @@ class SentenceEmbedder(Task): required_extras: str = "text" - backbones: FlashRegistry = SENTENCE_TRANSFORMERS_BACKBONE + AutoModel_Backbones: FlashRegistry = AUTOMODEL_BACKBONE + AutoTokenizer_Backbones: FlashRegistry = AUTOTOKENIZER_BACKBONE + AutoConfig_Backbones: FlashRegistry = AUTOCONFIG_BACKBONE def __init__( self, - backbone: str = "all-MiniLM-L6-v2", + model_backbone: str = "sentence-transformers/all-MiniLM-L6-v2", + tokenizer_backbone:str = "sentence-transformers/all-MiniLM-L6-v2", enable_ort: bool = False, ): - os.environ["TOKENIZERS_PARALLELISM"] = "TRUE" # disable HF thousand warnings warnings.simplefilter("ignore") # set os environ variable for multiprocesses os.environ["PYTHONWARNINGS"] = "ignore" super().__init__() - self.model = self.backbones.get(backbone)() - - def generate_embeddings( - self, - sentences: Union[str, List[str]], - batch_size: int = 32, - show_progress_bar: bool = None, - output_value: str = "sentence_embedding", - convert_to_numpy: bool = True, - convert_to_tensor: bool = False, - device: str = None, - normalize_embeddings: bool = False, - ) -> Union[List[Tensor], np.ndarray, Tensor]: - - return self.model.encode( - sentences=sentences, - batch_size=batch_size, - show_progress_bar=show_progress_bar, - output_value=output_value, - convert_to_numpy=convert_to_numpy, - convert_to_tensor=convert_to_tensor, - device=device, - normalize_embeddings=normalize_embeddings, - ) + + self.config=self.AutoConfig_Backbones.get(model_backbone) + self.auto_model = self.AutoModel_Backbones.get(model_backbone) + self.tokenzier=self.AutoTokenizer_Backbones.get(tokenizer_backbone) + + if tokenizer_backbone is not None: + self.auto_model.config.tokenizer_class = self.tokenizer.__class__.__name__ + + def predict_step(self, sentences: Union[str, List[str]], + batch_size: int = 32, + show_progress_bar: bool = None, + output_value: str = 'sentence_embedding', + convert_to_numpy: bool = True, + convert_to_tensor: bool = False, + device: str = None, + normalize_embeddings: bool = False) -> Union[List[Tensor], np.ndarray, Tensor]: + """ + Computes sentence embeddings + :param sentences: the sentences to embed + :param batch_size: the batch size used for the computation + :param show_progress_bar: Output a progress bar when encode sentences + :param output_value: Default sentence_embedding, to get sentence embeddings. Can be set to token_embeddings to get wordpiece token embeddings. 
Set to None, to get all output values + :param convert_to_numpy: If true, the output is a list of numpy vectors. Else, it is a list of pytorch tensors. + :param convert_to_tensor: If true, you get one large tensor as return. Overwrites any setting from convert_to_numpy + :param device: Which torch.device to use for the computation + :param normalize_embeddings: If set to true, returned vectors will have length 1. In that case, the faster dot-product (util.dot_score) instead of cosine similarity can be used. + :return: + By default, a list of tensors is returned. If convert_to_tensor, a stacked tensor is returned. If convert_to_numpy, a numpy matrix is returned. + """ + self.eval() + if show_progress_bar is None: + show_progress_bar = (logger.getEffectiveLevel()==logging.INFO or logger.getEffectiveLevel()==logging.DEBUG) + + if convert_to_tensor: + convert_to_numpy = False + + if output_value != 'sentence_embedding': + convert_to_tensor = False + convert_to_numpy = False + + input_was_string = False + if isinstance(sentences, str) or not hasattr(sentences, '__len__'): #Cast an individual sentence to a list with length 1 + sentences = [sentences] + input_was_string = True + + if device is None: + device = self._target_device + + self.to(device) + + all_embeddings = [] + length_sorted_idx = np.argsort([-self._text_length(sen) for sen in sentences]) + sentences_sorted = [sentences[idx] for idx in length_sorted_idx] + + for start_index in trange(0, len(sentences), batch_size, desc="Batches", disable=not show_progress_bar): + sentences_batch = sentences_sorted[start_index:start_index+batch_size] + features = self.tokenize(sentences_batch) + + with torch.no_grad(): + out_features = self.forward(features) + + if output_value == 'token_embeddings': + embeddings = [] + for token_emb, attention in zip(out_features[output_value], out_features['attention_mask']): + last_mask_id = len(attention)-1 + while last_mask_id > 0 and attention[last_mask_id].item() == 0: + last_mask_id -= 1 + + embeddings.append(token_emb[0:last_mask_id+1]) + elif output_value is None: #Return all outputs + embeddings = [] + for sent_idx in range(len(out_features['sentence_embedding'])): + row = {name: out_features[name][sent_idx] for name in out_features} + embeddings.append(row) + else: #Sentence embeddings + embeddings = out_features[output_value] + embeddings = embeddings.detach() + if normalize_embeddings: + embeddings = torch.nn.functional.normalize(embeddings, p=2, dim=1) + + # fixes for #522 and #487 to avoid oom problems on gpu with large datasets + if convert_to_numpy: + embeddings = embeddings.cpu() + + all_embeddings.extend(embeddings) + + all_embeddings = [all_embeddings[idx] for idx in np.argsort(length_sorted_idx)] + + if convert_to_tensor: + all_embeddings = torch.stack(all_embeddings) + elif convert_to_numpy: + all_embeddings = np.asarray([emb.numpy() for emb in all_embeddings]) + + if input_was_string: + all_embeddings = all_embeddings[0] + + return all_embeddings @property def backbone(self): return self.model.base_model - def forward(self, x) -> torch.Tensor: + def forward(self,batch: Dict[str, torch.Tensor]) -> torch.Tensor: """First call the backbone, then the model head.""" - x = self.backbone(x) - return self.head(x) + + trans_features = {'input_ids': batch['input_ids'], 'attention_mask': batch['attention_mask']} + if 'token_type_ids' in batch: + trans_features['token_type_ids'] = batch['token_type_ids'] + + output_states = self.auto_model(**trans_features, return_dict=False) + output_tokens = 
output_states[0] + + batch.update({'token_embeddings': output_tokens, 'attention_mask': batch['attention_mask']}) + + if self.auto_model.config.output_hidden_states: + all_layer_idx = 2 + if len(output_states) < 3: #Some models only output last_hidden_states and all_hidden_states + all_layer_idx = 1 + + hidden_states = output_states[all_layer_idx] + batch.update({'all_layer_embeddings': hidden_states}) + + return batch + diff --git a/requirements.txt b/requirements.txt index 0920edcb4c..333125ff76 100644 --- a/requirements.txt +++ b/requirements.txt @@ -7,3 +7,4 @@ pyDeprecate pandas>=1.1.0 jsonargparse[signatures]>=3.17.0 click>=7.1.2 +tqdm From 34f39d1bd1c049875486918d7b5ae4818b1eb28a Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Sun, 5 Dec 2021 11:37:21 +0000 Subject: [PATCH 05/23] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- flash/text/embeddings/backbones.py | 14 +- flash/text/embeddings/data.py | 3 - flash/text/embeddings/model.py | 229 +++++++++++++++-------------- 3 files changed, 126 insertions(+), 120 deletions(-) diff --git a/flash/text/embeddings/backbones.py b/flash/text/embeddings/backbones.py index 002b1c69d3..6fbf4036f2 100644 --- a/flash/text/embeddings/backbones.py +++ b/flash/text/embeddings/backbones.py @@ -5,20 +5,20 @@ SENTENCE_TRANSFORMERS_BACKBONE = FlashRegistry("backbones") if _TEXT_AVAILABLE: - from transformers import AutoModel, AutoTokenizer, AutoConfig + from transformers import AutoConfig, AutoModel, AutoTokenizer AUTOMODEL_BACKBONE = ExternalRegistry( AutoModel.from_pretrained, "backbones", _HUGGINGFACE, ) - AUTOTOKENIZER_BACKBONE= ExternalRegistry( + AUTOTOKENIZER_BACKBONE = ExternalRegistry( AutoTokenizer.from_pretrained, "backbones", _HUGGINGFACE, ) - AUTOCONFIG_BACKBONE=ExternalRegistry( - AutoConfig.from_pretrained, - "backbones", - _HUGGINGFACE, - ) + AUTOCONFIG_BACKBONE = ExternalRegistry( + AutoConfig.from_pretrained, + "backbones", + _HUGGINGFACE, + ) diff --git a/flash/text/embeddings/data.py b/flash/text/embeddings/data.py index 84232df9e0..1be078354d 100644 --- a/flash/text/embeddings/data.py +++ b/flash/text/embeddings/data.py @@ -25,7 +25,6 @@ if _TEXT_AVAILABLE: from datasets import Dataset, load_dataset from transformers import AutoTokenizer - class TextInput(Input): @@ -216,5 +215,3 @@ def load_data( hf_dataset.set_format("torch") return hf_dataset - - diff --git a/flash/text/embeddings/model.py b/flash/text/embeddings/model.py index 090a581d90..f1b2165fae 100644 --- a/flash/text/embeddings/model.py +++ b/flash/text/embeddings/model.py @@ -11,10 +11,11 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
+import logging import os import warnings from typing import Any, Callable, Dict, List, Mapping, Optional, Sequence, Tuple, Type, Union -import logging + import numpy as np import torch import torchmetrics @@ -24,15 +25,17 @@ from torch.optim.lr_scheduler import _LRScheduler from torchmetrics import Metric from tqdm.autonotebook import trange + from flash.core.data.data_source import DefaultDataKeys from flash.core.data.process import Serializer from flash.core.finetuning import FlashBaseFinetuning from flash.core.model import Task from flash.core.registry import FlashRegistry -from flash.text.embeddings.backbones import AUTOMODEL_BACKBONE,AUTOTOKENIZER_BACKBONE,AUTOCONFIG_BACKBONE +from flash.text.embeddings.backbones import AUTOCONFIG_BACKBONE, AUTOMODEL_BACKBONE, AUTOTOKENIZER_BACKBONE logger = logging.getLogger(__name__) + class SentenceEmbedder(Task): """The ``SentenceEmbedder`` is a :class:`~flash.Task` for generating sentence embeddings, training and validation. For more details, see `embeddings`. @@ -67,7 +70,7 @@ class SentenceEmbedder(Task): def __init__( self, model_backbone: str = "sentence-transformers/all-MiniLM-L6-v2", - tokenizer_backbone:str = "sentence-transformers/all-MiniLM-L6-v2", + tokenizer_backbone: str = "sentence-transformers/all-MiniLM-L6-v2", enable_ort: bool = False, ): os.environ["TOKENIZERS_PARALLELISM"] = "TRUE" @@ -76,127 +79,133 @@ def __init__( # set os environ variable for multiprocesses os.environ["PYTHONWARNINGS"] = "ignore" super().__init__() - - self.config=self.AutoConfig_Backbones.get(model_backbone) + + self.config = self.AutoConfig_Backbones.get(model_backbone) self.auto_model = self.AutoModel_Backbones.get(model_backbone) - self.tokenzier=self.AutoTokenizer_Backbones.get(tokenizer_backbone) - + self.tokenzier = self.AutoTokenizer_Backbones.get(tokenizer_backbone) + if tokenizer_backbone is not None: self.auto_model.config.tokenizer_class = self.tokenizer.__class__.__name__ - - def predict_step(self, sentences: Union[str, List[str]], - batch_size: int = 32, - show_progress_bar: bool = None, - output_value: str = 'sentence_embedding', - convert_to_numpy: bool = True, - convert_to_tensor: bool = False, - device: str = None, - normalize_embeddings: bool = False) -> Union[List[Tensor], np.ndarray, Tensor]: - """ - Computes sentence embeddings - :param sentences: the sentences to embed - :param batch_size: the batch size used for the computation - :param show_progress_bar: Output a progress bar when encode sentences - :param output_value: Default sentence_embedding, to get sentence embeddings. Can be set to token_embeddings to get wordpiece token embeddings. Set to None, to get all output values - :param convert_to_numpy: If true, the output is a list of numpy vectors. Else, it is a list of pytorch tensors. - :param convert_to_tensor: If true, you get one large tensor as return. Overwrites any setting from convert_to_numpy - :param device: Which torch.device to use for the computation - :param normalize_embeddings: If set to true, returned vectors will have length 1. In that case, the faster dot-product (util.dot_score) instead of cosine similarity can be used. - :return: - By default, a list of tensors is returned. If convert_to_tensor, a stacked tensor is returned. If convert_to_numpy, a numpy matrix is returned. 
- """ - self.eval() - if show_progress_bar is None: - show_progress_bar = (logger.getEffectiveLevel()==logging.INFO or logger.getEffectiveLevel()==logging.DEBUG) - - if convert_to_tensor: - convert_to_numpy = False - - if output_value != 'sentence_embedding': - convert_to_tensor = False - convert_to_numpy = False - - input_was_string = False - if isinstance(sentences, str) or not hasattr(sentences, '__len__'): #Cast an individual sentence to a list with length 1 - sentences = [sentences] - input_was_string = True - - if device is None: - device = self._target_device - - self.to(device) - - all_embeddings = [] - length_sorted_idx = np.argsort([-self._text_length(sen) for sen in sentences]) - sentences_sorted = [sentences[idx] for idx in length_sorted_idx] - - for start_index in trange(0, len(sentences), batch_size, desc="Batches", disable=not show_progress_bar): - sentences_batch = sentences_sorted[start_index:start_index+batch_size] - features = self.tokenize(sentences_batch) - - with torch.no_grad(): - out_features = self.forward(features) - - if output_value == 'token_embeddings': - embeddings = [] - for token_emb, attention in zip(out_features[output_value], out_features['attention_mask']): - last_mask_id = len(attention)-1 - while last_mask_id > 0 and attention[last_mask_id].item() == 0: - last_mask_id -= 1 - - embeddings.append(token_emb[0:last_mask_id+1]) - elif output_value is None: #Return all outputs - embeddings = [] - for sent_idx in range(len(out_features['sentence_embedding'])): - row = {name: out_features[name][sent_idx] for name in out_features} - embeddings.append(row) - else: #Sentence embeddings - embeddings = out_features[output_value] - embeddings = embeddings.detach() - if normalize_embeddings: - embeddings = torch.nn.functional.normalize(embeddings, p=2, dim=1) - - # fixes for #522 and #487 to avoid oom problems on gpu with large datasets - if convert_to_numpy: - embeddings = embeddings.cpu() - - all_embeddings.extend(embeddings) - - all_embeddings = [all_embeddings[idx] for idx in np.argsort(length_sorted_idx)] - - if convert_to_tensor: - all_embeddings = torch.stack(all_embeddings) - elif convert_to_numpy: - all_embeddings = np.asarray([emb.numpy() for emb in all_embeddings]) - - if input_was_string: - all_embeddings = all_embeddings[0] - - return all_embeddings + + def predict_step( + self, + sentences: Union[str, List[str]], + batch_size: int = 32, + show_progress_bar: bool = None, + output_value: str = "sentence_embedding", + convert_to_numpy: bool = True, + convert_to_tensor: bool = False, + device: str = None, + normalize_embeddings: bool = False, + ) -> Union[List[Tensor], np.ndarray, Tensor]: + """Computes sentence embeddings. + + :param sentences: the sentences to embed + :param batch_size: the batch size used for the computation + :param show_progress_bar: Output a progress bar when encode sentences + :param output_value: Default sentence_embedding, to get sentence embeddings. Can be set to token_embeddings to get wordpiece token embeddings. Set to None, to get all output values + :param convert_to_numpy: If true, the output is a list of numpy vectors. Else, it is a list of pytorch tensors. + :param convert_to_tensor: If true, you get one large tensor as return. Overwrites any setting from convert_to_numpy + :param device: Which torch.device to use for the computation + :param normalize_embeddings: If set to true, returned vectors will have length 1. In that case, the faster dot-product (util.dot_score) instead of cosine similarity can be used. 
+ :return: + By default, a list of tensors is returned. If convert_to_tensor, a stacked tensor is returned. If convert_to_numpy, a numpy matrix is returned. + """ + self.eval() + if show_progress_bar is None: + show_progress_bar = ( + logger.getEffectiveLevel() == logging.INFO or logger.getEffectiveLevel() == logging.DEBUG + ) + + if convert_to_tensor: + convert_to_numpy = False + + if output_value != "sentence_embedding": + convert_to_tensor = False + convert_to_numpy = False + + input_was_string = False + if isinstance(sentences, str) or not hasattr( + sentences, "__len__" + ): # Cast an individual sentence to a list with length 1 + sentences = [sentences] + input_was_string = True + + if device is None: + device = self._target_device + + self.to(device) + + all_embeddings = [] + length_sorted_idx = np.argsort([-self._text_length(sen) for sen in sentences]) + sentences_sorted = [sentences[idx] for idx in length_sorted_idx] + + for start_index in trange(0, len(sentences), batch_size, desc="Batches", disable=not show_progress_bar): + sentences_batch = sentences_sorted[start_index : start_index + batch_size] + features = self.tokenize(sentences_batch) + + with torch.no_grad(): + out_features = self.forward(features) + + if output_value == "token_embeddings": + embeddings = [] + for token_emb, attention in zip(out_features[output_value], out_features["attention_mask"]): + last_mask_id = len(attention) - 1 + while last_mask_id > 0 and attention[last_mask_id].item() == 0: + last_mask_id -= 1 + + embeddings.append(token_emb[0 : last_mask_id + 1]) + elif output_value is None: # Return all outputs + embeddings = [] + for sent_idx in range(len(out_features["sentence_embedding"])): + row = {name: out_features[name][sent_idx] for name in out_features} + embeddings.append(row) + else: # Sentence embeddings + embeddings = out_features[output_value] + embeddings = embeddings.detach() + if normalize_embeddings: + embeddings = torch.nn.functional.normalize(embeddings, p=2, dim=1) + + # fixes for #522 and #487 to avoid oom problems on gpu with large datasets + if convert_to_numpy: + embeddings = embeddings.cpu() + + all_embeddings.extend(embeddings) + + all_embeddings = [all_embeddings[idx] for idx in np.argsort(length_sorted_idx)] + + if convert_to_tensor: + all_embeddings = torch.stack(all_embeddings) + elif convert_to_numpy: + all_embeddings = np.asarray([emb.numpy() for emb in all_embeddings]) + + if input_was_string: + all_embeddings = all_embeddings[0] + + return all_embeddings @property def backbone(self): return self.model.base_model - def forward(self,batch: Dict[str, torch.Tensor]) -> torch.Tensor: + def forward(self, batch: Dict[str, torch.Tensor]) -> torch.Tensor: """First call the backbone, then the model head.""" - trans_features = {'input_ids': batch['input_ids'], 'attention_mask': batch['attention_mask']} - if 'token_type_ids' in batch: - trans_features['token_type_ids'] = batch['token_type_ids'] - + trans_features = {"input_ids": batch["input_ids"], "attention_mask": batch["attention_mask"]} + if "token_type_ids" in batch: + trans_features["token_type_ids"] = batch["token_type_ids"] + output_states = self.auto_model(**trans_features, return_dict=False) output_tokens = output_states[0] - - batch.update({'token_embeddings': output_tokens, 'attention_mask': batch['attention_mask']}) - + + batch.update({"token_embeddings": output_tokens, "attention_mask": batch["attention_mask"]}) + if self.auto_model.config.output_hidden_states: all_layer_idx = 2 - if len(output_states) < 3: #Some models 
only output last_hidden_states and all_hidden_states + if len(output_states) < 3: # Some models only output last_hidden_states and all_hidden_states all_layer_idx = 1 - + hidden_states = output_states[all_layer_idx] - batch.update({'all_layer_embeddings': hidden_states}) - - return batch + batch.update({"all_layer_embeddings": hidden_states}) + return batch From 219042aee524c652c4ad73e0dbf2e13d780707d0 Mon Sep 17 00:00:00 2001 From: Abhijith Neil Abraham Date: Sun, 5 Dec 2021 22:23:51 +0530 Subject: [PATCH 06/23] Update __init__.py --- flash/text/embeddings/__init__.py | 1 + 1 file changed, 1 insertion(+) diff --git a/flash/text/embeddings/__init__.py b/flash/text/embeddings/__init__.py index c49d81d2ae..5232963676 100644 --- a/flash/text/embeddings/__init__.py +++ b/flash/text/embeddings/__init__.py @@ -1 +1,2 @@ from flash.text.embeddings.model import SentenceEmbedder +from flash.text.embeddings.data import TextInput,TextCSVInput,TextJSONInput,TextDataFrameInput,TextParquetInput,TextHuggingFaceDatasetInput,TextListInput From 4b3c772b7e905e35d2dd1d4c78359ab5b139fcf3 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Sun, 5 Dec 2021 16:54:39 +0000 Subject: [PATCH 07/23] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- flash/text/embeddings/__init__.py | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/flash/text/embeddings/__init__.py b/flash/text/embeddings/__init__.py index 5232963676..3993b0a526 100644 --- a/flash/text/embeddings/__init__.py +++ b/flash/text/embeddings/__init__.py @@ -1,2 +1,10 @@ +from flash.text.embeddings.data import ( + TextCSVInput, + TextDataFrameInput, + TextHuggingFaceDatasetInput, + TextInput, + TextJSONInput, + TextListInput, + TextParquetInput, +) from flash.text.embeddings.model import SentenceEmbedder -from flash.text.embeddings.data import TextInput,TextCSVInput,TextJSONInput,TextDataFrameInput,TextParquetInput,TextHuggingFaceDatasetInput,TextListInput From 06e35bdc664a103c13c0532fea681d700605c6a1 Mon Sep 17 00:00:00 2001 From: Ethan Harris Date: Wed, 8 Dec 2021 17:15:17 +0000 Subject: [PATCH 08/23] Updates --- flash/core/utilities/imports.py | 2 + flash/text/__init__.py | 1 + flash/text/embedding/__init__.py | 1 + flash/text/embedding/backbones.py | 14 ++ flash/text/embedding/model.py | 108 ++++++++++++++ flash/text/embeddings/__init__.py | 10 -- flash/text/embeddings/backbones.py | 24 ---- flash/text/embeddings/data.py | 217 ----------------------------- flash/text/embeddings/model.py | 211 ---------------------------- flash_examples/text_embedder.py | 37 +++++ requirements/datatype_text.txt | 1 + 11 files changed, 164 insertions(+), 462 deletions(-) create mode 100644 flash/text/embedding/__init__.py create mode 100644 flash/text/embedding/backbones.py create mode 100644 flash/text/embedding/model.py delete mode 100644 flash/text/embeddings/__init__.py delete mode 100644 flash/text/embeddings/backbones.py delete mode 100644 flash/text/embeddings/data.py delete mode 100644 flash/text/embeddings/model.py create mode 100644 flash_examples/text_embedder.py diff --git a/flash/core/utilities/imports.py b/flash/core/utilities/imports.py index 581c5cd719..082731f8d0 100644 --- a/flash/core/utilities/imports.py +++ b/flash/core/utilities/imports.py @@ -107,6 +107,7 @@ def _compare_version(package: str, op, version) -> bool: _ALBUMENTATIONS_AVAILABLE = _module_available("albumentations") _BAAL_AVAILABLE = 
_module_available("baal")
 _TORCH_OPTIMIZER_AVAILABLE = _module_available("torch_optimizer")
+_SENTENCE_TRANSFORMERS_AVAILABLE = _module_available("sentence_transformers")
 
 
 if _PIL_AVAILABLE:
@@ -130,6 +131,7 @@ class Image:
         _SENTENCEPIECE_AVAILABLE,
         _DATASETS_AVAILABLE,
         _TM_TEXT_AVAILABLE,
+        _SENTENCE_TRANSFORMERS_AVAILABLE,
     ]
 )
 _TABULAR_AVAILABLE = _TABNET_AVAILABLE and _PANDAS_AVAILABLE and _FORECASTING_AVAILABLE
diff --git a/flash/text/__init__.py b/flash/text/__init__.py
index 7a17659b20..bf87427c9f 100644
--- a/flash/text/__init__.py
+++ b/flash/text/__init__.py
@@ -1,4 +1,5 @@
 from flash.text.classification import TextClassificationData, TextClassifier  # noqa: F401
+from flash.text.embedding import SentenceEmbedder  # noqa: F401
 from flash.text.question_answering import QuestionAnsweringData, QuestionAnsweringTask  # noqa: F401
 from flash.text.seq2seq import (  # noqa: F401
     Seq2SeqData,
diff --git a/flash/text/embedding/__init__.py b/flash/text/embedding/__init__.py
new file mode 100644
index 0000000000..42bc49dd26
--- /dev/null
+++ b/flash/text/embedding/__init__.py
@@ -0,0 +1 @@
+from flash.text.embedding.model import SentenceEmbedder  # noqa: F401
diff --git a/flash/text/embedding/backbones.py b/flash/text/embedding/backbones.py
new file mode 100644
index 0000000000..c421e0179e
--- /dev/null
+++ b/flash/text/embedding/backbones.py
@@ -0,0 +1,14 @@
+from flash.core.registry import ExternalRegistry, FlashRegistry
+from flash.core.utilities.imports import _TEXT_AVAILABLE
+from flash.core.utilities.providers import _HUGGINGFACE
+
+if _TEXT_AVAILABLE:
+    from transformers import AutoModel
+
+    HUGGINGFACE_BACKBONES = ExternalRegistry(
+        AutoModel.from_pretrained,
+        "backbones",
+        _HUGGINGFACE,
+    )
+else:
+    HUGGINGFACE_BACKBONES = FlashRegistry("backbones")
diff --git a/flash/text/embedding/model.py b/flash/text/embedding/model.py
new file mode 100644
index 0000000000..c37a98d8b0
--- /dev/null
+++ b/flash/text/embedding/model.py
@@ -0,0 +1,108 @@
+# Copyright The PyTorch Lightning team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import logging
+import os
+import warnings
+from typing import Any, Dict, List, Optional
+
+import torch
+from pytorch_lightning import Callback
+
+from flash.core.integrations.transformers.states import TransformersBackboneState
+from flash.core.model import Task
+from flash.core.registry import FlashRegistry, print_provider_info
+from flash.core.utilities.imports import _TEXT_AVAILABLE
+from flash.core.utilities.providers import _SENTENCE_TRANSFORMERS
+from flash.text.embedding.backbones import HUGGINGFACE_BACKBONES
+from flash.text.ort_callback import ORTCallback
+
+if _TEXT_AVAILABLE:
+    from sentence_transformers.models import Pooling
+
+    Pooling = print_provider_info("Pooling", _SENTENCE_TRANSFORMERS, Pooling)
+
+logger = logging.getLogger(__name__)
+
+
+class SentenceEmbedder(Task):
+    """The ``SentenceEmbedder`` is a :class:`~flash.Task` for generating sentence embeddings (training and
+    validation are not supported). For more details, see `embeddings`.
+
+    You can change the backbone to any sentence embedding model from `UKPLab/sentence-transformers
+    <https://github.com/UKPLab/sentence-transformers>`_ using the ``backbone``
+    argument.
+
+    Args:
+        backbone: backbone model to use for the task.
+        enable_ort: Enable Torch ONNX Runtime Optimization: https://onnxruntime.ai/docs/#onnx-runtime-for-training
+    """
+
+    required_extras: str = "text"
+
+    backbones: FlashRegistry = HUGGINGFACE_BACKBONES
+
+    def __init__(
+        self,
+        backbone: str = "sentence-transformers/all-MiniLM-L6-v2",
+        tokenizer_backbone: Optional[str] = None,
+        tokenizer_kwargs: Optional[Dict[str, Any]] = None,
+        enable_ort: bool = False,
+    ):
+        os.environ["TOKENIZERS_PARALLELISM"] = "TRUE"
+        # disable HF thousand warnings
+        warnings.simplefilter("ignore")
+        # set os environ variable for multiprocesses
+        os.environ["PYTHONWARNINGS"] = "ignore"
+        super().__init__()
+
+        if tokenizer_backbone is None:
+            tokenizer_backbone = backbone
+        self.set_state(TransformersBackboneState(tokenizer_backbone, tokenizer_kwargs=tokenizer_kwargs))
+        self.model = self.backbones.get(backbone)()
+        self.pooling = Pooling(self.model.config.hidden_size)
+        self.enable_ort = enable_ort
+
+    def training_step(self, batch: Any, batch_idx: int) -> Any:
+        raise NotImplementedError("Training a `SentenceEmbedder` is not supported. Use a different text task instead.")
+
+    def validation_step(self, batch: Any, batch_idx: int) -> Any:
+        raise NotImplementedError(
+            "Validating a `SentenceEmbedder` is not supported. Use a different text task instead."
+        )
+
+    def test_step(self, batch: Any, batch_idx: int) -> Any:
+        raise NotImplementedError("Testing a `SentenceEmbedder` is not supported. Use a different text task instead.")
+
+    def forward(self, batch: Dict[str, torch.Tensor]) -> torch.Tensor:
+        """Adapted from sentence-transformers:
+
+        https://github.com/UKPLab/sentence-transformers/blob/master/sentence_transformers/models/Transformer.py#L45
+        """
+
+        trans_features = {"input_ids": batch["input_ids"], "attention_mask": batch["attention_mask"]}
+        if "token_type_ids" in batch:
+            trans_features["token_type_ids"] = batch["token_type_ids"]
+
+        output_states = self.model(**trans_features, return_dict=False)
+        output_tokens = output_states[0]
+
+        batch.update({"token_embeddings": output_tokens, "attention_mask": batch["attention_mask"]})
+
+        return self.pooling(batch)["sentence_embedding"]
+
+    def configure_callbacks(self) -> List[Callback]:
+        callbacks = super().configure_callbacks() or []
+        if self.enable_ort:
+            callbacks.append(ORTCallback())
+        return callbacks
diff --git a/flash/text/embeddings/__init__.py b/flash/text/embeddings/__init__.py
deleted file mode 100644
index 3993b0a526..0000000000
--- a/flash/text/embeddings/__init__.py
+++ /dev/null
@@ -1,10 +0,0 @@
-from flash.text.embeddings.data import (
-    TextCSVInput,
-    TextDataFrameInput,
-    TextHuggingFaceDatasetInput,
-    TextInput,
-    TextJSONInput,
-    TextListInput,
-    TextParquetInput,
-)
-from flash.text.embeddings.model import SentenceEmbedder
diff --git a/flash/text/embeddings/backbones.py b/flash/text/embeddings/backbones.py
deleted file mode 100644
index 6fbf4036f2..0000000000
--- a/flash/text/embeddings/backbones.py
+++ /dev/null
@@ -1,24 +0,0 @@
-from flash.core.registry import ExternalRegistry, FlashRegistry
-from flash.core.utilities.imports import _TEXT_AVAILABLE
-from flash.core.utilities.providers import _HUGGINGFACE
-
-SENTENCE_TRANSFORMERS_BACKBONE = FlashRegistry("backbones")
-
-if _TEXT_AVAILABLE:
-    from transformers import AutoConfig, AutoModel, AutoTokenizer
-
-    AUTOMODEL_BACKBONE =
ExternalRegistry( - AutoModel.from_pretrained, - "backbones", - _HUGGINGFACE, - ) - AUTOTOKENIZER_BACKBONE = ExternalRegistry( - AutoTokenizer.from_pretrained, - "backbones", - _HUGGINGFACE, - ) - AUTOCONFIG_BACKBONE = ExternalRegistry( - AutoConfig.from_pretrained, - "backbones", - _HUGGINGFACE, - ) diff --git a/flash/text/embeddings/data.py b/flash/text/embeddings/data.py deleted file mode 100644 index 1be078354d..0000000000 --- a/flash/text/embeddings/data.py +++ /dev/null @@ -1,217 +0,0 @@ -# Copyright The PyTorch Lightning team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -from functools import partial -from typing import Any, Callable, Dict, List, Mapping, Optional, Sequence, Tuple, Union - -import torch -from pandas.core.frame import DataFrame - -import flash -from flash.core.data.auto_dataset import AutoDataset -from flash.core.data.io.input import DataKeys, Input, LabelsState -from flash.core.utilities.imports import _TEXT_AVAILABLE, requires - -if _TEXT_AVAILABLE: - from datasets import Dataset, load_dataset - from transformers import AutoTokenizer - - -class TextInput(Input): - @requires("text") - def __init__(self, backbone: str, max_length: int = 128): - super().__init__() - - self.backbone = backbone - self.tokenizer = AutoTokenizer.from_pretrained(backbone, use_fast=True) - self.max_length = max_length - - def _tokenize_fn( - self, - ex: Union[Dict[str, str], str], - input: Optional[str] = None, - ) -> Callable: - """This function is used to tokenize sentences using the provided tokenizer.""" - return self.tokenizer(ex[input], max_length=self.max_length, truncation=True, padding="max_length") - - @staticmethod - def _transform_label(label_to_class_mapping: Dict[str, int], target: str, ex: Dict[str, Union[int, str]]): - ex[target] = label_to_class_mapping[ex[target]] - return ex - - @staticmethod - def _multilabel_target(targets: List[str], element: Dict[str, Any]) -> Dict[str, Any]: - targets = [element.pop(target) for target in targets] - element[DataKeys.TARGET] = targets - return element - - def _to_hf_dataset(self, data) -> Sequence[Mapping[str, Any]]: - """account for flash CI testing context.""" - hf_dataset, *other = self.to_hf_dataset(data) - - if flash._IS_TESTING and not torch.cuda.is_available(): - # NOTE: must subset in this way to return a Dataset - hf_dataset = hf_dataset.select(range(20)) - - return (hf_dataset, *other) - - def load_data( - self, - data: Tuple[str, Union[str, List[str]], Union[str, List[str]]], - dataset: Optional[Any] = None, - ) -> Sequence[Mapping[str, Any]]: - """Loads data into HuggingFace datasets.Dataset.""" - - hf_dataset, input, *other = self._to_hf_dataset(data) - - if not self.predicting: - target: Union[str, List[str]] = other.pop() - if isinstance(target, List): - # multi-target - dataset.multi_label = True - hf_dataset = hf_dataset.map(partial(self._multilabel_target, target)) # NOTE: renames target column - dataset.num_classes = len(target) - self.set_state(LabelsState(target)) - else: - dataset.multi_label = False - if 
self.training: - labels = list(sorted(list(set(hf_dataset[target])))) - dataset.num_classes = len(labels) - self.set_state(LabelsState(labels)) - - labels = self.get_state(LabelsState) - - # convert labels to ids (note: the target column get overwritten) - if labels is not None: - labels = labels.labels - label_to_class_mapping = {v: k for k, v in enumerate(labels)} - hf_dataset = hf_dataset.map(partial(self._transform_label, label_to_class_mapping, target)) - - # rename label column - hf_dataset = hf_dataset.rename_column(target, DataKeys.TARGET) - - # remove extra columns - extra_columns = set(hf_dataset.column_names) - {input, DataKeys.TARGET} - hf_dataset = hf_dataset.remove_columns(extra_columns) - - # tokenize - hf_dataset = hf_dataset.map(partial(self._tokenize_fn, input=input), batched=True, remove_columns=[input]) - - # set format - hf_dataset.set_format("torch") - - return hf_dataset - - def predict_load_data(self, data: Any, dataset: AutoDataset): - return self.load_data(data, dataset) - - def __getstate__(self): # TODO: Find out why this is being pickled - state = self.__dict__.copy() - state.pop("tokenizer") - return state - - def __setstate__(self, state): - self.__dict__.update(state) - self.tokenizer = AutoTokenizer.from_pretrained(self.backbone, use_fast=True) - - -class TextCSVInput(TextInput): - def to_hf_dataset(self, data: Tuple[str, str, str]) -> Tuple[Sequence[Mapping[str, Any]], str, str]: - file, *other = data - dataset_dict = load_dataset("csv", data_files={"train": str(file)}) - return (dataset_dict["train"], *other) - - -class TextJSONInput(TextInput): - def to_hf_dataset(self, data: Tuple[str, str, str, str]) -> Tuple[Sequence[Mapping[str, Any]], str, str]: - file, *other, field = data - dataset_dict = load_dataset("json", data_files={"train": str(file)}, field=field) - return (dataset_dict["train"], *other) - - -class TextDataFrameInput(TextInput): - def to_hf_dataset(self, data: Tuple[DataFrame, str, str]) -> Tuple[Sequence[Mapping[str, Any]], str, str]: - df, *other = data - hf_dataset = Dataset.from_pandas(df) - return (hf_dataset, *other) - - -class TextParquetInput(TextInput): - def to_hf_dataset(self, data: Tuple[str, str, str]) -> Tuple[Sequence[Mapping[str, Any]], str, str]: - file, *other = data - hf_dataset = Dataset.from_parquet(str(file)) - return (hf_dataset, *other) - - -class TextHuggingFaceDatasetInput(TextInput): - def to_hf_dataset(self, data: Tuple[str, str, str]) -> Tuple[Sequence[Mapping[str, Any]], str, str]: - hf_dataset, *other = data - return (hf_dataset, *other) - - -class TextListInput(TextInput): - def to_hf_dataset( - self, data: Union[Tuple[List[str], List[str]], List[str]] - ) -> Tuple[Sequence[Mapping[str, Any]], Optional[List[str]]]: - - if isinstance(data, tuple): - input_list, target_list = data - # NOTE: here we already deal with multilabels - # NOTE: here we already rename to correct column names - hf_dataset = Dataset.from_dict({DataKeys.INPUT: input_list, DataKeys.TARGET: target_list}) - return hf_dataset, target_list - - # predicting - hf_dataset = Dataset.from_dict({DataKeys.INPUT: data}) - - return (hf_dataset,) - - def load_data( - self, - data: Tuple[List[str], Union[List[Any], List[List[Any]]]], - dataset: Optional[Any] = None, - ) -> Sequence[Mapping[str, Any]]: - - hf_dataset, *other = self._to_hf_dataset(data) - - if not self.predicting: - target_list = other.pop() - if isinstance(target_list[0], List): - # multi-target_list - dataset.multi_label = True - dataset.num_classes = len(target_list[0]) - 
self.set_state(LabelsState(target_list)) - else: - dataset.multi_label = False - if self.training: - labels = list(sorted(list(set(hf_dataset[DataKeys.TARGET])))) - dataset.num_classes = len(labels) - self.set_state(LabelsState(labels)) - - labels = self.get_state(LabelsState) - - # convert labels to ids - if labels is not None: - labels = labels.labels - label_to_class_mapping = {v: k for k, v in enumerate(labels)} - # happens in-place and keeps the target column name - hf_dataset = hf_dataset.map(partial(self._transform_label, label_to_class_mapping, DataKeys.TARGET)) - - # tokenize - hf_dataset = hf_dataset.map(partial(self._tokenize_fn, input=DataKeys.INPUT), batched=True) - - # set format - hf_dataset = hf_dataset.remove_columns([DataKeys.INPUT]) # just leave the numerical columns - hf_dataset.set_format("torch") - - return hf_dataset diff --git a/flash/text/embeddings/model.py b/flash/text/embeddings/model.py deleted file mode 100644 index f1b2165fae..0000000000 --- a/flash/text/embeddings/model.py +++ /dev/null @@ -1,211 +0,0 @@ -# Copyright The PyTorch Lightning team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -import logging -import os -import warnings -from typing import Any, Callable, Dict, List, Mapping, Optional, Sequence, Tuple, Type, Union - -import numpy as np -import torch -import torchmetrics -from pytorch_lightning import Callback -from pytorch_lightning.utilities import rank_zero_info -from torch import nn, Tensor -from torch.optim.lr_scheduler import _LRScheduler -from torchmetrics import Metric -from tqdm.autonotebook import trange - -from flash.core.data.data_source import DefaultDataKeys -from flash.core.data.process import Serializer -from flash.core.finetuning import FlashBaseFinetuning -from flash.core.model import Task -from flash.core.registry import FlashRegistry -from flash.text.embeddings.backbones import AUTOCONFIG_BACKBONE, AUTOMODEL_BACKBONE, AUTOTOKENIZER_BACKBONE - -logger = logging.getLogger(__name__) - - -class SentenceEmbedder(Task): - """The ``SentenceEmbedder`` is a :class:`~flash.Task` for generating sentence embeddings, training and - validation. For more details, see `embeddings`. - - You can change the backbone to any question answering model from `UKPLab/sentence-transformers - `_ using the ``backbone`` - argument. - - .. note:: When changing the backbone, make sure you pass in the same backbone to the :class:`~flash.Task` and the - :class:`~flash.core.data.data_module.DataModule` object! Since this is a Sentence Transformers task, make sure you - use a Sentence Transformers model. - - Args: - backbone: backbone model to use for the task. - loss_fn: Loss function for training. - optimizer: Optimizer to use for training, defaults to `torch.optim.Adam`. - optimizer_kwargs: Additional kwargs to use when creating the optimizer (if not passed as an instance). - scheduler: The scheduler or scheduler class to use. - scheduler_kwargs: Additional kwargs to use when creating the scheduler (if not passed as an instance). 
- metrics: Metrics to compute for training and evaluation. Defauls to calculating the ROUGE metric. - Changing this argument currently has no effect. - learning_rate: Learning rate to use for training, defaults to `3e-4` - enable_ort: Enable Torch ONNX Runtime Optimization: https://onnxruntime.ai/docs/#onnx-runtime-for-training - """ - - required_extras: str = "text" - - AutoModel_Backbones: FlashRegistry = AUTOMODEL_BACKBONE - AutoTokenizer_Backbones: FlashRegistry = AUTOTOKENIZER_BACKBONE - AutoConfig_Backbones: FlashRegistry = AUTOCONFIG_BACKBONE - - def __init__( - self, - model_backbone: str = "sentence-transformers/all-MiniLM-L6-v2", - tokenizer_backbone: str = "sentence-transformers/all-MiniLM-L6-v2", - enable_ort: bool = False, - ): - os.environ["TOKENIZERS_PARALLELISM"] = "TRUE" - # disable HF thousand warnings - warnings.simplefilter("ignore") - # set os environ variable for multiprocesses - os.environ["PYTHONWARNINGS"] = "ignore" - super().__init__() - - self.config = self.AutoConfig_Backbones.get(model_backbone) - self.auto_model = self.AutoModel_Backbones.get(model_backbone) - self.tokenzier = self.AutoTokenizer_Backbones.get(tokenizer_backbone) - - if tokenizer_backbone is not None: - self.auto_model.config.tokenizer_class = self.tokenizer.__class__.__name__ - - def predict_step( - self, - sentences: Union[str, List[str]], - batch_size: int = 32, - show_progress_bar: bool = None, - output_value: str = "sentence_embedding", - convert_to_numpy: bool = True, - convert_to_tensor: bool = False, - device: str = None, - normalize_embeddings: bool = False, - ) -> Union[List[Tensor], np.ndarray, Tensor]: - """Computes sentence embeddings. - - :param sentences: the sentences to embed - :param batch_size: the batch size used for the computation - :param show_progress_bar: Output a progress bar when encode sentences - :param output_value: Default sentence_embedding, to get sentence embeddings. Can be set to token_embeddings to get wordpiece token embeddings. Set to None, to get all output values - :param convert_to_numpy: If true, the output is a list of numpy vectors. Else, it is a list of pytorch tensors. - :param convert_to_tensor: If true, you get one large tensor as return. Overwrites any setting from convert_to_numpy - :param device: Which torch.device to use for the computation - :param normalize_embeddings: If set to true, returned vectors will have length 1. In that case, the faster dot-product (util.dot_score) instead of cosine similarity can be used. - :return: - By default, a list of tensors is returned. If convert_to_tensor, a stacked tensor is returned. If convert_to_numpy, a numpy matrix is returned. 
- """ - self.eval() - if show_progress_bar is None: - show_progress_bar = ( - logger.getEffectiveLevel() == logging.INFO or logger.getEffectiveLevel() == logging.DEBUG - ) - - if convert_to_tensor: - convert_to_numpy = False - - if output_value != "sentence_embedding": - convert_to_tensor = False - convert_to_numpy = False - - input_was_string = False - if isinstance(sentences, str) or not hasattr( - sentences, "__len__" - ): # Cast an individual sentence to a list with length 1 - sentences = [sentences] - input_was_string = True - - if device is None: - device = self._target_device - - self.to(device) - - all_embeddings = [] - length_sorted_idx = np.argsort([-self._text_length(sen) for sen in sentences]) - sentences_sorted = [sentences[idx] for idx in length_sorted_idx] - - for start_index in trange(0, len(sentences), batch_size, desc="Batches", disable=not show_progress_bar): - sentences_batch = sentences_sorted[start_index : start_index + batch_size] - features = self.tokenize(sentences_batch) - - with torch.no_grad(): - out_features = self.forward(features) - - if output_value == "token_embeddings": - embeddings = [] - for token_emb, attention in zip(out_features[output_value], out_features["attention_mask"]): - last_mask_id = len(attention) - 1 - while last_mask_id > 0 and attention[last_mask_id].item() == 0: - last_mask_id -= 1 - - embeddings.append(token_emb[0 : last_mask_id + 1]) - elif output_value is None: # Return all outputs - embeddings = [] - for sent_idx in range(len(out_features["sentence_embedding"])): - row = {name: out_features[name][sent_idx] for name in out_features} - embeddings.append(row) - else: # Sentence embeddings - embeddings = out_features[output_value] - embeddings = embeddings.detach() - if normalize_embeddings: - embeddings = torch.nn.functional.normalize(embeddings, p=2, dim=1) - - # fixes for #522 and #487 to avoid oom problems on gpu with large datasets - if convert_to_numpy: - embeddings = embeddings.cpu() - - all_embeddings.extend(embeddings) - - all_embeddings = [all_embeddings[idx] for idx in np.argsort(length_sorted_idx)] - - if convert_to_tensor: - all_embeddings = torch.stack(all_embeddings) - elif convert_to_numpy: - all_embeddings = np.asarray([emb.numpy() for emb in all_embeddings]) - - if input_was_string: - all_embeddings = all_embeddings[0] - - return all_embeddings - - @property - def backbone(self): - return self.model.base_model - - def forward(self, batch: Dict[str, torch.Tensor]) -> torch.Tensor: - """First call the backbone, then the model head.""" - - trans_features = {"input_ids": batch["input_ids"], "attention_mask": batch["attention_mask"]} - if "token_type_ids" in batch: - trans_features["token_type_ids"] = batch["token_type_ids"] - - output_states = self.auto_model(**trans_features, return_dict=False) - output_tokens = output_states[0] - - batch.update({"token_embeddings": output_tokens, "attention_mask": batch["attention_mask"]}) - - if self.auto_model.config.output_hidden_states: - all_layer_idx = 2 - if len(output_states) < 3: # Some models only output last_hidden_states and all_hidden_states - all_layer_idx = 1 - - hidden_states = output_states[all_layer_idx] - batch.update({"all_layer_embeddings": hidden_states}) - - return batch diff --git a/flash_examples/text_embedder.py b/flash_examples/text_embedder.py new file mode 100644 index 0000000000..10744c71e5 --- /dev/null +++ b/flash_examples/text_embedder.py @@ -0,0 +1,37 @@ +# Copyright The PyTorch Lightning team. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import torch + +import flash +from flash.core.data.utils import download_data +from flash.text import SentenceEmbedder, TextClassificationData + +# 1. Create the DataModule +download_data("https://pl-flash-data.s3.amazonaws.com/xsum.zip", "./data/") + +datamodule = TextClassificationData.from_lists( + predict_data=[ + "Turgid dialogue, feeble characterization - Harvey Keitel a judge?.", + "The worst movie in the history of cinema.", + "I come from Bulgaria where it 's almost impossible to have a tornado.", + ] +) + +# 2. Load a previously trained SentenceEmbedder +model = SentenceEmbedder(backbone="sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2") + +# 3. Generate embeddings for the first 3 sentences +trainer = flash.Trainer(gpus=torch.cuda.device_count()) +predictions = trainer.predict(model, datamodule=datamodule) +print(predictions) diff --git a/requirements/datatype_text.txt b/requirements/datatype_text.txt index aba24a7ef5..4173bcbd78 100644 --- a/requirements/datatype_text.txt +++ b/requirements/datatype_text.txt @@ -3,3 +3,4 @@ filelock transformers>=4.5 torchmetrics[text]>=0.5.1 datasets>=1.8,<1.13 +sentence-transformers From 2071c9f0f6bcc1e68c242ab04107f291cb92d22b Mon Sep 17 00:00:00 2001 From: Abhijith Neil Abraham Date: Thu, 9 Dec 2021 00:32:34 +0530 Subject: [PATCH 09/23] Create test_model.py --- tests/text/embedding/test_model.py | 47 ++++++++++++++++++++++++++++++ 1 file changed, 47 insertions(+) create mode 100644 tests/text/embedding/test_model.py diff --git a/tests/text/embedding/test_model.py b/tests/text/embedding/test_model.py new file mode 100644 index 0000000000..730d89deaa --- /dev/null +++ b/tests/text/embedding/test_model.py @@ -0,0 +1,47 @@ + +# Copyright The PyTorch Lightning team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+import os + +import pytest +import torch + + +import flash +from flash.text import SentenceEmbedder, TextClassificationData +from tests.helpers.utils import _TEXT_TESTING + +# ======== Mock functions ======== + +datamodule = TextClassificationData.from_lists( + predict_data=[ + "Turgid dialogue, feeble characterization - Harvey Keitel a judge?.", + "The worst movie in the history of cinema.", + "I come from Bulgaria where it 's almost impossible to have a tornado.", + ] + ) + +# ============================== + +TEST_BACKBONE = "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2" # super small model for testing +model = SentenceEmbedder(backbone=TEST_BACKBONE) + +@pytest.mark.skipif(os.name == "nt", reason="Huggingface timing out on Windows") +@pytest.mark.skipif(not _TEXT_TESTING, reason="text libraries aren't installed.") +def test_predict(tmpdir): + trainer = flash.Trainer(gpus=torch.cuda.device_count()) + predictions = trainer.predict(model, datamodule=datamodule) + assert[t.size() for t in predictions[0]]==[torch.Size([384]), torch.Size([384]), torch.Size([384])] + + From 5477415483de6b4014e7ce9dc84c9720f159af70 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Wed, 8 Dec 2021 19:03:24 +0000 Subject: [PATCH 10/23] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- tests/text/embedding/test_model.py | 21 +++++++++------------ 1 file changed, 9 insertions(+), 12 deletions(-) diff --git a/tests/text/embedding/test_model.py b/tests/text/embedding/test_model.py index 730d89deaa..b3506bc521 100644 --- a/tests/text/embedding/test_model.py +++ b/tests/text/embedding/test_model.py @@ -1,4 +1,3 @@ - # Copyright The PyTorch Lightning team. 
# # Licensed under the Apache License, Version 2.0 (the "License"); @@ -17,31 +16,29 @@ import pytest import torch - import flash from flash.text import SentenceEmbedder, TextClassificationData -from tests.helpers.utils import _TEXT_TESTING +from tests.helpers.utils import _TEXT_TESTING # ======== Mock functions ======== datamodule = TextClassificationData.from_lists( - predict_data=[ - "Turgid dialogue, feeble characterization - Harvey Keitel a judge?.", - "The worst movie in the history of cinema.", - "I come from Bulgaria where it 's almost impossible to have a tornado.", - ] - ) + predict_data=[ + "Turgid dialogue, feeble characterization - Harvey Keitel a judge?.", + "The worst movie in the history of cinema.", + "I come from Bulgaria where it 's almost impossible to have a tornado.", + ] +) # ============================== TEST_BACKBONE = "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2" # super small model for testing model = SentenceEmbedder(backbone=TEST_BACKBONE) + @pytest.mark.skipif(os.name == "nt", reason="Huggingface timing out on Windows") @pytest.mark.skipif(not _TEXT_TESTING, reason="text libraries aren't installed.") def test_predict(tmpdir): trainer = flash.Trainer(gpus=torch.cuda.device_count()) predictions = trainer.predict(model, datamodule=datamodule) - assert[t.size() for t in predictions[0]]==[torch.Size([384]), torch.Size([384]), torch.Size([384])] - - + assert [t.size() for t in predictions[0]] == [torch.Size([384]), torch.Size([384]), torch.Size([384])] From 8db110d7797f8692dd3415b7dcd2214f8df8cb46 Mon Sep 17 00:00:00 2001 From: Abhijith Neil Abraham Date: Thu, 9 Dec 2021 00:47:27 +0530 Subject: [PATCH 11/23] __init__ for embedding --- tests/text/embedding/__init__.py | 0 tests/text/embedding/test_model.py | 2 +- 2 files changed, 1 insertion(+), 1 deletion(-) create mode 100644 tests/text/embedding/__init__.py diff --git a/tests/text/embedding/__init__.py b/tests/text/embedding/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/tests/text/embedding/test_model.py b/tests/text/embedding/test_model.py index b3506bc521..806b2a0447 100644 --- a/tests/text/embedding/test_model.py +++ b/tests/text/embedding/test_model.py @@ -20,7 +20,7 @@ from flash.text import SentenceEmbedder, TextClassificationData from tests.helpers.utils import _TEXT_TESTING -# ======== Mock functions ======== +# ======== Mock data ======== datamodule = TextClassificationData.from_lists( predict_data=[ From a6bfc9f3431b8cc9305ce24bb591fa84f3efc352 Mon Sep 17 00:00:00 2001 From: Abhijith Neil Abraham Date: Thu, 9 Dec 2021 01:29:08 +0530 Subject: [PATCH 12/23] remove download_data() --- flash_examples/text_embedder.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/flash_examples/text_embedder.py b/flash_examples/text_embedder.py index 10744c71e5..9aa0eabbdc 100644 --- a/flash_examples/text_embedder.py +++ b/flash_examples/text_embedder.py @@ -18,8 +18,6 @@ from flash.text import SentenceEmbedder, TextClassificationData # 1. 
Create the DataModule -download_data("https://pl-flash-data.s3.amazonaws.com/xsum.zip", "./data/") - datamodule = TextClassificationData.from_lists( predict_data=[ "Turgid dialogue, feeble characterization - Harvey Keitel a judge?.", From 21305d6a9622a85acae9041b1acd6bb2263d3fc9 Mon Sep 17 00:00:00 2001 From: Abhijith Neil Abraham Date: Thu, 9 Dec 2021 19:18:35 +0530 Subject: [PATCH 13/23] lower size model for text embedder examples and test --- flash_examples/text_embedder.py | 3 ++- tests/text/embedding/test_model.py | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/flash_examples/text_embedder.py b/flash_examples/text_embedder.py index 9aa0eabbdc..b6e99c0742 100644 --- a/flash_examples/text_embedder.py +++ b/flash_examples/text_embedder.py @@ -27,9 +27,10 @@ ) # 2. Load a previously trained SentenceEmbedder -model = SentenceEmbedder(backbone="sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2") +model = SentenceEmbedder(backbone="sentence-transformers/all-MiniLM-L6-v2") # 3. Generate embeddings for the first 3 sentences trainer = flash.Trainer(gpus=torch.cuda.device_count()) predictions = trainer.predict(model, datamodule=datamodule) print(predictions) + diff --git a/tests/text/embedding/test_model.py b/tests/text/embedding/test_model.py index 806b2a0447..c1c433cf03 100644 --- a/tests/text/embedding/test_model.py +++ b/tests/text/embedding/test_model.py @@ -32,7 +32,7 @@ # ============================== -TEST_BACKBONE = "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2" # super small model for testing +TEST_BACKBONE = "sentence-transformers/all-MiniLM-L6-v2" # super small model for testing model = SentenceEmbedder(backbone=TEST_BACKBONE) From 5d1b4c66ef42b04a0ea129546f6715f36a7853f4 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Thu, 9 Dec 2021 13:49:18 +0000 Subject: [PATCH 14/23] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- flash_examples/text_embedder.py | 1 - 1 file changed, 1 deletion(-) diff --git a/flash_examples/text_embedder.py b/flash_examples/text_embedder.py index b6e99c0742..1a19bd646c 100644 --- a/flash_examples/text_embedder.py +++ b/flash_examples/text_embedder.py @@ -33,4 +33,3 @@ trainer = flash.Trainer(gpus=torch.cuda.device_count()) predictions = trainer.predict(model, datamodule=datamodule) print(predictions) - From 9570522e89e415851cbd32e2207f8ad1c25597b1 Mon Sep 17 00:00:00 2001 From: Abhijith Neil Abraham Date: Thu, 9 Dec 2021 19:28:00 +0530 Subject: [PATCH 15/23] text embedder example entry to CI --- requirements.txt | 1 - tests/examples/test_scripts.py | 4 ++++ 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index 333125ff76..0920edcb4c 100644 --- a/requirements.txt +++ b/requirements.txt @@ -7,4 +7,3 @@ pyDeprecate pandas>=1.1.0 jsonargparse[signatures]>=3.17.0 click>=7.1.2 -tqdm diff --git a/tests/examples/test_scripts.py b/tests/examples/test_scripts.py index 033ee35b3d..6ee35fef73 100644 --- a/tests/examples/test_scripts.py +++ b/tests/examples/test_scripts.py @@ -80,6 +80,10 @@ "text_classification.py", marks=pytest.mark.skipif(not _TEXT_TESTING, reason="text libraries aren't installed"), ), + pytest.param( + "text_embedder.py", + marks=pytest.mark.skipif(not _TEXT_TESTING, reason="text libraries aren't installed"), + ), # pytest.param( # "text_classification_multi_label.py", # marks=pytest.mark.skipif(not _TEXT_TESTING, reason="text libraries 
aren't installed") From bb98d77986286d68c90a917885a0e1e3311796b6 Mon Sep 17 00:00:00 2001 From: Abhijith Neil Abraham Date: Thu, 9 Dec 2021 19:35:00 +0530 Subject: [PATCH 16/23] change `SentenceEmbedder` to `TextEmbedder` --- flash/text/__init__.py | 2 +- flash/text/embedding/__init__.py | 2 +- flash/text/embedding/model.py | 10 +++++----- flash_examples/text_embedder.py | 6 +++--- tests/text/embedding/test_model.py | 4 ++-- 5 files changed, 12 insertions(+), 12 deletions(-) diff --git a/flash/text/__init__.py b/flash/text/__init__.py index bf87427c9f..63400dcd9c 100644 --- a/flash/text/__init__.py +++ b/flash/text/__init__.py @@ -1,5 +1,5 @@ from flash.text.classification import TextClassificationData, TextClassifier # noqa: F401 -from flash.text.embedding import SentenceEmbedder # noqa: F401 +from flash.text.embedding import TextEmbedder # noqa: F401 from flash.text.question_answering import QuestionAnsweringData, QuestionAnsweringTask # noqa: F401 from flash.text.seq2seq import ( # noqa: F401 Seq2SeqData, diff --git a/flash/text/embedding/__init__.py b/flash/text/embedding/__init__.py index 42bc49dd26..ed171439f7 100644 --- a/flash/text/embedding/__init__.py +++ b/flash/text/embedding/__init__.py @@ -1 +1 @@ -from flash.text.embedding.model import SentenceEmbedder # noqa: F401 +from flash.text.embedding.model import TextEmbedder # noqa: F401 diff --git a/flash/text/embedding/model.py b/flash/text/embedding/model.py index c37a98d8b0..fdec7b5f77 100644 --- a/flash/text/embedding/model.py +++ b/flash/text/embedding/model.py @@ -35,8 +35,8 @@ logger = logging.getLogger(__name__) -class SentenceEmbedder(Task): - """The ``SentenceEmbedder`` is a :class:`~flash.Task` for generating sentence embeddings, training and +class TextEmbedder(Task): + """The ``TextEmbedder`` is a :class:`~flash.Task` for generating sentence embeddings, training and validation. For more details, see `embeddings`. You can change the backbone to any question answering model from `UKPLab/sentence-transformers @@ -74,15 +74,15 @@ def __init__( self.enable_ort = enable_ort def training_step(self, batch: Any, batch_idx: int) -> Any: - raise NotImplementedError("Training a `SentenceEmbedder` is not supported. Use a different text task instead.") + raise NotImplementedError("Training a `TextEmbedder` is not supported. Use a different text task instead.") def validation_step(self, batch: Any, batch_idx: int) -> Any: raise NotImplementedError( - "Validating a `SentenceEmbedder` is not supported. Use a different text task instead." + "Validating a `TextEmbedder` is not supported. Use a different text task instead." ) def test_step(self, batch: Any, batch_idx: int) -> Any: - raise NotImplementedError("Testing a `SentenceEmbedder` is not supported. Use a different text task instead.") + raise NotImplementedError("Testing a `TextEmbedder` is not supported. Use a different text task instead.") def forward(self, batch: Dict[str, torch.Tensor]) -> torch.Tensor: """Adapted from sentence-transformers: diff --git a/flash_examples/text_embedder.py b/flash_examples/text_embedder.py index 1a19bd646c..8b69e2f262 100644 --- a/flash_examples/text_embedder.py +++ b/flash_examples/text_embedder.py @@ -15,7 +15,7 @@ import flash from flash.core.data.utils import download_data -from flash.text import SentenceEmbedder, TextClassificationData +from flash.text import TextEmbedder, TextClassificationData # 1. Create the DataModule datamodule = TextClassificationData.from_lists( @@ -26,8 +26,8 @@ ] ) -# 2. 
Load a previously trained SentenceEmbedder -model = SentenceEmbedder(backbone="sentence-transformers/all-MiniLM-L6-v2") +# 2. Load a previously trained TextEmbedder +model = TextEmbedder(backbone="sentence-transformers/all-MiniLM-L6-v2") # 3. Generate embeddings for the first 3 sentences trainer = flash.Trainer(gpus=torch.cuda.device_count()) diff --git a/tests/text/embedding/test_model.py b/tests/text/embedding/test_model.py index c1c433cf03..ad09fb9544 100644 --- a/tests/text/embedding/test_model.py +++ b/tests/text/embedding/test_model.py @@ -17,7 +17,7 @@ import torch import flash -from flash.text import SentenceEmbedder, TextClassificationData +from flash.text import TextEmbedder, TextClassificationData from tests.helpers.utils import _TEXT_TESTING # ======== Mock data ======== @@ -33,7 +33,7 @@ # ============================== TEST_BACKBONE = "sentence-transformers/all-MiniLM-L6-v2" # super small model for testing -model = SentenceEmbedder(backbone=TEST_BACKBONE) +model = TextEmbedder(backbone=TEST_BACKBONE) From 923e6ec513884d825ba85ecacbf32fa699630b26 Mon Sep 17 00:00:00 2001 From: Abhijith Neil Abraham Date: Thu, 9 Dec 2021 19:36:01 +0530 Subject: [PATCH 17/23] remove `download_data` import --- flash_examples/text_embedder.py | 1 - 1 file changed, 1 deletion(-) diff --git a/flash_examples/text_embedder.py b/flash_examples/text_embedder.py index 8b69e2f262..5eeae9a0f0 100644 --- a/flash_examples/text_embedder.py +++ b/flash_examples/text_embedder.py @@ -14,7 +14,6 @@ import torch import flash -from flash.core.data.utils import download_data from flash.text import TextEmbedder, TextClassificationData # 1. Create the DataModule From 8c90286be68e0394de847398818788031e53a9ac Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Thu, 9 Dec 2021 14:07:10 +0000 Subject: [PATCH 18/23] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- flash/text/embedding/model.py | 8 +++----- flash_examples/text_embedder.py | 2 +- tests/text/embedding/test_model.py | 2 +- 3 files changed, 5 insertions(+), 7 deletions(-) diff --git a/flash/text/embedding/model.py b/flash/text/embedding/model.py index fdec7b5f77..2fae923403 100644 --- a/flash/text/embedding/model.py +++ b/flash/text/embedding/model.py @@ -36,8 +36,8 @@ class TextEmbedder(Task): - """The ``TextEmbedder`` is a :class:`~flash.Task` for generating sentence embeddings, training and - validation. For more details, see `embeddings`. + """The ``TextEmbedder`` is a :class:`~flash.Task` for generating sentence embeddings, training and validation. + For more details, see `embeddings`. You can change the backbone to any sentence embedding model from `UKPLab/sentence-transformers `_ using the ``backbone`` @@ -74,9 +74,7 @@ def training_step(self, batch: Any, batch_idx: int) -> Any: raise NotImplementedError("Training a `TextEmbedder` is not supported. Use a different text task instead.") def validation_step(self, batch: Any, batch_idx: int) -> Any: - raise NotImplementedError( - "Validating a `TextEmbedder` is not supported. Use a different text task instead." - ) + raise NotImplementedError("Validating a `TextEmbedder` is not supported. Use a different text task instead.") def test_step(self, batch: Any, batch_idx: int) -> Any: raise NotImplementedError("Testing a `TextEmbedder` is not supported. 
Use a different text task instead.") diff --git a/flash_examples/text_embedder.py b/flash_examples/text_embedder.py index 5eeae9a0f0..f613f0def8 100644 --- a/flash_examples/text_embedder.py +++ b/flash_examples/text_embedder.py @@ -14,7 +14,7 @@ import torch import flash -from flash.text import TextEmbedder, TextClassificationData +from flash.text import TextClassificationData, TextEmbedder # 1. Create the DataModule datamodule = TextClassificationData.from_lists( diff --git a/tests/text/embedding/test_model.py b/tests/text/embedding/test_model.py index ad09fb9544..64f4404e0b 100644 --- a/tests/text/embedding/test_model.py +++ b/tests/text/embedding/test_model.py @@ -17,7 +17,7 @@ import torch import flash -from flash.text import TextEmbedder, TextClassificationData +from flash.text import TextClassificationData, TextEmbedder from tests.helpers.utils import _TEXT_TESTING # ======== Mock data ======== From 20233f2907f22918732184e25869bac5338ca714 Mon Sep 17 00:00:00 2001 From: Abhijith Neil Abraham Date: Thu, 9 Dec 2021 19:46:42 +0530 Subject: [PATCH 19/23] fix bug - test_model.py --- tests/text/embedding/test_model.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/tests/text/embedding/test_model.py b/tests/text/embedding/test_model.py index ad09fb9544..f293eb184b 100644 --- a/tests/text/embedding/test_model.py +++ b/tests/text/embedding/test_model.py @@ -22,14 +22,11 @@ # ======== Mock data ======== -datamodule = TextClassificationData.from_lists( - predict_data=[ - "Turgid dialogue, feeble characterization - Harvey Keitel a judge?.", - "The worst movie in the history of cinema.", - "I come from Bulgaria where it 's almost impossible to have a tornado.", - ] -) - +predict_data=[ + "Turgid dialogue, feeble characterization - Harvey Keitel a judge?.", + "The worst movie in the history of cinema.", + "I come from Bulgaria where it 's almost impossible to have a tornado.", + ] # ============================== TEST_BACKBONE = "sentence-transformers/all-MiniLM-L6-v2" # super small model for testing @@ -39,6 +36,9 @@ @pytest.mark.skipif(os.name == "nt", reason="Huggingface timing out on Windows") @pytest.mark.skipif(not _TEXT_TESTING, reason="text libraries aren't installed.") def test_predict(tmpdir): + datamodule = TextClassificationData.from_lists( + predict_data=predict_data + ) trainer = flash.Trainer(gpus=torch.cuda.device_count()) predictions = trainer.predict(model, datamodule=datamodule) assert [t.size() for t in predictions[0]] == [torch.Size([384]), torch.Size([384]), torch.Size([384])] From 57aa577d258f8afb76b85ba01b9682fa2850fe4c Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Thu, 9 Dec 2021 14:17:27 +0000 Subject: [PATCH 20/23] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- tests/text/embedding/test_model.py | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) diff --git a/tests/text/embedding/test_model.py b/tests/text/embedding/test_model.py index 468e5c59f8..389f384a12 100644 --- a/tests/text/embedding/test_model.py +++ b/tests/text/embedding/test_model.py @@ -22,11 +22,11 @@ # ======== Mock data ======== -predict_data=[ - "Turgid dialogue, feeble characterization - Harvey Keitel a judge?.", - "The worst movie in the history of cinema.", - "I come from Bulgaria where it 's almost impossible to have a tornado.", - ] +predict_data = [ + "Turgid dialogue, feeble characterization - Harvey Keitel a judge?.", + 
"The worst movie in the history of cinema.", + "I come from Bulgaria where it 's almost impossible to have a tornado.", +] # ============================== TEST_BACKBONE = "sentence-transformers/all-MiniLM-L6-v2" # super small model for testing @@ -36,9 +36,7 @@ @pytest.mark.skipif(os.name == "nt", reason="Huggingface timing out on Windows") @pytest.mark.skipif(not _TEXT_TESTING, reason="text libraries aren't installed.") def test_predict(tmpdir): - datamodule = TextClassificationData.from_lists( - predict_data=predict_data - ) + datamodule = TextClassificationData.from_lists(predict_data=predict_data) trainer = flash.Trainer(gpus=torch.cuda.device_count()) predictions = trainer.predict(model, datamodule=datamodule) assert [t.size() for t in predictions[0]] == [torch.Size([384]), torch.Size([384]), torch.Size([384])] From fdbb2de77112ec40858fc66e68c0ca7c6063923e Mon Sep 17 00:00:00 2001 From: Abhijith Neil Abraham Date: Thu, 9 Dec 2021 19:54:15 +0530 Subject: [PATCH 21/23] Update test_model.py --- tests/text/embedding/test_model.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tests/text/embedding/test_model.py b/tests/text/embedding/test_model.py index 468e5c59f8..758057c5a6 100644 --- a/tests/text/embedding/test_model.py +++ b/tests/text/embedding/test_model.py @@ -30,7 +30,6 @@ # ============================== TEST_BACKBONE = "sentence-transformers/all-MiniLM-L6-v2" # super small model for testing -model = TextEmbedder(backbone=TEST_BACKBONE) @pytest.mark.skipif(os.name == "nt", reason="Huggingface timing out on Windows") @@ -39,6 +38,8 @@ def test_predict(tmpdir): datamodule = TextClassificationData.from_lists( predict_data=predict_data ) + model = TextEmbedder(backbone=TEST_BACKBONE) + trainer = flash.Trainer(gpus=torch.cuda.device_count()) predictions = trainer.predict(model, datamodule=datamodule) assert [t.size() for t in predictions[0]] == [torch.Size([384]), torch.Size([384]), torch.Size([384])] From 14a5e27ee12cc7101be4b39525f4297224677bd9 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Thu, 9 Dec 2021 14:26:00 +0000 Subject: [PATCH 22/23] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- tests/text/embedding/test_model.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/tests/text/embedding/test_model.py b/tests/text/embedding/test_model.py index 827cbb2fe9..0a712b3b3a 100644 --- a/tests/text/embedding/test_model.py +++ b/tests/text/embedding/test_model.py @@ -35,9 +35,7 @@ @pytest.mark.skipif(os.name == "nt", reason="Huggingface timing out on Windows") @pytest.mark.skipif(not _TEXT_TESTING, reason="text libraries aren't installed.") def test_predict(tmpdir): - datamodule = TextClassificationData.from_lists( - predict_data=predict_data - ) + datamodule = TextClassificationData.from_lists(predict_data=predict_data) model = TextEmbedder(backbone=TEST_BACKBONE) trainer = flash.Trainer(gpus=torch.cuda.device_count()) From 3d14659926e4afb9e09aceba86b6dcb3a4205892 Mon Sep 17 00:00:00 2001 From: Abhijith Neil Abraham Date: Thu, 9 Dec 2021 20:19:50 +0530 Subject: [PATCH 23/23] Update CHANGELOG.md --- CHANGELOG.md | 1 + 1 file changed, 1 insertion(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 94aff3f61d..89d200eeff 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,7 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). 
## [Unreleased] - YYYY-MM-DD ### Added +- Added `TextEmbedder` task ([#996](https://github.com/PyTorchLightning/lightning-flash/pull/996)) +- Added `predict_kwargs` in `ObjectDetector`, `InstanceSegmentation`, `KeypointDetector` ([#990](https://github.com/PyTorchLightning/lightning-flash/pull/990))
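For quick reference, the end-to-end usage this series converges on (after the `SentenceEmbedder` to `TextEmbedder` rename in PATCH 16 and the later cleanups) is sketched below. This is a minimal sketch assembled from flash_examples/text_embedder.py and tests/text/embedding/test_model.py as they stand at the end of the series; the 384-dimensional output shape is the one asserted in test_predict for the all-MiniLM-L6-v2 backbone, and the GPU-count handling simply mirrors the example script.

import torch

import flash
from flash.text import TextClassificationData, TextEmbedder

# Only predict_data is needed: TextEmbedder deliberately raises
# NotImplementedError from training_step/validation_step/test_step.
datamodule = TextClassificationData.from_lists(
    predict_data=[
        "The worst movie in the history of cinema.",
        "I come from Bulgaria where it 's almost impossible to have a tornado.",
    ]
)

# Any sentence-transformers checkpoint should work as the backbone.
model = TextEmbedder(backbone="sentence-transformers/all-MiniLM-L6-v2")

# Predicting through the Trainer yields one embedding tensor per input sentence.
trainer = flash.Trainer(gpus=torch.cuda.device_count())
predictions = trainer.predict(model, datamodule=datamodule)
print([t.size() for t in predictions[0]])  # expected: [torch.Size([384]), torch.Size([384])]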