From 98ed529a678bd300f67af85e5c618753b8150a41 Mon Sep 17 00:00:00 2001 From: Abhijith Neil Abraham Date: Wed, 24 Nov 2021 21:06:10 +0530 Subject: [PATCH 01/23] Sentence Embedder API using sentence transformers --- flash/core/utilities/providers.py | 1 + flash/text/embeddings/__init__.py | 1 + flash/text/embeddings/backbones.py | 14 +++ flash/text/embeddings/data.py | 1 + flash/text/embeddings/model.py | 137 +++++++++++++++++++++++++++++ 5 files changed, 154 insertions(+) create mode 100644 flash/text/embeddings/__init__.py create mode 100644 flash/text/embeddings/backbones.py create mode 100644 flash/text/embeddings/data.py create mode 100644 flash/text/embeddings/model.py diff --git a/flash/core/utilities/providers.py b/flash/core/utilities/providers.py index 4c2af721a9..422b019992 100644 --- a/flash/core/utilities/providers.py +++ b/flash/core/utilities/providers.py @@ -42,6 +42,7 @@ def __str__(self): _LEARN2LEARN = Provider("learnables/learn2learn", "https://github.com/learnables/learn2learn") _PYSTICHE = Provider("pystiche/pystiche", "https://github.com/pystiche/pystiche") _HUGGINGFACE = Provider("Hugging Face/transformers", "https://github.com/huggingface/transformers") +_SENTENCE_TRANSFORMERS = Provider("UKPLab/sentence-transformers", "https://github.com/UKPLab/sentence-transformers") _FAIRSEQ = Provider("PyTorch/fairseq", "https://github.com/pytorch/fairseq") _OPEN3D_ML = Provider("Intelligent Systems Lab Org/Open3D-ML", "https://github.com/isl-org/Open3D-ML") _PYTORCHVIDEO = Provider("Facebook Research/PyTorchVideo", "https://github.com/facebookresearch/pytorchvideo") diff --git a/flash/text/embeddings/__init__.py b/flash/text/embeddings/__init__.py new file mode 100644 index 0000000000..c49d81d2ae --- /dev/null +++ b/flash/text/embeddings/__init__.py @@ -0,0 +1 @@ +from flash.text.embeddings.model import SentenceEmbedder diff --git a/flash/text/embeddings/backbones.py b/flash/text/embeddings/backbones.py new file mode 100644 index 0000000000..7ecc98adcc --- /dev/null +++ b/flash/text/embeddings/backbones.py @@ -0,0 +1,14 @@ +from flash.core.registry import ExternalRegistry, FlashRegistry +from flash.core.utilities.imports import _TEXT_AVAILABLE +from flash.core.utilities.providers import _SENTENCE_TRANSFORMERS + +SENTENCE_TRANSFORMERS_BACKBONE = FlashRegistry("backbones") + +if _TEXT_AVAILABLE: + from sentence_transformers import SentenceTransformer + + SENTENCE_TRANSFORMERS_BACKBONE += ExternalRegistry( + SentenceTransformer, + "backbones", + _SENTENCE_TRANSFORMERS, + ) diff --git a/flash/text/embeddings/data.py b/flash/text/embeddings/data.py new file mode 100644 index 0000000000..16a20a3dac --- /dev/null +++ b/flash/text/embeddings/data.py @@ -0,0 +1 @@ +from typing import Callable, Dict, Optional, Union diff --git a/flash/text/embeddings/model.py b/flash/text/embeddings/model.py new file mode 100644 index 0000000000..be05a77ebb --- /dev/null +++ b/flash/text/embeddings/model.py @@ -0,0 +1,137 @@ +# Copyright The PyTorch Lightning team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import os
+import warnings
+from typing import Any, Callable, Dict, List, Mapping, Optional, Sequence, Tuple, Type, Union
+
+import numpy as np
+import torch
+import torchmetrics
+from pytorch_lightning import Callback
+from pytorch_lightning.utilities import rank_zero_info
+from sentence_transformers import SentenceTransformer
+from torch import nn, Tensor
+from torch.optim.lr_scheduler import _LRScheduler
+from torchmetrics import Metric
+
+from flash.core.data.data_source import DefaultDataKeys
+from flash.core.data.process import Serializer
+from flash.core.finetuning import FlashBaseFinetuning
+from flash.core.model import Task
+from flash.core.registry import FlashRegistry
+from flash.text.embeddings.backbones import SENTENCE_TRANSFORMERS_BACKBONE
+
+
+class SentenceEmbedder(Task):
+    """The ``SentenceEmbedder`` is a :class:`~flash.Task` for generating sentence embeddings, as well as training and
+    validation. For more details, see `embeddings`.
+
+    You can change the backbone to any sentence embedding model from `UKPLab/sentence-transformers
+    <https://github.com/UKPLab/sentence-transformers>`_ using the ``backbone``
+    argument.
+
+    .. note:: When changing the backbone, make sure you pass in the same backbone to the :class:`~flash.Task` and the
+        :class:`~flash.core.data.data_module.DataModule` object! Since this is a Sentence Transformers task, make sure you
+        use a Sentence Transformers model.
+
+    Args:
+        backbone: backbone model to use for the task.
+        loss_fn: Loss function for training.
+        optimizer: Optimizer to use for training, defaults to `torch.optim.Adam`.
+        optimizer_kwargs: Additional kwargs to use when creating the optimizer (if not passed as an instance).
+        scheduler: The scheduler or scheduler class to use.
+        scheduler_kwargs: Additional kwargs to use when creating the scheduler (if not passed as an instance).
+        metrics: Metrics to compute for training and evaluation. Defaults to calculating the ROUGE metric.
+            Changing this argument currently has no effect.
+ learning_rate: Learning rate to use for training, defaults to `3e-4` + enable_ort: Enable Torch ONNX Runtime Optimization: https://onnxruntime.ai/docs/#onnx-runtime-for-training + """ + + required_extras: str = "text" + + backbones: FlashRegistry = SENTENCE_TRANSFORMERS_BACKBONE + + def __init__( + self, + backbone: str = "all-MiniLM-L6-v2", + enable_ort: bool = False, + ): + + os.environ["TOKENIZERS_PARALLELISM"] = "TRUE" + # disable HF thousand warnings + warnings.simplefilter("ignore") + # set os environ variable for multiprocesses + os.environ["PYTHONWARNINGS"] = "ignore" + super().__init__( + ) + self.model = self.backbones.get(backbone)() + + def generate_embeddings( + self, + sentences: Union[str, List[str]], + batch_size: int = 32, + show_progress_bar: bool = None, + output_value: str = "sentence_embedding", + convert_to_numpy: bool = True, + convert_to_tensor: bool = False, + device: str = None, + normalize_embeddings: bool = False, + ) -> Union[List[Tensor], np.ndarray, Tensor]: + + return self.model.encode( + sentences=sentences, + batch_size=batch_size, + show_progress_bar=show_progress_bar, + output_value=output_value, + convert_to_numpy=convert_to_numpy, + convert_to_tensor=convert_to_tensor, + device=device, + normalize_embeddings=normalize_embeddings, + ) + + @property + def backbone(self): + return self.model.base_model + + def training_step(self, batch: Any, batch_idx: int) -> Any: + """For the training step, we just extract the :attr:`~flash.core.data.data_source.DefaultDataKeys.INPUT` and + :attr:`~flash.core.data.data_source.DefaultDataKeys.TARGET` keys from the input and forward them to the + :meth:`~flash.core.model.Task.training_step`.""" + batch = (batch[DefaultDataKeys.INPUT], batch[DefaultDataKeys.TARGET]) + return super().training_step(batch, batch_idx) + + def validation_step(self, batch: Any, batch_idx: int) -> Any: + """For the validation step, we just extract the :attr:`~flash.core.data.data_source.DefaultDataKeys.INPUT` and + :attr:`~flash.core.data.data_source.DefaultDataKeys.TARGET` keys from the input and forward them to the + :meth:`~flash.core.model.Task.validation_step`.""" + batch = (batch[DefaultDataKeys.INPUT], batch[DefaultDataKeys.TARGET]) + return super().validation_step(batch, batch_idx) + + def test_step(self, batch: Any, batch_idx: int) -> Any: + """For the test step, we just extract the :attr:`~flash.core.data.data_source.DefaultDataKeys.INPUT` and + :attr:`~flash.core.data.data_source.DefaultDataKeys.TARGET` keys from the input and forward them to the + :meth:`~flash.core.model.Task.test_step`.""" + batch = (batch[DefaultDataKeys.INPUT], batch[DefaultDataKeys.TARGET]) + return super().test_step(batch, batch_idx) + + def predict_step(self, batch: Any, batch_idx: int, dataloader_idx: int = 0) -> Any: + """For the predict step, we just extract the :attr:`~flash.core.data.data_source.DefaultDataKeys.INPUT` key + from the input and forward it to the :meth:`~flash.core.model.Task.predict_step`.""" + batch = batch[DefaultDataKeys.INPUT] + return super().predict_step(batch, batch_idx, dataloader_idx=dataloader_idx) + + def forward(self, x) -> torch.Tensor: + """First call the backbone, then the model head.""" + x = self.backbone(x) + return self.head(x) From 83fbf1efafe416c7c6061b4da6400b08ef43eba2 Mon Sep 17 00:00:00 2001 From: Abhijith Neil Abraham Date: Wed, 24 Nov 2021 21:08:05 +0530 Subject: [PATCH 02/23] remove train, test and pred step --- flash/text/embeddings/model.py | 27 --------------------------- 1 file changed, 27 deletions(-) diff 
--git a/flash/text/embeddings/model.py b/flash/text/embeddings/model.py index be05a77ebb..828a62087f 100644 --- a/flash/text/embeddings/model.py +++ b/flash/text/embeddings/model.py @@ -104,33 +104,6 @@ def generate_embeddings( def backbone(self): return self.model.base_model - def training_step(self, batch: Any, batch_idx: int) -> Any: - """For the training step, we just extract the :attr:`~flash.core.data.data_source.DefaultDataKeys.INPUT` and - :attr:`~flash.core.data.data_source.DefaultDataKeys.TARGET` keys from the input and forward them to the - :meth:`~flash.core.model.Task.training_step`.""" - batch = (batch[DefaultDataKeys.INPUT], batch[DefaultDataKeys.TARGET]) - return super().training_step(batch, batch_idx) - - def validation_step(self, batch: Any, batch_idx: int) -> Any: - """For the validation step, we just extract the :attr:`~flash.core.data.data_source.DefaultDataKeys.INPUT` and - :attr:`~flash.core.data.data_source.DefaultDataKeys.TARGET` keys from the input and forward them to the - :meth:`~flash.core.model.Task.validation_step`.""" - batch = (batch[DefaultDataKeys.INPUT], batch[DefaultDataKeys.TARGET]) - return super().validation_step(batch, batch_idx) - - def test_step(self, batch: Any, batch_idx: int) -> Any: - """For the test step, we just extract the :attr:`~flash.core.data.data_source.DefaultDataKeys.INPUT` and - :attr:`~flash.core.data.data_source.DefaultDataKeys.TARGET` keys from the input and forward them to the - :meth:`~flash.core.model.Task.test_step`.""" - batch = (batch[DefaultDataKeys.INPUT], batch[DefaultDataKeys.TARGET]) - return super().test_step(batch, batch_idx) - - def predict_step(self, batch: Any, batch_idx: int, dataloader_idx: int = 0) -> Any: - """For the predict step, we just extract the :attr:`~flash.core.data.data_source.DefaultDataKeys.INPUT` key - from the input and forward it to the :meth:`~flash.core.model.Task.predict_step`.""" - batch = batch[DefaultDataKeys.INPUT] - return super().predict_step(batch, batch_idx, dataloader_idx=dataloader_idx) - def forward(self, x) -> torch.Tensor: """First call the backbone, then the model head.""" x = self.backbone(x) From 4ad3abb2ccd1e44d852a0dc1232c4a655ca7b886 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Wed, 24 Nov 2021 15:39:33 +0000 Subject: [PATCH 03/23] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- flash/text/embeddings/model.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/flash/text/embeddings/model.py b/flash/text/embeddings/model.py index 828a62087f..9b5d22cdf4 100644 --- a/flash/text/embeddings/model.py +++ b/flash/text/embeddings/model.py @@ -73,8 +73,7 @@ def __init__( warnings.simplefilter("ignore") # set os environ variable for multiprocesses os.environ["PYTHONWARNINGS"] = "ignore" - super().__init__( - ) + super().__init__() self.model = self.backbones.get(backbone)() def generate_embeddings( From 8cd939567247e987bae267335a07988557862e85 Mon Sep 17 00:00:00 2001 From: Abhijith Neil Abraham Date: Sun, 5 Dec 2021 17:06:23 +0530 Subject: [PATCH 04/23] sentence embedders with forward step and predict step --- flash/text/embeddings/backbones.py | 20 ++- flash/text/embeddings/data.py | 221 ++++++++++++++++++++++++++++- flash/text/embeddings/model.py | 161 ++++++++++++++++----- requirements.txt | 1 + 4 files changed, 363 insertions(+), 40 deletions(-) diff --git a/flash/text/embeddings/backbones.py b/flash/text/embeddings/backbones.py index 
7ecc98adcc..002b1c69d3 100644 --- a/flash/text/embeddings/backbones.py +++ b/flash/text/embeddings/backbones.py @@ -1,14 +1,24 @@ from flash.core.registry import ExternalRegistry, FlashRegistry from flash.core.utilities.imports import _TEXT_AVAILABLE -from flash.core.utilities.providers import _SENTENCE_TRANSFORMERS +from flash.core.utilities.providers import _HUGGINGFACE SENTENCE_TRANSFORMERS_BACKBONE = FlashRegistry("backbones") if _TEXT_AVAILABLE: - from sentence_transformers import SentenceTransformer + from transformers import AutoModel, AutoTokenizer, AutoConfig - SENTENCE_TRANSFORMERS_BACKBONE += ExternalRegistry( - SentenceTransformer, + AUTOMODEL_BACKBONE = ExternalRegistry( + AutoModel.from_pretrained, "backbones", - _SENTENCE_TRANSFORMERS, + _HUGGINGFACE, ) + AUTOTOKENIZER_BACKBONE= ExternalRegistry( + AutoTokenizer.from_pretrained, + "backbones", + _HUGGINGFACE, + ) + AUTOCONFIG_BACKBONE=ExternalRegistry( + AutoConfig.from_pretrained, + "backbones", + _HUGGINGFACE, + ) diff --git a/flash/text/embeddings/data.py b/flash/text/embeddings/data.py index 16a20a3dac..84232df9e0 100644 --- a/flash/text/embeddings/data.py +++ b/flash/text/embeddings/data.py @@ -1 +1,220 @@ -from typing import Callable, Dict, Optional, Union +# Copyright The PyTorch Lightning team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+from functools import partial +from typing import Any, Callable, Dict, List, Mapping, Optional, Sequence, Tuple, Union + +import torch +from pandas.core.frame import DataFrame + +import flash +from flash.core.data.auto_dataset import AutoDataset +from flash.core.data.io.input import DataKeys, Input, LabelsState +from flash.core.utilities.imports import _TEXT_AVAILABLE, requires + +if _TEXT_AVAILABLE: + from datasets import Dataset, load_dataset + from transformers import AutoTokenizer + + + +class TextInput(Input): + @requires("text") + def __init__(self, backbone: str, max_length: int = 128): + super().__init__() + + self.backbone = backbone + self.tokenizer = AutoTokenizer.from_pretrained(backbone, use_fast=True) + self.max_length = max_length + + def _tokenize_fn( + self, + ex: Union[Dict[str, str], str], + input: Optional[str] = None, + ) -> Callable: + """This function is used to tokenize sentences using the provided tokenizer.""" + return self.tokenizer(ex[input], max_length=self.max_length, truncation=True, padding="max_length") + + @staticmethod + def _transform_label(label_to_class_mapping: Dict[str, int], target: str, ex: Dict[str, Union[int, str]]): + ex[target] = label_to_class_mapping[ex[target]] + return ex + + @staticmethod + def _multilabel_target(targets: List[str], element: Dict[str, Any]) -> Dict[str, Any]: + targets = [element.pop(target) for target in targets] + element[DataKeys.TARGET] = targets + return element + + def _to_hf_dataset(self, data) -> Sequence[Mapping[str, Any]]: + """account for flash CI testing context.""" + hf_dataset, *other = self.to_hf_dataset(data) + + if flash._IS_TESTING and not torch.cuda.is_available(): + # NOTE: must subset in this way to return a Dataset + hf_dataset = hf_dataset.select(range(20)) + + return (hf_dataset, *other) + + def load_data( + self, + data: Tuple[str, Union[str, List[str]], Union[str, List[str]]], + dataset: Optional[Any] = None, + ) -> Sequence[Mapping[str, Any]]: + """Loads data into HuggingFace datasets.Dataset.""" + + hf_dataset, input, *other = self._to_hf_dataset(data) + + if not self.predicting: + target: Union[str, List[str]] = other.pop() + if isinstance(target, List): + # multi-target + dataset.multi_label = True + hf_dataset = hf_dataset.map(partial(self._multilabel_target, target)) # NOTE: renames target column + dataset.num_classes = len(target) + self.set_state(LabelsState(target)) + else: + dataset.multi_label = False + if self.training: + labels = list(sorted(list(set(hf_dataset[target])))) + dataset.num_classes = len(labels) + self.set_state(LabelsState(labels)) + + labels = self.get_state(LabelsState) + + # convert labels to ids (note: the target column get overwritten) + if labels is not None: + labels = labels.labels + label_to_class_mapping = {v: k for k, v in enumerate(labels)} + hf_dataset = hf_dataset.map(partial(self._transform_label, label_to_class_mapping, target)) + + # rename label column + hf_dataset = hf_dataset.rename_column(target, DataKeys.TARGET) + + # remove extra columns + extra_columns = set(hf_dataset.column_names) - {input, DataKeys.TARGET} + hf_dataset = hf_dataset.remove_columns(extra_columns) + + # tokenize + hf_dataset = hf_dataset.map(partial(self._tokenize_fn, input=input), batched=True, remove_columns=[input]) + + # set format + hf_dataset.set_format("torch") + + return hf_dataset + + def predict_load_data(self, data: Any, dataset: AutoDataset): + return self.load_data(data, dataset) + + def __getstate__(self): # TODO: Find out why this is being pickled + state = 
self.__dict__.copy() + state.pop("tokenizer") + return state + + def __setstate__(self, state): + self.__dict__.update(state) + self.tokenizer = AutoTokenizer.from_pretrained(self.backbone, use_fast=True) + + +class TextCSVInput(TextInput): + def to_hf_dataset(self, data: Tuple[str, str, str]) -> Tuple[Sequence[Mapping[str, Any]], str, str]: + file, *other = data + dataset_dict = load_dataset("csv", data_files={"train": str(file)}) + return (dataset_dict["train"], *other) + + +class TextJSONInput(TextInput): + def to_hf_dataset(self, data: Tuple[str, str, str, str]) -> Tuple[Sequence[Mapping[str, Any]], str, str]: + file, *other, field = data + dataset_dict = load_dataset("json", data_files={"train": str(file)}, field=field) + return (dataset_dict["train"], *other) + + +class TextDataFrameInput(TextInput): + def to_hf_dataset(self, data: Tuple[DataFrame, str, str]) -> Tuple[Sequence[Mapping[str, Any]], str, str]: + df, *other = data + hf_dataset = Dataset.from_pandas(df) + return (hf_dataset, *other) + + +class TextParquetInput(TextInput): + def to_hf_dataset(self, data: Tuple[str, str, str]) -> Tuple[Sequence[Mapping[str, Any]], str, str]: + file, *other = data + hf_dataset = Dataset.from_parquet(str(file)) + return (hf_dataset, *other) + + +class TextHuggingFaceDatasetInput(TextInput): + def to_hf_dataset(self, data: Tuple[str, str, str]) -> Tuple[Sequence[Mapping[str, Any]], str, str]: + hf_dataset, *other = data + return (hf_dataset, *other) + + +class TextListInput(TextInput): + def to_hf_dataset( + self, data: Union[Tuple[List[str], List[str]], List[str]] + ) -> Tuple[Sequence[Mapping[str, Any]], Optional[List[str]]]: + + if isinstance(data, tuple): + input_list, target_list = data + # NOTE: here we already deal with multilabels + # NOTE: here we already rename to correct column names + hf_dataset = Dataset.from_dict({DataKeys.INPUT: input_list, DataKeys.TARGET: target_list}) + return hf_dataset, target_list + + # predicting + hf_dataset = Dataset.from_dict({DataKeys.INPUT: data}) + + return (hf_dataset,) + + def load_data( + self, + data: Tuple[List[str], Union[List[Any], List[List[Any]]]], + dataset: Optional[Any] = None, + ) -> Sequence[Mapping[str, Any]]: + + hf_dataset, *other = self._to_hf_dataset(data) + + if not self.predicting: + target_list = other.pop() + if isinstance(target_list[0], List): + # multi-target_list + dataset.multi_label = True + dataset.num_classes = len(target_list[0]) + self.set_state(LabelsState(target_list)) + else: + dataset.multi_label = False + if self.training: + labels = list(sorted(list(set(hf_dataset[DataKeys.TARGET])))) + dataset.num_classes = len(labels) + self.set_state(LabelsState(labels)) + + labels = self.get_state(LabelsState) + + # convert labels to ids + if labels is not None: + labels = labels.labels + label_to_class_mapping = {v: k for k, v in enumerate(labels)} + # happens in-place and keeps the target column name + hf_dataset = hf_dataset.map(partial(self._transform_label, label_to_class_mapping, DataKeys.TARGET)) + + # tokenize + hf_dataset = hf_dataset.map(partial(self._tokenize_fn, input=DataKeys.INPUT), batched=True) + + # set format + hf_dataset = hf_dataset.remove_columns([DataKeys.INPUT]) # just leave the numerical columns + hf_dataset.set_format("torch") + + return hf_dataset + + diff --git a/flash/text/embeddings/model.py b/flash/text/embeddings/model.py index 9b5d22cdf4..090a581d90 100644 --- a/flash/text/embeddings/model.py +++ b/flash/text/embeddings/model.py @@ -14,24 +14,24 @@ import os import warnings from typing import 
Any, Callable, Dict, List, Mapping, Optional, Sequence, Tuple, Type, Union - +import logging import numpy as np import torch import torchmetrics from pytorch_lightning import Callback from pytorch_lightning.utilities import rank_zero_info -from sentence_transformers import SentenceTransformer from torch import nn, Tensor from torch.optim.lr_scheduler import _LRScheduler from torchmetrics import Metric - +from tqdm.autonotebook import trange from flash.core.data.data_source import DefaultDataKeys from flash.core.data.process import Serializer from flash.core.finetuning import FlashBaseFinetuning from flash.core.model import Task from flash.core.registry import FlashRegistry -from flash.text.embeddings.backbones import SENTENCE_TRANSFORMERS_BACKBONE +from flash.text.embeddings.backbones import AUTOMODEL_BACKBONE,AUTOTOKENIZER_BACKBONE,AUTOCONFIG_BACKBONE +logger = logging.getLogger(__name__) class SentenceEmbedder(Task): """The ``SentenceEmbedder`` is a :class:`~flash.Task` for generating sentence embeddings, training and @@ -60,50 +60,143 @@ class SentenceEmbedder(Task): required_extras: str = "text" - backbones: FlashRegistry = SENTENCE_TRANSFORMERS_BACKBONE + AutoModel_Backbones: FlashRegistry = AUTOMODEL_BACKBONE + AutoTokenizer_Backbones: FlashRegistry = AUTOTOKENIZER_BACKBONE + AutoConfig_Backbones: FlashRegistry = AUTOCONFIG_BACKBONE def __init__( self, - backbone: str = "all-MiniLM-L6-v2", + model_backbone: str = "sentence-transformers/all-MiniLM-L6-v2", + tokenizer_backbone:str = "sentence-transformers/all-MiniLM-L6-v2", enable_ort: bool = False, ): - os.environ["TOKENIZERS_PARALLELISM"] = "TRUE" # disable HF thousand warnings warnings.simplefilter("ignore") # set os environ variable for multiprocesses os.environ["PYTHONWARNINGS"] = "ignore" super().__init__() - self.model = self.backbones.get(backbone)() - - def generate_embeddings( - self, - sentences: Union[str, List[str]], - batch_size: int = 32, - show_progress_bar: bool = None, - output_value: str = "sentence_embedding", - convert_to_numpy: bool = True, - convert_to_tensor: bool = False, - device: str = None, - normalize_embeddings: bool = False, - ) -> Union[List[Tensor], np.ndarray, Tensor]: - - return self.model.encode( - sentences=sentences, - batch_size=batch_size, - show_progress_bar=show_progress_bar, - output_value=output_value, - convert_to_numpy=convert_to_numpy, - convert_to_tensor=convert_to_tensor, - device=device, - normalize_embeddings=normalize_embeddings, - ) + + self.config=self.AutoConfig_Backbones.get(model_backbone) + self.auto_model = self.AutoModel_Backbones.get(model_backbone) + self.tokenzier=self.AutoTokenizer_Backbones.get(tokenizer_backbone) + + if tokenizer_backbone is not None: + self.auto_model.config.tokenizer_class = self.tokenizer.__class__.__name__ + + def predict_step(self, sentences: Union[str, List[str]], + batch_size: int = 32, + show_progress_bar: bool = None, + output_value: str = 'sentence_embedding', + convert_to_numpy: bool = True, + convert_to_tensor: bool = False, + device: str = None, + normalize_embeddings: bool = False) -> Union[List[Tensor], np.ndarray, Tensor]: + """ + Computes sentence embeddings + :param sentences: the sentences to embed + :param batch_size: the batch size used for the computation + :param show_progress_bar: Output a progress bar when encode sentences + :param output_value: Default sentence_embedding, to get sentence embeddings. Can be set to token_embeddings to get wordpiece token embeddings. 
Set to None, to get all output values + :param convert_to_numpy: If true, the output is a list of numpy vectors. Else, it is a list of pytorch tensors. + :param convert_to_tensor: If true, you get one large tensor as return. Overwrites any setting from convert_to_numpy + :param device: Which torch.device to use for the computation + :param normalize_embeddings: If set to true, returned vectors will have length 1. In that case, the faster dot-product (util.dot_score) instead of cosine similarity can be used. + :return: + By default, a list of tensors is returned. If convert_to_tensor, a stacked tensor is returned. If convert_to_numpy, a numpy matrix is returned. + """ + self.eval() + if show_progress_bar is None: + show_progress_bar = (logger.getEffectiveLevel()==logging.INFO or logger.getEffectiveLevel()==logging.DEBUG) + + if convert_to_tensor: + convert_to_numpy = False + + if output_value != 'sentence_embedding': + convert_to_tensor = False + convert_to_numpy = False + + input_was_string = False + if isinstance(sentences, str) or not hasattr(sentences, '__len__'): #Cast an individual sentence to a list with length 1 + sentences = [sentences] + input_was_string = True + + if device is None: + device = self._target_device + + self.to(device) + + all_embeddings = [] + length_sorted_idx = np.argsort([-self._text_length(sen) for sen in sentences]) + sentences_sorted = [sentences[idx] for idx in length_sorted_idx] + + for start_index in trange(0, len(sentences), batch_size, desc="Batches", disable=not show_progress_bar): + sentences_batch = sentences_sorted[start_index:start_index+batch_size] + features = self.tokenize(sentences_batch) + + with torch.no_grad(): + out_features = self.forward(features) + + if output_value == 'token_embeddings': + embeddings = [] + for token_emb, attention in zip(out_features[output_value], out_features['attention_mask']): + last_mask_id = len(attention)-1 + while last_mask_id > 0 and attention[last_mask_id].item() == 0: + last_mask_id -= 1 + + embeddings.append(token_emb[0:last_mask_id+1]) + elif output_value is None: #Return all outputs + embeddings = [] + for sent_idx in range(len(out_features['sentence_embedding'])): + row = {name: out_features[name][sent_idx] for name in out_features} + embeddings.append(row) + else: #Sentence embeddings + embeddings = out_features[output_value] + embeddings = embeddings.detach() + if normalize_embeddings: + embeddings = torch.nn.functional.normalize(embeddings, p=2, dim=1) + + # fixes for #522 and #487 to avoid oom problems on gpu with large datasets + if convert_to_numpy: + embeddings = embeddings.cpu() + + all_embeddings.extend(embeddings) + + all_embeddings = [all_embeddings[idx] for idx in np.argsort(length_sorted_idx)] + + if convert_to_tensor: + all_embeddings = torch.stack(all_embeddings) + elif convert_to_numpy: + all_embeddings = np.asarray([emb.numpy() for emb in all_embeddings]) + + if input_was_string: + all_embeddings = all_embeddings[0] + + return all_embeddings @property def backbone(self): return self.model.base_model - def forward(self, x) -> torch.Tensor: + def forward(self,batch: Dict[str, torch.Tensor]) -> torch.Tensor: """First call the backbone, then the model head.""" - x = self.backbone(x) - return self.head(x) + + trans_features = {'input_ids': batch['input_ids'], 'attention_mask': batch['attention_mask']} + if 'token_type_ids' in batch: + trans_features['token_type_ids'] = batch['token_type_ids'] + + output_states = self.auto_model(**trans_features, return_dict=False) + output_tokens = 
output_states[0] + + batch.update({'token_embeddings': output_tokens, 'attention_mask': batch['attention_mask']}) + + if self.auto_model.config.output_hidden_states: + all_layer_idx = 2 + if len(output_states) < 3: #Some models only output last_hidden_states and all_hidden_states + all_layer_idx = 1 + + hidden_states = output_states[all_layer_idx] + batch.update({'all_layer_embeddings': hidden_states}) + + return batch + diff --git a/requirements.txt b/requirements.txt index 0920edcb4c..333125ff76 100644 --- a/requirements.txt +++ b/requirements.txt @@ -7,3 +7,4 @@ pyDeprecate pandas>=1.1.0 jsonargparse[signatures]>=3.17.0 click>=7.1.2 +tqdm From 34f39d1bd1c049875486918d7b5ae4818b1eb28a Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Sun, 5 Dec 2021 11:37:21 +0000 Subject: [PATCH 05/23] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- flash/text/embeddings/backbones.py | 14 +- flash/text/embeddings/data.py | 3 - flash/text/embeddings/model.py | 229 +++++++++++++++-------------- 3 files changed, 126 insertions(+), 120 deletions(-) diff --git a/flash/text/embeddings/backbones.py b/flash/text/embeddings/backbones.py index 002b1c69d3..6fbf4036f2 100644 --- a/flash/text/embeddings/backbones.py +++ b/flash/text/embeddings/backbones.py @@ -5,20 +5,20 @@ SENTENCE_TRANSFORMERS_BACKBONE = FlashRegistry("backbones") if _TEXT_AVAILABLE: - from transformers import AutoModel, AutoTokenizer, AutoConfig + from transformers import AutoConfig, AutoModel, AutoTokenizer AUTOMODEL_BACKBONE = ExternalRegistry( AutoModel.from_pretrained, "backbones", _HUGGINGFACE, ) - AUTOTOKENIZER_BACKBONE= ExternalRegistry( + AUTOTOKENIZER_BACKBONE = ExternalRegistry( AutoTokenizer.from_pretrained, "backbones", _HUGGINGFACE, ) - AUTOCONFIG_BACKBONE=ExternalRegistry( - AutoConfig.from_pretrained, - "backbones", - _HUGGINGFACE, - ) + AUTOCONFIG_BACKBONE = ExternalRegistry( + AutoConfig.from_pretrained, + "backbones", + _HUGGINGFACE, + ) diff --git a/flash/text/embeddings/data.py b/flash/text/embeddings/data.py index 84232df9e0..1be078354d 100644 --- a/flash/text/embeddings/data.py +++ b/flash/text/embeddings/data.py @@ -25,7 +25,6 @@ if _TEXT_AVAILABLE: from datasets import Dataset, load_dataset from transformers import AutoTokenizer - class TextInput(Input): @@ -216,5 +215,3 @@ def load_data( hf_dataset.set_format("torch") return hf_dataset - - diff --git a/flash/text/embeddings/model.py b/flash/text/embeddings/model.py index 090a581d90..f1b2165fae 100644 --- a/flash/text/embeddings/model.py +++ b/flash/text/embeddings/model.py @@ -11,10 +11,11 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
+import logging import os import warnings from typing import Any, Callable, Dict, List, Mapping, Optional, Sequence, Tuple, Type, Union -import logging + import numpy as np import torch import torchmetrics @@ -24,15 +25,17 @@ from torch.optim.lr_scheduler import _LRScheduler from torchmetrics import Metric from tqdm.autonotebook import trange + from flash.core.data.data_source import DefaultDataKeys from flash.core.data.process import Serializer from flash.core.finetuning import FlashBaseFinetuning from flash.core.model import Task from flash.core.registry import FlashRegistry -from flash.text.embeddings.backbones import AUTOMODEL_BACKBONE,AUTOTOKENIZER_BACKBONE,AUTOCONFIG_BACKBONE +from flash.text.embeddings.backbones import AUTOCONFIG_BACKBONE, AUTOMODEL_BACKBONE, AUTOTOKENIZER_BACKBONE logger = logging.getLogger(__name__) + class SentenceEmbedder(Task): """The ``SentenceEmbedder`` is a :class:`~flash.Task` for generating sentence embeddings, training and validation. For more details, see `embeddings`. @@ -67,7 +70,7 @@ class SentenceEmbedder(Task): def __init__( self, model_backbone: str = "sentence-transformers/all-MiniLM-L6-v2", - tokenizer_backbone:str = "sentence-transformers/all-MiniLM-L6-v2", + tokenizer_backbone: str = "sentence-transformers/all-MiniLM-L6-v2", enable_ort: bool = False, ): os.environ["TOKENIZERS_PARALLELISM"] = "TRUE" @@ -76,127 +79,133 @@ def __init__( # set os environ variable for multiprocesses os.environ["PYTHONWARNINGS"] = "ignore" super().__init__() - - self.config=self.AutoConfig_Backbones.get(model_backbone) + + self.config = self.AutoConfig_Backbones.get(model_backbone) self.auto_model = self.AutoModel_Backbones.get(model_backbone) - self.tokenzier=self.AutoTokenizer_Backbones.get(tokenizer_backbone) - + self.tokenzier = self.AutoTokenizer_Backbones.get(tokenizer_backbone) + if tokenizer_backbone is not None: self.auto_model.config.tokenizer_class = self.tokenizer.__class__.__name__ - - def predict_step(self, sentences: Union[str, List[str]], - batch_size: int = 32, - show_progress_bar: bool = None, - output_value: str = 'sentence_embedding', - convert_to_numpy: bool = True, - convert_to_tensor: bool = False, - device: str = None, - normalize_embeddings: bool = False) -> Union[List[Tensor], np.ndarray, Tensor]: - """ - Computes sentence embeddings - :param sentences: the sentences to embed - :param batch_size: the batch size used for the computation - :param show_progress_bar: Output a progress bar when encode sentences - :param output_value: Default sentence_embedding, to get sentence embeddings. Can be set to token_embeddings to get wordpiece token embeddings. Set to None, to get all output values - :param convert_to_numpy: If true, the output is a list of numpy vectors. Else, it is a list of pytorch tensors. - :param convert_to_tensor: If true, you get one large tensor as return. Overwrites any setting from convert_to_numpy - :param device: Which torch.device to use for the computation - :param normalize_embeddings: If set to true, returned vectors will have length 1. In that case, the faster dot-product (util.dot_score) instead of cosine similarity can be used. - :return: - By default, a list of tensors is returned. If convert_to_tensor, a stacked tensor is returned. If convert_to_numpy, a numpy matrix is returned. 
- """ - self.eval() - if show_progress_bar is None: - show_progress_bar = (logger.getEffectiveLevel()==logging.INFO or logger.getEffectiveLevel()==logging.DEBUG) - - if convert_to_tensor: - convert_to_numpy = False - - if output_value != 'sentence_embedding': - convert_to_tensor = False - convert_to_numpy = False - - input_was_string = False - if isinstance(sentences, str) or not hasattr(sentences, '__len__'): #Cast an individual sentence to a list with length 1 - sentences = [sentences] - input_was_string = True - - if device is None: - device = self._target_device - - self.to(device) - - all_embeddings = [] - length_sorted_idx = np.argsort([-self._text_length(sen) for sen in sentences]) - sentences_sorted = [sentences[idx] for idx in length_sorted_idx] - - for start_index in trange(0, len(sentences), batch_size, desc="Batches", disable=not show_progress_bar): - sentences_batch = sentences_sorted[start_index:start_index+batch_size] - features = self.tokenize(sentences_batch) - - with torch.no_grad(): - out_features = self.forward(features) - - if output_value == 'token_embeddings': - embeddings = [] - for token_emb, attention in zip(out_features[output_value], out_features['attention_mask']): - last_mask_id = len(attention)-1 - while last_mask_id > 0 and attention[last_mask_id].item() == 0: - last_mask_id -= 1 - - embeddings.append(token_emb[0:last_mask_id+1]) - elif output_value is None: #Return all outputs - embeddings = [] - for sent_idx in range(len(out_features['sentence_embedding'])): - row = {name: out_features[name][sent_idx] for name in out_features} - embeddings.append(row) - else: #Sentence embeddings - embeddings = out_features[output_value] - embeddings = embeddings.detach() - if normalize_embeddings: - embeddings = torch.nn.functional.normalize(embeddings, p=2, dim=1) - - # fixes for #522 and #487 to avoid oom problems on gpu with large datasets - if convert_to_numpy: - embeddings = embeddings.cpu() - - all_embeddings.extend(embeddings) - - all_embeddings = [all_embeddings[idx] for idx in np.argsort(length_sorted_idx)] - - if convert_to_tensor: - all_embeddings = torch.stack(all_embeddings) - elif convert_to_numpy: - all_embeddings = np.asarray([emb.numpy() for emb in all_embeddings]) - - if input_was_string: - all_embeddings = all_embeddings[0] - - return all_embeddings + + def predict_step( + self, + sentences: Union[str, List[str]], + batch_size: int = 32, + show_progress_bar: bool = None, + output_value: str = "sentence_embedding", + convert_to_numpy: bool = True, + convert_to_tensor: bool = False, + device: str = None, + normalize_embeddings: bool = False, + ) -> Union[List[Tensor], np.ndarray, Tensor]: + """Computes sentence embeddings. + + :param sentences: the sentences to embed + :param batch_size: the batch size used for the computation + :param show_progress_bar: Output a progress bar when encode sentences + :param output_value: Default sentence_embedding, to get sentence embeddings. Can be set to token_embeddings to get wordpiece token embeddings. Set to None, to get all output values + :param convert_to_numpy: If true, the output is a list of numpy vectors. Else, it is a list of pytorch tensors. + :param convert_to_tensor: If true, you get one large tensor as return. Overwrites any setting from convert_to_numpy + :param device: Which torch.device to use for the computation + :param normalize_embeddings: If set to true, returned vectors will have length 1. In that case, the faster dot-product (util.dot_score) instead of cosine similarity can be used. 
+ :return: + By default, a list of tensors is returned. If convert_to_tensor, a stacked tensor is returned. If convert_to_numpy, a numpy matrix is returned. + """ + self.eval() + if show_progress_bar is None: + show_progress_bar = ( + logger.getEffectiveLevel() == logging.INFO or logger.getEffectiveLevel() == logging.DEBUG + ) + + if convert_to_tensor: + convert_to_numpy = False + + if output_value != "sentence_embedding": + convert_to_tensor = False + convert_to_numpy = False + + input_was_string = False + if isinstance(sentences, str) or not hasattr( + sentences, "__len__" + ): # Cast an individual sentence to a list with length 1 + sentences = [sentences] + input_was_string = True + + if device is None: + device = self._target_device + + self.to(device) + + all_embeddings = [] + length_sorted_idx = np.argsort([-self._text_length(sen) for sen in sentences]) + sentences_sorted = [sentences[idx] for idx in length_sorted_idx] + + for start_index in trange(0, len(sentences), batch_size, desc="Batches", disable=not show_progress_bar): + sentences_batch = sentences_sorted[start_index : start_index + batch_size] + features = self.tokenize(sentences_batch) + + with torch.no_grad(): + out_features = self.forward(features) + + if output_value == "token_embeddings": + embeddings = [] + for token_emb, attention in zip(out_features[output_value], out_features["attention_mask"]): + last_mask_id = len(attention) - 1 + while last_mask_id > 0 and attention[last_mask_id].item() == 0: + last_mask_id -= 1 + + embeddings.append(token_emb[0 : last_mask_id + 1]) + elif output_value is None: # Return all outputs + embeddings = [] + for sent_idx in range(len(out_features["sentence_embedding"])): + row = {name: out_features[name][sent_idx] for name in out_features} + embeddings.append(row) + else: # Sentence embeddings + embeddings = out_features[output_value] + embeddings = embeddings.detach() + if normalize_embeddings: + embeddings = torch.nn.functional.normalize(embeddings, p=2, dim=1) + + # fixes for #522 and #487 to avoid oom problems on gpu with large datasets + if convert_to_numpy: + embeddings = embeddings.cpu() + + all_embeddings.extend(embeddings) + + all_embeddings = [all_embeddings[idx] for idx in np.argsort(length_sorted_idx)] + + if convert_to_tensor: + all_embeddings = torch.stack(all_embeddings) + elif convert_to_numpy: + all_embeddings = np.asarray([emb.numpy() for emb in all_embeddings]) + + if input_was_string: + all_embeddings = all_embeddings[0] + + return all_embeddings @property def backbone(self): return self.model.base_model - def forward(self,batch: Dict[str, torch.Tensor]) -> torch.Tensor: + def forward(self, batch: Dict[str, torch.Tensor]) -> torch.Tensor: """First call the backbone, then the model head.""" - trans_features = {'input_ids': batch['input_ids'], 'attention_mask': batch['attention_mask']} - if 'token_type_ids' in batch: - trans_features['token_type_ids'] = batch['token_type_ids'] - + trans_features = {"input_ids": batch["input_ids"], "attention_mask": batch["attention_mask"]} + if "token_type_ids" in batch: + trans_features["token_type_ids"] = batch["token_type_ids"] + output_states = self.auto_model(**trans_features, return_dict=False) output_tokens = output_states[0] - - batch.update({'token_embeddings': output_tokens, 'attention_mask': batch['attention_mask']}) - + + batch.update({"token_embeddings": output_tokens, "attention_mask": batch["attention_mask"]}) + if self.auto_model.config.output_hidden_states: all_layer_idx = 2 - if len(output_states) < 3: #Some models 
only output last_hidden_states and all_hidden_states + if len(output_states) < 3: # Some models only output last_hidden_states and all_hidden_states all_layer_idx = 1 - + hidden_states = output_states[all_layer_idx] - batch.update({'all_layer_embeddings': hidden_states}) - - return batch + batch.update({"all_layer_embeddings": hidden_states}) + return batch From 219042aee524c652c4ad73e0dbf2e13d780707d0 Mon Sep 17 00:00:00 2001 From: Abhijith Neil Abraham Date: Sun, 5 Dec 2021 22:23:51 +0530 Subject: [PATCH 06/23] Update __init__.py --- flash/text/embeddings/__init__.py | 1 + 1 file changed, 1 insertion(+) diff --git a/flash/text/embeddings/__init__.py b/flash/text/embeddings/__init__.py index c49d81d2ae..5232963676 100644 --- a/flash/text/embeddings/__init__.py +++ b/flash/text/embeddings/__init__.py @@ -1 +1,2 @@ from flash.text.embeddings.model import SentenceEmbedder +from flash.text.embeddings.data import TextInput,TextCSVInput,TextJSONInput,TextDataFrameInput,TextParquetInput,TextHuggingFaceDatasetInput,TextListInput From 4b3c772b7e905e35d2dd1d4c78359ab5b139fcf3 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Sun, 5 Dec 2021 16:54:39 +0000 Subject: [PATCH 07/23] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- flash/text/embeddings/__init__.py | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/flash/text/embeddings/__init__.py b/flash/text/embeddings/__init__.py index 5232963676..3993b0a526 100644 --- a/flash/text/embeddings/__init__.py +++ b/flash/text/embeddings/__init__.py @@ -1,2 +1,10 @@ +from flash.text.embeddings.data import ( + TextCSVInput, + TextDataFrameInput, + TextHuggingFaceDatasetInput, + TextInput, + TextJSONInput, + TextListInput, + TextParquetInput, +) from flash.text.embeddings.model import SentenceEmbedder -from flash.text.embeddings.data import TextInput,TextCSVInput,TextJSONInput,TextDataFrameInput,TextParquetInput,TextHuggingFaceDatasetInput,TextListInput From 06e35bdc664a103c13c0532fea681d700605c6a1 Mon Sep 17 00:00:00 2001 From: Ethan Harris Date: Wed, 8 Dec 2021 17:15:17 +0000 Subject: [PATCH 08/23] Updates --- flash/core/utilities/imports.py | 2 + flash/text/__init__.py | 1 + flash/text/embedding/__init__.py | 1 + flash/text/embedding/backbones.py | 14 ++ flash/text/embedding/model.py | 108 ++++++++++++++ flash/text/embeddings/__init__.py | 10 -- flash/text/embeddings/backbones.py | 24 ---- flash/text/embeddings/data.py | 217 ----------------------------- flash/text/embeddings/model.py | 211 ---------------------------- flash_examples/text_embedder.py | 37 +++++ requirements/datatype_text.txt | 1 + 11 files changed, 164 insertions(+), 462 deletions(-) create mode 100644 flash/text/embedding/__init__.py create mode 100644 flash/text/embedding/backbones.py create mode 100644 flash/text/embedding/model.py delete mode 100644 flash/text/embeddings/__init__.py delete mode 100644 flash/text/embeddings/backbones.py delete mode 100644 flash/text/embeddings/data.py delete mode 100644 flash/text/embeddings/model.py create mode 100644 flash_examples/text_embedder.py diff --git a/flash/core/utilities/imports.py b/flash/core/utilities/imports.py index 581c5cd719..082731f8d0 100644 --- a/flash/core/utilities/imports.py +++ b/flash/core/utilities/imports.py @@ -107,6 +107,7 @@ def _compare_version(package: str, op, version) -> bool: _ALBUMENTATIONS_AVAILABLE = _module_available("albumentations") _BAAL_AVAILABLE = 
_module_available("baal")
 _TORCH_OPTIMIZER_AVAILABLE = _module_available("torch_optimizer")
+_SENTENCE_TRANSFORMERS_AVAILABLE = _module_available("sentence_transformers")
 
 
 if _PIL_AVAILABLE:
@@ -130,6 +131,7 @@ class Image:
         _SENTENCEPIECE_AVAILABLE,
         _DATASETS_AVAILABLE,
         _TM_TEXT_AVAILABLE,
+        _SENTENCE_TRANSFORMERS_AVAILABLE,
     ]
 )
 _TABULAR_AVAILABLE = _TABNET_AVAILABLE and _PANDAS_AVAILABLE and _FORECASTING_AVAILABLE
diff --git a/flash/text/__init__.py b/flash/text/__init__.py
index 7a17659b20..bf87427c9f 100644
--- a/flash/text/__init__.py
+++ b/flash/text/__init__.py
@@ -1,4 +1,5 @@
 from flash.text.classification import TextClassificationData, TextClassifier  # noqa: F401
+from flash.text.embedding import SentenceEmbedder  # noqa: F401
 from flash.text.question_answering import QuestionAnsweringData, QuestionAnsweringTask  # noqa: F401
 from flash.text.seq2seq import (  # noqa: F401
     Seq2SeqData,
diff --git a/flash/text/embedding/__init__.py b/flash/text/embedding/__init__.py
new file mode 100644
index 0000000000..42bc49dd26
--- /dev/null
+++ b/flash/text/embedding/__init__.py
@@ -0,0 +1 @@
+from flash.text.embedding.model import SentenceEmbedder  # noqa: F401
diff --git a/flash/text/embedding/backbones.py b/flash/text/embedding/backbones.py
new file mode 100644
index 0000000000..c421e0179e
--- /dev/null
+++ b/flash/text/embedding/backbones.py
@@ -0,0 +1,14 @@
+from flash.core.registry import ExternalRegistry, FlashRegistry
+from flash.core.utilities.imports import _TEXT_AVAILABLE
+from flash.core.utilities.providers import _HUGGINGFACE
+
+if _TEXT_AVAILABLE:
+    from transformers import AutoModel
+
+    HUGGINGFACE_BACKBONES = ExternalRegistry(
+        AutoModel.from_pretrained,
+        "backbones",
+        _HUGGINGFACE,
+    )
+else:
+    HUGGINGFACE_BACKBONES = FlashRegistry("backbones")
diff --git a/flash/text/embedding/model.py b/flash/text/embedding/model.py
new file mode 100644
index 0000000000..c37a98d8b0
--- /dev/null
+++ b/flash/text/embedding/model.py
@@ -0,0 +1,108 @@
+# Copyright The PyTorch Lightning team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import logging
+import os
+import warnings
+from typing import Any, Dict, List, Optional
+
+import torch
+from pytorch_lightning import Callback
+
+from flash.core.integrations.transformers.states import TransformersBackboneState
+from flash.core.model import Task
+from flash.core.registry import FlashRegistry, print_provider_info
+from flash.core.utilities.imports import _TEXT_AVAILABLE
+from flash.core.utilities.providers import _SENTENCE_TRANSFORMERS
+from flash.text.embedding.backbones import HUGGINGFACE_BACKBONES
+from flash.text.ort_callback import ORTCallback
+
+if _TEXT_AVAILABLE:
+    from sentence_transformers.models import Pooling
+
+    Pooling = print_provider_info("Pooling", _SENTENCE_TRANSFORMERS, Pooling)
+
+logger = logging.getLogger(__name__)
+
+
+class SentenceEmbedder(Task):
+    """The ``SentenceEmbedder`` is a :class:`~flash.Task` for generating sentence embeddings (training and
+    validation are not supported). For more details, see `embeddings`.
+
+    You can change the backbone to any sentence embedding model from `UKPLab/sentence-transformers
+    <https://github.com/UKPLab/sentence-transformers>`_ using the ``backbone``
+    argument.
+
+    Args:
+        backbone: backbone model to use for the task.
+        enable_ort: Enable Torch ONNX Runtime Optimization: https://onnxruntime.ai/docs/#onnx-runtime-for-training
+    """
+
+    required_extras: str = "text"
+
+    backbones: FlashRegistry = HUGGINGFACE_BACKBONES
+
+    def __init__(
+        self,
+        backbone: str = "sentence-transformers/all-MiniLM-L6-v2",
+        tokenizer_backbone: Optional[str] = None,
+        tokenizer_kwargs: Optional[Dict[str, Any]] = None,
+        enable_ort: bool = False,
+    ):
+        os.environ["TOKENIZERS_PARALLELISM"] = "TRUE"
+        # disable HF thousand warnings
+        warnings.simplefilter("ignore")
+        # set os environ variable for multiprocesses
+        os.environ["PYTHONWARNINGS"] = "ignore"
+        super().__init__()
+
+        if tokenizer_backbone is None:
+            tokenizer_backbone = backbone
+        self.set_state(TransformersBackboneState(tokenizer_backbone, tokenizer_kwargs=tokenizer_kwargs))
+        self.model = self.backbones.get(backbone)()
+        self.pooling = Pooling(self.model.config.hidden_size)
+        self.enable_ort = enable_ort
+
+    def training_step(self, batch: Any, batch_idx: int) -> Any:
+        raise NotImplementedError("Training a `SentenceEmbedder` is not supported. Use a different text task instead.")
+
+    def validation_step(self, batch: Any, batch_idx: int) -> Any:
+        raise NotImplementedError(
+            "Validating a `SentenceEmbedder` is not supported. Use a different text task instead."
+        )
+
+    def test_step(self, batch: Any, batch_idx: int) -> Any:
+        raise NotImplementedError("Testing a `SentenceEmbedder` is not supported. Use a different text task instead.")
+
+    def forward(self, batch: Dict[str, torch.Tensor]) -> torch.Tensor:
+        """Adapted from sentence-transformers:
+
+        https://github.com/UKPLab/sentence-transformers/blob/master/sentence_transformers/models/Transformer.py#L45
+        """
+
+        trans_features = {"input_ids": batch["input_ids"], "attention_mask": batch["attention_mask"]}
+        if "token_type_ids" in batch:
+            trans_features["token_type_ids"] = batch["token_type_ids"]
+
+        output_states = self.model(**trans_features, return_dict=False)
+        output_tokens = output_states[0]
+
+        batch.update({"token_embeddings": output_tokens, "attention_mask": batch["attention_mask"]})
+
+        return self.pooling(batch)["sentence_embedding"]
+
+    def configure_callbacks(self) -> List[Callback]:
+        callbacks = super().configure_callbacks() or []
+        if self.enable_ort:
+            callbacks.append(ORTCallback())
+        return callbacks
diff --git a/flash/text/embeddings/__init__.py b/flash/text/embeddings/__init__.py
deleted file mode 100644
index 3993b0a526..0000000000
--- a/flash/text/embeddings/__init__.py
+++ /dev/null
@@ -1,10 +0,0 @@
-from flash.text.embeddings.data import (
-    TextCSVInput,
-    TextDataFrameInput,
-    TextHuggingFaceDatasetInput,
-    TextInput,
-    TextJSONInput,
-    TextListInput,
-    TextParquetInput,
-)
-from flash.text.embeddings.model import SentenceEmbedder
diff --git a/flash/text/embeddings/backbones.py b/flash/text/embeddings/backbones.py
deleted file mode 100644
index 6fbf4036f2..0000000000
--- a/flash/text/embeddings/backbones.py
+++ /dev/null
@@ -1,24 +0,0 @@
-from flash.core.registry import ExternalRegistry, FlashRegistry
-from flash.core.utilities.imports import _TEXT_AVAILABLE
-from flash.core.utilities.providers import _HUGGINGFACE
-
-SENTENCE_TRANSFORMERS_BACKBONE = FlashRegistry("backbones")
-
-if _TEXT_AVAILABLE:
-    from transformers import AutoConfig, AutoModel, AutoTokenizer
-
-    AUTOMODEL_BACKBONE =
ExternalRegistry( - AutoModel.from_pretrained, - "backbones", - _HUGGINGFACE, - ) - AUTOTOKENIZER_BACKBONE = ExternalRegistry( - AutoTokenizer.from_pretrained, - "backbones", - _HUGGINGFACE, - ) - AUTOCONFIG_BACKBONE = ExternalRegistry( - AutoConfig.from_pretrained, - "backbones", - _HUGGINGFACE, - ) diff --git a/flash/text/embeddings/data.py b/flash/text/embeddings/data.py deleted file mode 100644 index 1be078354d..0000000000 --- a/flash/text/embeddings/data.py +++ /dev/null @@ -1,217 +0,0 @@ -# Copyright The PyTorch Lightning team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -from functools import partial -from typing import Any, Callable, Dict, List, Mapping, Optional, Sequence, Tuple, Union - -import torch -from pandas.core.frame import DataFrame - -import flash -from flash.core.data.auto_dataset import AutoDataset -from flash.core.data.io.input import DataKeys, Input, LabelsState -from flash.core.utilities.imports import _TEXT_AVAILABLE, requires - -if _TEXT_AVAILABLE: - from datasets import Dataset, load_dataset - from transformers import AutoTokenizer - - -class TextInput(Input): - @requires("text") - def __init__(self, backbone: str, max_length: int = 128): - super().__init__() - - self.backbone = backbone - self.tokenizer = AutoTokenizer.from_pretrained(backbone, use_fast=True) - self.max_length = max_length - - def _tokenize_fn( - self, - ex: Union[Dict[str, str], str], - input: Optional[str] = None, - ) -> Callable: - """This function is used to tokenize sentences using the provided tokenizer.""" - return self.tokenizer(ex[input], max_length=self.max_length, truncation=True, padding="max_length") - - @staticmethod - def _transform_label(label_to_class_mapping: Dict[str, int], target: str, ex: Dict[str, Union[int, str]]): - ex[target] = label_to_class_mapping[ex[target]] - return ex - - @staticmethod - def _multilabel_target(targets: List[str], element: Dict[str, Any]) -> Dict[str, Any]: - targets = [element.pop(target) for target in targets] - element[DataKeys.TARGET] = targets - return element - - def _to_hf_dataset(self, data) -> Sequence[Mapping[str, Any]]: - """account for flash CI testing context.""" - hf_dataset, *other = self.to_hf_dataset(data) - - if flash._IS_TESTING and not torch.cuda.is_available(): - # NOTE: must subset in this way to return a Dataset - hf_dataset = hf_dataset.select(range(20)) - - return (hf_dataset, *other) - - def load_data( - self, - data: Tuple[str, Union[str, List[str]], Union[str, List[str]]], - dataset: Optional[Any] = None, - ) -> Sequence[Mapping[str, Any]]: - """Loads data into HuggingFace datasets.Dataset.""" - - hf_dataset, input, *other = self._to_hf_dataset(data) - - if not self.predicting: - target: Union[str, List[str]] = other.pop() - if isinstance(target, List): - # multi-target - dataset.multi_label = True - hf_dataset = hf_dataset.map(partial(self._multilabel_target, target)) # NOTE: renames target column - dataset.num_classes = len(target) - self.set_state(LabelsState(target)) - else: - dataset.multi_label = False - if 
self.training: - labels = list(sorted(list(set(hf_dataset[target])))) - dataset.num_classes = len(labels) - self.set_state(LabelsState(labels)) - - labels = self.get_state(LabelsState) - - # convert labels to ids (note: the target column get overwritten) - if labels is not None: - labels = labels.labels - label_to_class_mapping = {v: k for k, v in enumerate(labels)} - hf_dataset = hf_dataset.map(partial(self._transform_label, label_to_class_mapping, target)) - - # rename label column - hf_dataset = hf_dataset.rename_column(target, DataKeys.TARGET) - - # remove extra columns - extra_columns = set(hf_dataset.column_names) - {input, DataKeys.TARGET} - hf_dataset = hf_dataset.remove_columns(extra_columns) - - # tokenize - hf_dataset = hf_dataset.map(partial(self._tokenize_fn, input=input), batched=True, remove_columns=[input]) - - # set format - hf_dataset.set_format("torch") - - return hf_dataset - - def predict_load_data(self, data: Any, dataset: AutoDataset): - return self.load_data(data, dataset) - - def __getstate__(self): # TODO: Find out why this is being pickled - state = self.__dict__.copy() - state.pop("tokenizer") - return state - - def __setstate__(self, state): - self.__dict__.update(state) - self.tokenizer = AutoTokenizer.from_pretrained(self.backbone, use_fast=True) - - -class TextCSVInput(TextInput): - def to_hf_dataset(self, data: Tuple[str, str, str]) -> Tuple[Sequence[Mapping[str, Any]], str, str]: - file, *other = data - dataset_dict = load_dataset("csv", data_files={"train": str(file)}) - return (dataset_dict["train"], *other) - - -class TextJSONInput(TextInput): - def to_hf_dataset(self, data: Tuple[str, str, str, str]) -> Tuple[Sequence[Mapping[str, Any]], str, str]: - file, *other, field = data - dataset_dict = load_dataset("json", data_files={"train": str(file)}, field=field) - return (dataset_dict["train"], *other) - - -class TextDataFrameInput(TextInput): - def to_hf_dataset(self, data: Tuple[DataFrame, str, str]) -> Tuple[Sequence[Mapping[str, Any]], str, str]: - df, *other = data - hf_dataset = Dataset.from_pandas(df) - return (hf_dataset, *other) - - -class TextParquetInput(TextInput): - def to_hf_dataset(self, data: Tuple[str, str, str]) -> Tuple[Sequence[Mapping[str, Any]], str, str]: - file, *other = data - hf_dataset = Dataset.from_parquet(str(file)) - return (hf_dataset, *other) - - -class TextHuggingFaceDatasetInput(TextInput): - def to_hf_dataset(self, data: Tuple[str, str, str]) -> Tuple[Sequence[Mapping[str, Any]], str, str]: - hf_dataset, *other = data - return (hf_dataset, *other) - - -class TextListInput(TextInput): - def to_hf_dataset( - self, data: Union[Tuple[List[str], List[str]], List[str]] - ) -> Tuple[Sequence[Mapping[str, Any]], Optional[List[str]]]: - - if isinstance(data, tuple): - input_list, target_list = data - # NOTE: here we already deal with multilabels - # NOTE: here we already rename to correct column names - hf_dataset = Dataset.from_dict({DataKeys.INPUT: input_list, DataKeys.TARGET: target_list}) - return hf_dataset, target_list - - # predicting - hf_dataset = Dataset.from_dict({DataKeys.INPUT: data}) - - return (hf_dataset,) - - def load_data( - self, - data: Tuple[List[str], Union[List[Any], List[List[Any]]]], - dataset: Optional[Any] = None, - ) -> Sequence[Mapping[str, Any]]: - - hf_dataset, *other = self._to_hf_dataset(data) - - if not self.predicting: - target_list = other.pop() - if isinstance(target_list[0], List): - # multi-target_list - dataset.multi_label = True - dataset.num_classes = len(target_list[0]) - 
self.set_state(LabelsState(target_list)) - else: - dataset.multi_label = False - if self.training: - labels = list(sorted(list(set(hf_dataset[DataKeys.TARGET])))) - dataset.num_classes = len(labels) - self.set_state(LabelsState(labels)) - - labels = self.get_state(LabelsState) - - # convert labels to ids - if labels is not None: - labels = labels.labels - label_to_class_mapping = {v: k for k, v in enumerate(labels)} - # happens in-place and keeps the target column name - hf_dataset = hf_dataset.map(partial(self._transform_label, label_to_class_mapping, DataKeys.TARGET)) - - # tokenize - hf_dataset = hf_dataset.map(partial(self._tokenize_fn, input=DataKeys.INPUT), batched=True) - - # set format - hf_dataset = hf_dataset.remove_columns([DataKeys.INPUT]) # just leave the numerical columns - hf_dataset.set_format("torch") - - return hf_dataset diff --git a/flash/text/embeddings/model.py b/flash/text/embeddings/model.py deleted file mode 100644 index f1b2165fae..0000000000 --- a/flash/text/embeddings/model.py +++ /dev/null @@ -1,211 +0,0 @@ -# Copyright The PyTorch Lightning team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -import logging -import os -import warnings -from typing import Any, Callable, Dict, List, Mapping, Optional, Sequence, Tuple, Type, Union - -import numpy as np -import torch -import torchmetrics -from pytorch_lightning import Callback -from pytorch_lightning.utilities import rank_zero_info -from torch import nn, Tensor -from torch.optim.lr_scheduler import _LRScheduler -from torchmetrics import Metric -from tqdm.autonotebook import trange - -from flash.core.data.data_source import DefaultDataKeys -from flash.core.data.process import Serializer -from flash.core.finetuning import FlashBaseFinetuning -from flash.core.model import Task -from flash.core.registry import FlashRegistry -from flash.text.embeddings.backbones import AUTOCONFIG_BACKBONE, AUTOMODEL_BACKBONE, AUTOTOKENIZER_BACKBONE - -logger = logging.getLogger(__name__) - - -class SentenceEmbedder(Task): - """The ``SentenceEmbedder`` is a :class:`~flash.Task` for generating sentence embeddings, training and - validation. For more details, see `embeddings`. - - You can change the backbone to any question answering model from `UKPLab/sentence-transformers - `_ using the ``backbone`` - argument. - - .. note:: When changing the backbone, make sure you pass in the same backbone to the :class:`~flash.Task` and the - :class:`~flash.core.data.data_module.DataModule` object! Since this is a Sentence Transformers task, make sure you - use a Sentence Transformers model. - - Args: - backbone: backbone model to use for the task. - loss_fn: Loss function for training. - optimizer: Optimizer to use for training, defaults to `torch.optim.Adam`. - optimizer_kwargs: Additional kwargs to use when creating the optimizer (if not passed as an instance). - scheduler: The scheduler or scheduler class to use. - scheduler_kwargs: Additional kwargs to use when creating the scheduler (if not passed as an instance). 
- metrics: Metrics to compute for training and evaluation. Defauls to calculating the ROUGE metric. - Changing this argument currently has no effect. - learning_rate: Learning rate to use for training, defaults to `3e-4` - enable_ort: Enable Torch ONNX Runtime Optimization: https://onnxruntime.ai/docs/#onnx-runtime-for-training - """ - - required_extras: str = "text" - - AutoModel_Backbones: FlashRegistry = AUTOMODEL_BACKBONE - AutoTokenizer_Backbones: FlashRegistry = AUTOTOKENIZER_BACKBONE - AutoConfig_Backbones: FlashRegistry = AUTOCONFIG_BACKBONE - - def __init__( - self, - model_backbone: str = "sentence-transformers/all-MiniLM-L6-v2", - tokenizer_backbone: str = "sentence-transformers/all-MiniLM-L6-v2", - enable_ort: bool = False, - ): - os.environ["TOKENIZERS_PARALLELISM"] = "TRUE" - # disable HF thousand warnings - warnings.simplefilter("ignore") - # set os environ variable for multiprocesses - os.environ["PYTHONWARNINGS"] = "ignore" - super().__init__() - - self.config = self.AutoConfig_Backbones.get(model_backbone) - self.auto_model = self.AutoModel_Backbones.get(model_backbone) - self.tokenzier = self.AutoTokenizer_Backbones.get(tokenizer_backbone) - - if tokenizer_backbone is not None: - self.auto_model.config.tokenizer_class = self.tokenizer.__class__.__name__ - - def predict_step( - self, - sentences: Union[str, List[str]], - batch_size: int = 32, - show_progress_bar: bool = None, - output_value: str = "sentence_embedding", - convert_to_numpy: bool = True, - convert_to_tensor: bool = False, - device: str = None, - normalize_embeddings: bool = False, - ) -> Union[List[Tensor], np.ndarray, Tensor]: - """Computes sentence embeddings. - - :param sentences: the sentences to embed - :param batch_size: the batch size used for the computation - :param show_progress_bar: Output a progress bar when encode sentences - :param output_value: Default sentence_embedding, to get sentence embeddings. Can be set to token_embeddings to get wordpiece token embeddings. Set to None, to get all output values - :param convert_to_numpy: If true, the output is a list of numpy vectors. Else, it is a list of pytorch tensors. - :param convert_to_tensor: If true, you get one large tensor as return. Overwrites any setting from convert_to_numpy - :param device: Which torch.device to use for the computation - :param normalize_embeddings: If set to true, returned vectors will have length 1. In that case, the faster dot-product (util.dot_score) instead of cosine similarity can be used. - :return: - By default, a list of tensors is returned. If convert_to_tensor, a stacked tensor is returned. If convert_to_numpy, a numpy matrix is returned. 
- """ - self.eval() - if show_progress_bar is None: - show_progress_bar = ( - logger.getEffectiveLevel() == logging.INFO or logger.getEffectiveLevel() == logging.DEBUG - ) - - if convert_to_tensor: - convert_to_numpy = False - - if output_value != "sentence_embedding": - convert_to_tensor = False - convert_to_numpy = False - - input_was_string = False - if isinstance(sentences, str) or not hasattr( - sentences, "__len__" - ): # Cast an individual sentence to a list with length 1 - sentences = [sentences] - input_was_string = True - - if device is None: - device = self._target_device - - self.to(device) - - all_embeddings = [] - length_sorted_idx = np.argsort([-self._text_length(sen) for sen in sentences]) - sentences_sorted = [sentences[idx] for idx in length_sorted_idx] - - for start_index in trange(0, len(sentences), batch_size, desc="Batches", disable=not show_progress_bar): - sentences_batch = sentences_sorted[start_index : start_index + batch_size] - features = self.tokenize(sentences_batch) - - with torch.no_grad(): - out_features = self.forward(features) - - if output_value == "token_embeddings": - embeddings = [] - for token_emb, attention in zip(out_features[output_value], out_features["attention_mask"]): - last_mask_id = len(attention) - 1 - while last_mask_id > 0 and attention[last_mask_id].item() == 0: - last_mask_id -= 1 - - embeddings.append(token_emb[0 : last_mask_id + 1]) - elif output_value is None: # Return all outputs - embeddings = [] - for sent_idx in range(len(out_features["sentence_embedding"])): - row = {name: out_features[name][sent_idx] for name in out_features} - embeddings.append(row) - else: # Sentence embeddings - embeddings = out_features[output_value] - embeddings = embeddings.detach() - if normalize_embeddings: - embeddings = torch.nn.functional.normalize(embeddings, p=2, dim=1) - - # fixes for #522 and #487 to avoid oom problems on gpu with large datasets - if convert_to_numpy: - embeddings = embeddings.cpu() - - all_embeddings.extend(embeddings) - - all_embeddings = [all_embeddings[idx] for idx in np.argsort(length_sorted_idx)] - - if convert_to_tensor: - all_embeddings = torch.stack(all_embeddings) - elif convert_to_numpy: - all_embeddings = np.asarray([emb.numpy() for emb in all_embeddings]) - - if input_was_string: - all_embeddings = all_embeddings[0] - - return all_embeddings - - @property - def backbone(self): - return self.model.base_model - - def forward(self, batch: Dict[str, torch.Tensor]) -> torch.Tensor: - """First call the backbone, then the model head.""" - - trans_features = {"input_ids": batch["input_ids"], "attention_mask": batch["attention_mask"]} - if "token_type_ids" in batch: - trans_features["token_type_ids"] = batch["token_type_ids"] - - output_states = self.auto_model(**trans_features, return_dict=False) - output_tokens = output_states[0] - - batch.update({"token_embeddings": output_tokens, "attention_mask": batch["attention_mask"]}) - - if self.auto_model.config.output_hidden_states: - all_layer_idx = 2 - if len(output_states) < 3: # Some models only output last_hidden_states and all_hidden_states - all_layer_idx = 1 - - hidden_states = output_states[all_layer_idx] - batch.update({"all_layer_embeddings": hidden_states}) - - return batch diff --git a/flash_examples/text_embedder.py b/flash_examples/text_embedder.py new file mode 100644 index 0000000000..10744c71e5 --- /dev/null +++ b/flash_examples/text_embedder.py @@ -0,0 +1,37 @@ +# Copyright The PyTorch Lightning team. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import torch + +import flash +from flash.core.data.utils import download_data +from flash.text import SentenceEmbedder, TextClassificationData + +# 1. Create the DataModule +download_data("https://pl-flash-data.s3.amazonaws.com/xsum.zip", "./data/") + +datamodule = TextClassificationData.from_lists( + predict_data=[ + "Turgid dialogue, feeble characterization - Harvey Keitel a judge?.", + "The worst movie in the history of cinema.", + "I come from Bulgaria where it 's almost impossible to have a tornado.", + ] +) + +# 2. Load a previously trained SentenceEmbedder +model = SentenceEmbedder(backbone="sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2") + +# 3. Generate embeddings for the first 3 sentences +trainer = flash.Trainer(gpus=torch.cuda.device_count()) +predictions = trainer.predict(model, datamodule=datamodule) +print(predictions) diff --git a/requirements/datatype_text.txt b/requirements/datatype_text.txt index aba24a7ef5..4173bcbd78 100644 --- a/requirements/datatype_text.txt +++ b/requirements/datatype_text.txt @@ -3,3 +3,4 @@ filelock transformers>=4.5 torchmetrics[text]>=0.5.1 datasets>=1.8,<1.13 +sentence-transformers From 2071c9f0f6bcc1e68c242ab04107f291cb92d22b Mon Sep 17 00:00:00 2001 From: Abhijith Neil Abraham Date: Thu, 9 Dec 2021 00:32:34 +0530 Subject: [PATCH 09/23] Create test_model.py --- tests/text/embedding/test_model.py | 47 ++++++++++++++++++++++++++++++ 1 file changed, 47 insertions(+) create mode 100644 tests/text/embedding/test_model.py diff --git a/tests/text/embedding/test_model.py b/tests/text/embedding/test_model.py new file mode 100644 index 0000000000..730d89deaa --- /dev/null +++ b/tests/text/embedding/test_model.py @@ -0,0 +1,47 @@ + +# Copyright The PyTorch Lightning team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+import os + +import pytest +import torch + + +import flash +from flash.text import SentenceEmbedder, TextClassificationData +from tests.helpers.utils import _TEXT_TESTING + +# ======== Mock functions ======== + +datamodule = TextClassificationData.from_lists( + predict_data=[ + "Turgid dialogue, feeble characterization - Harvey Keitel a judge?.", + "The worst movie in the history of cinema.", + "I come from Bulgaria where it 's almost impossible to have a tornado.", + ] + ) + +# ============================== + +TEST_BACKBONE = "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2" # super small model for testing +model = SentenceEmbedder(backbone=TEST_BACKBONE) + +@pytest.mark.skipif(os.name == "nt", reason="Huggingface timing out on Windows") +@pytest.mark.skipif(not _TEXT_TESTING, reason="text libraries aren't installed.") +def test_predict(tmpdir): + trainer = flash.Trainer(gpus=torch.cuda.device_count()) + predictions = trainer.predict(model, datamodule=datamodule) + assert[t.size() for t in predictions[0]]==[torch.Size([384]), torch.Size([384]), torch.Size([384])] + + From 5477415483de6b4014e7ce9dc84c9720f159af70 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Wed, 8 Dec 2021 19:03:24 +0000 Subject: [PATCH 10/23] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- tests/text/embedding/test_model.py | 21 +++++++++------------ 1 file changed, 9 insertions(+), 12 deletions(-) diff --git a/tests/text/embedding/test_model.py b/tests/text/embedding/test_model.py index 730d89deaa..b3506bc521 100644 --- a/tests/text/embedding/test_model.py +++ b/tests/text/embedding/test_model.py @@ -1,4 +1,3 @@ - # Copyright The PyTorch Lightning team. 
# # Licensed under the Apache License, Version 2.0 (the "License"); @@ -17,31 +16,29 @@ import pytest import torch - import flash from flash.text import SentenceEmbedder, TextClassificationData -from tests.helpers.utils import _TEXT_TESTING +from tests.helpers.utils import _TEXT_TESTING # ======== Mock functions ======== datamodule = TextClassificationData.from_lists( - predict_data=[ - "Turgid dialogue, feeble characterization - Harvey Keitel a judge?.", - "The worst movie in the history of cinema.", - "I come from Bulgaria where it 's almost impossible to have a tornado.", - ] - ) + predict_data=[ + "Turgid dialogue, feeble characterization - Harvey Keitel a judge?.", + "The worst movie in the history of cinema.", + "I come from Bulgaria where it 's almost impossible to have a tornado.", + ] +) # ============================== TEST_BACKBONE = "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2" # super small model for testing model = SentenceEmbedder(backbone=TEST_BACKBONE) + @pytest.mark.skipif(os.name == "nt", reason="Huggingface timing out on Windows") @pytest.mark.skipif(not _TEXT_TESTING, reason="text libraries aren't installed.") def test_predict(tmpdir): trainer = flash.Trainer(gpus=torch.cuda.device_count()) predictions = trainer.predict(model, datamodule=datamodule) - assert[t.size() for t in predictions[0]]==[torch.Size([384]), torch.Size([384]), torch.Size([384])] - - + assert [t.size() for t in predictions[0]] == [torch.Size([384]), torch.Size([384]), torch.Size([384])] From 8db110d7797f8692dd3415b7dcd2214f8df8cb46 Mon Sep 17 00:00:00 2001 From: Abhijith Neil Abraham Date: Thu, 9 Dec 2021 00:47:27 +0530 Subject: [PATCH 11/23] __init__ for embedding --- tests/text/embedding/__init__.py | 0 tests/text/embedding/test_model.py | 2 +- 2 files changed, 1 insertion(+), 1 deletion(-) create mode 100644 tests/text/embedding/__init__.py diff --git a/tests/text/embedding/__init__.py b/tests/text/embedding/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/tests/text/embedding/test_model.py b/tests/text/embedding/test_model.py index b3506bc521..806b2a0447 100644 --- a/tests/text/embedding/test_model.py +++ b/tests/text/embedding/test_model.py @@ -20,7 +20,7 @@ from flash.text import SentenceEmbedder, TextClassificationData from tests.helpers.utils import _TEXT_TESTING -# ======== Mock functions ======== +# ======== Mock data ======== datamodule = TextClassificationData.from_lists( predict_data=[ From a6bfc9f3431b8cc9305ce24bb591fa84f3efc352 Mon Sep 17 00:00:00 2001 From: Abhijith Neil Abraham Date: Thu, 9 Dec 2021 01:29:08 +0530 Subject: [PATCH 12/23] remove download_data() --- flash_examples/text_embedder.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/flash_examples/text_embedder.py b/flash_examples/text_embedder.py index 10744c71e5..9aa0eabbdc 100644 --- a/flash_examples/text_embedder.py +++ b/flash_examples/text_embedder.py @@ -18,8 +18,6 @@ from flash.text import SentenceEmbedder, TextClassificationData # 1. 
Create the DataModule -download_data("https://pl-flash-data.s3.amazonaws.com/xsum.zip", "./data/") - datamodule = TextClassificationData.from_lists( predict_data=[ "Turgid dialogue, feeble characterization - Harvey Keitel a judge?.", From 21305d6a9622a85acae9041b1acd6bb2263d3fc9 Mon Sep 17 00:00:00 2001 From: Abhijith Neil Abraham Date: Thu, 9 Dec 2021 19:18:35 +0530 Subject: [PATCH 13/23] lower size model for text embedder examples and test --- flash_examples/text_embedder.py | 3 ++- tests/text/embedding/test_model.py | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/flash_examples/text_embedder.py b/flash_examples/text_embedder.py index 9aa0eabbdc..b6e99c0742 100644 --- a/flash_examples/text_embedder.py +++ b/flash_examples/text_embedder.py @@ -27,9 +27,10 @@ ) # 2. Load a previously trained SentenceEmbedder -model = SentenceEmbedder(backbone="sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2") +model = SentenceEmbedder(backbone="sentence-transformers/all-MiniLM-L6-v2") # 3. Generate embeddings for the first 3 sentences trainer = flash.Trainer(gpus=torch.cuda.device_count()) predictions = trainer.predict(model, datamodule=datamodule) print(predictions) + diff --git a/tests/text/embedding/test_model.py b/tests/text/embedding/test_model.py index 806b2a0447..c1c433cf03 100644 --- a/tests/text/embedding/test_model.py +++ b/tests/text/embedding/test_model.py @@ -32,7 +32,7 @@ # ============================== -TEST_BACKBONE = "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2" # super small model for testing +TEST_BACKBONE = "sentence-transformers/all-MiniLM-L6-v2" # super small model for testing model = SentenceEmbedder(backbone=TEST_BACKBONE) From 5d1b4c66ef42b04a0ea129546f6715f36a7853f4 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Thu, 9 Dec 2021 13:49:18 +0000 Subject: [PATCH 14/23] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- flash_examples/text_embedder.py | 1 - 1 file changed, 1 deletion(-) diff --git a/flash_examples/text_embedder.py b/flash_examples/text_embedder.py index b6e99c0742..1a19bd646c 100644 --- a/flash_examples/text_embedder.py +++ b/flash_examples/text_embedder.py @@ -33,4 +33,3 @@ trainer = flash.Trainer(gpus=torch.cuda.device_count()) predictions = trainer.predict(model, datamodule=datamodule) print(predictions) - From 9570522e89e415851cbd32e2207f8ad1c25597b1 Mon Sep 17 00:00:00 2001 From: Abhijith Neil Abraham Date: Thu, 9 Dec 2021 19:28:00 +0530 Subject: [PATCH 15/23] text embedder example entry to CI --- requirements.txt | 1 - tests/examples/test_scripts.py | 4 ++++ 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index 333125ff76..0920edcb4c 100644 --- a/requirements.txt +++ b/requirements.txt @@ -7,4 +7,3 @@ pyDeprecate pandas>=1.1.0 jsonargparse[signatures]>=3.17.0 click>=7.1.2 -tqdm diff --git a/tests/examples/test_scripts.py b/tests/examples/test_scripts.py index 033ee35b3d..6ee35fef73 100644 --- a/tests/examples/test_scripts.py +++ b/tests/examples/test_scripts.py @@ -80,6 +80,10 @@ "text_classification.py", marks=pytest.mark.skipif(not _TEXT_TESTING, reason="text libraries aren't installed"), ), + pytest.param( + "text_embedder.py", + marks=pytest.mark.skipif(not _TEXT_TESTING, reason="text libraries aren't installed"), + ), # pytest.param( # "text_classification_multi_label.py", # marks=pytest.mark.skipif(not _TEXT_TESTING, reason="text libraries 
aren't installed") From bb98d77986286d68c90a917885a0e1e3311796b6 Mon Sep 17 00:00:00 2001 From: Abhijith Neil Abraham Date: Thu, 9 Dec 2021 19:35:00 +0530 Subject: [PATCH 16/23] change `SentenceEmbedder` to `TextEmbedder` --- flash/text/__init__.py | 2 +- flash/text/embedding/__init__.py | 2 +- flash/text/embedding/model.py | 10 +++++----- flash_examples/text_embedder.py | 6 +++--- tests/text/embedding/test_model.py | 4 ++-- 5 files changed, 12 insertions(+), 12 deletions(-) diff --git a/flash/text/__init__.py b/flash/text/__init__.py index bf87427c9f..63400dcd9c 100644 --- a/flash/text/__init__.py +++ b/flash/text/__init__.py @@ -1,5 +1,5 @@ from flash.text.classification import TextClassificationData, TextClassifier # noqa: F401 -from flash.text.embedding import SentenceEmbedder # noqa: F401 +from flash.text.embedding import TextEmbedder # noqa: F401 from flash.text.question_answering import QuestionAnsweringData, QuestionAnsweringTask # noqa: F401 from flash.text.seq2seq import ( # noqa: F401 Seq2SeqData, diff --git a/flash/text/embedding/__init__.py b/flash/text/embedding/__init__.py index 42bc49dd26..ed171439f7 100644 --- a/flash/text/embedding/__init__.py +++ b/flash/text/embedding/__init__.py @@ -1 +1 @@ -from flash.text.embedding.model import SentenceEmbedder # noqa: F401 +from flash.text.embedding.model import TextEmbedder # noqa: F401 diff --git a/flash/text/embedding/model.py b/flash/text/embedding/model.py index c37a98d8b0..fdec7b5f77 100644 --- a/flash/text/embedding/model.py +++ b/flash/text/embedding/model.py @@ -35,8 +35,8 @@ logger = logging.getLogger(__name__) -class SentenceEmbedder(Task): - """The ``SentenceEmbedder`` is a :class:`~flash.Task` for generating sentence embeddings, training and +class TextEmbedder(Task): + """The ``TextEmbedder`` is a :class:`~flash.Task` for generating sentence embeddings, training and validation. For more details, see `embeddings`. You can change the backbone to any question answering model from `UKPLab/sentence-transformers @@ -74,15 +74,15 @@ def __init__( self.enable_ort = enable_ort def training_step(self, batch: Any, batch_idx: int) -> Any: - raise NotImplementedError("Training a `SentenceEmbedder` is not supported. Use a different text task instead.") + raise NotImplementedError("Training a `TextEmbedder` is not supported. Use a different text task instead.") def validation_step(self, batch: Any, batch_idx: int) -> Any: raise NotImplementedError( - "Validating a `SentenceEmbedder` is not supported. Use a different text task instead." + "Validating a `TextEmbedder` is not supported. Use a different text task instead." ) def test_step(self, batch: Any, batch_idx: int) -> Any: - raise NotImplementedError("Testing a `SentenceEmbedder` is not supported. Use a different text task instead.") + raise NotImplementedError("Testing a `TextEmbedder` is not supported. Use a different text task instead.") def forward(self, batch: Dict[str, torch.Tensor]) -> torch.Tensor: """Adapted from sentence-transformers: diff --git a/flash_examples/text_embedder.py b/flash_examples/text_embedder.py index 1a19bd646c..8b69e2f262 100644 --- a/flash_examples/text_embedder.py +++ b/flash_examples/text_embedder.py @@ -15,7 +15,7 @@ import flash from flash.core.data.utils import download_data -from flash.text import SentenceEmbedder, TextClassificationData +from flash.text import TextEmbedder, TextClassificationData # 1. Create the DataModule datamodule = TextClassificationData.from_lists( @@ -26,8 +26,8 @@ ] ) -# 2. 
Load a previously trained SentenceEmbedder -model = SentenceEmbedder(backbone="sentence-transformers/all-MiniLM-L6-v2") +# 2. Load a previously trained TextEmbedder +model = TextEmbedder(backbone="sentence-transformers/all-MiniLM-L6-v2") # 3. Generate embeddings for the first 3 sentences trainer = flash.Trainer(gpus=torch.cuda.device_count()) diff --git a/tests/text/embedding/test_model.py b/tests/text/embedding/test_model.py index c1c433cf03..ad09fb9544 100644 --- a/tests/text/embedding/test_model.py +++ b/tests/text/embedding/test_model.py @@ -17,7 +17,7 @@ import torch import flash -from flash.text import SentenceEmbedder, TextClassificationData +from flash.text import TextEmbedder, TextClassificationData from tests.helpers.utils import _TEXT_TESTING # ======== Mock data ======== @@ -33,7 +33,7 @@ # ============================== TEST_BACKBONE = "sentence-transformers/all-MiniLM-L6-v2" # super small model for testing -model = SentenceEmbedder(backbone=TEST_BACKBONE) +model = TextEmbedder(backbone=TEST_BACKBONE) From 923e6ec513884d825ba85ecacbf32fa699630b26 Mon Sep 17 00:00:00 2001 From: Abhijith Neil Abraham Date: Thu, 9 Dec 2021 19:36:01 +0530 Subject: [PATCH 17/23] remove `download_data` import --- flash_examples/text_embedder.py | 1 - 1 file changed, 1 deletion(-) diff --git a/flash_examples/text_embedder.py b/flash_examples/text_embedder.py index 8b69e2f262..5eeae9a0f0 100644 --- a/flash_examples/text_embedder.py +++ b/flash_examples/text_embedder.py @@ -14,7 +14,6 @@ import torch import flash -from flash.core.data.utils import download_data from flash.text import TextEmbedder, TextClassificationData # 1. Create the DataModule From 8c90286be68e0394de847398818788031e53a9ac Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Thu, 9 Dec 2021 14:07:10 +0000 Subject: [PATCH 18/23] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- flash/text/embedding/model.py | 8 +++----- flash_examples/text_embedder.py | 2 +- tests/text/embedding/test_model.py | 2 +- 3 files changed, 5 insertions(+), 7 deletions(-) diff --git a/flash/text/embedding/model.py b/flash/text/embedding/model.py index fdec7b5f77..2fae923403 100644 --- a/flash/text/embedding/model.py +++ b/flash/text/embedding/model.py @@ -36,8 +36,8 @@ class TextEmbedder(Task): - """The ``TextEmbedder`` is a :class:`~flash.Task` for generating sentence embeddings, training and - validation. For more details, see `embeddings`. + """The ``TextEmbedder`` is a :class:`~flash.Task` for generating sentence embeddings, training and validation. + For more details, see `embeddings`. You can change the backbone to any sentence embedding model from `UKPLab/sentence-transformers `_ using the ``backbone`` @@ -74,9 +74,7 @@ def training_step(self, batch: Any, batch_idx: int) -> Any: raise NotImplementedError("Training a `TextEmbedder` is not supported. Use a different text task instead.") def validation_step(self, batch: Any, batch_idx: int) -> Any: - raise NotImplementedError( - "Validating a `TextEmbedder` is not supported. Use a different text task instead." - ) + raise NotImplementedError("Validating a `TextEmbedder` is not supported. Use a different text task instead.") def test_step(self, batch: Any, batch_idx: int) -> Any: raise NotImplementedError("Testing a `TextEmbedder` is not supported. 
Use a different text task instead.") diff --git a/flash_examples/text_embedder.py b/flash_examples/text_embedder.py index 5eeae9a0f0..f613f0def8 100644 --- a/flash_examples/text_embedder.py +++ b/flash_examples/text_embedder.py @@ -14,7 +14,7 @@ import torch import flash -from flash.text import TextEmbedder, TextClassificationData +from flash.text import TextClassificationData, TextEmbedder # 1. Create the DataModule datamodule = TextClassificationData.from_lists( diff --git a/tests/text/embedding/test_model.py b/tests/text/embedding/test_model.py index ad09fb9544..64f4404e0b 100644 --- a/tests/text/embedding/test_model.py +++ b/tests/text/embedding/test_model.py @@ -17,7 +17,7 @@ import torch import flash -from flash.text import TextEmbedder, TextClassificationData +from flash.text import TextClassificationData, TextEmbedder from tests.helpers.utils import _TEXT_TESTING # ======== Mock data ======== From 20233f2907f22918732184e25869bac5338ca714 Mon Sep 17 00:00:00 2001 From: Abhijith Neil Abraham Date: Thu, 9 Dec 2021 19:46:42 +0530 Subject: [PATCH 19/23] fix bug - test_model.py --- tests/text/embedding/test_model.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/tests/text/embedding/test_model.py b/tests/text/embedding/test_model.py index ad09fb9544..f293eb184b 100644 --- a/tests/text/embedding/test_model.py +++ b/tests/text/embedding/test_model.py @@ -22,14 +22,11 @@ # ======== Mock data ======== -datamodule = TextClassificationData.from_lists( - predict_data=[ - "Turgid dialogue, feeble characterization - Harvey Keitel a judge?.", - "The worst movie in the history of cinema.", - "I come from Bulgaria where it 's almost impossible to have a tornado.", - ] -) - +predict_data=[ + "Turgid dialogue, feeble characterization - Harvey Keitel a judge?.", + "The worst movie in the history of cinema.", + "I come from Bulgaria where it 's almost impossible to have a tornado.", + ] # ============================== TEST_BACKBONE = "sentence-transformers/all-MiniLM-L6-v2" # super small model for testing @@ -39,6 +36,9 @@ @pytest.mark.skipif(os.name == "nt", reason="Huggingface timing out on Windows") @pytest.mark.skipif(not _TEXT_TESTING, reason="text libraries aren't installed.") def test_predict(tmpdir): + datamodule = TextClassificationData.from_lists( + predict_data=predict_data + ) trainer = flash.Trainer(gpus=torch.cuda.device_count()) predictions = trainer.predict(model, datamodule=datamodule) assert [t.size() for t in predictions[0]] == [torch.Size([384]), torch.Size([384]), torch.Size([384])] From 57aa577d258f8afb76b85ba01b9682fa2850fe4c Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Thu, 9 Dec 2021 14:17:27 +0000 Subject: [PATCH 20/23] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- tests/text/embedding/test_model.py | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) diff --git a/tests/text/embedding/test_model.py b/tests/text/embedding/test_model.py index 468e5c59f8..389f384a12 100644 --- a/tests/text/embedding/test_model.py +++ b/tests/text/embedding/test_model.py @@ -22,11 +22,11 @@ # ======== Mock data ======== -predict_data=[ - "Turgid dialogue, feeble characterization - Harvey Keitel a judge?.", - "The worst movie in the history of cinema.", - "I come from Bulgaria where it 's almost impossible to have a tornado.", - ] +predict_data = [ + "Turgid dialogue, feeble characterization - Harvey Keitel a judge?.", + 
"The worst movie in the history of cinema.", + "I come from Bulgaria where it 's almost impossible to have a tornado.", +] # ============================== TEST_BACKBONE = "sentence-transformers/all-MiniLM-L6-v2" # super small model for testing @@ -36,9 +36,7 @@ @pytest.mark.skipif(os.name == "nt", reason="Huggingface timing out on Windows") @pytest.mark.skipif(not _TEXT_TESTING, reason="text libraries aren't installed.") def test_predict(tmpdir): - datamodule = TextClassificationData.from_lists( - predict_data=predict_data - ) + datamodule = TextClassificationData.from_lists(predict_data=predict_data) trainer = flash.Trainer(gpus=torch.cuda.device_count()) predictions = trainer.predict(model, datamodule=datamodule) assert [t.size() for t in predictions[0]] == [torch.Size([384]), torch.Size([384]), torch.Size([384])] From fdbb2de77112ec40858fc66e68c0ca7c6063923e Mon Sep 17 00:00:00 2001 From: Abhijith Neil Abraham Date: Thu, 9 Dec 2021 19:54:15 +0530 Subject: [PATCH 21/23] Update test_model.py --- tests/text/embedding/test_model.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tests/text/embedding/test_model.py b/tests/text/embedding/test_model.py index 468e5c59f8..758057c5a6 100644 --- a/tests/text/embedding/test_model.py +++ b/tests/text/embedding/test_model.py @@ -30,7 +30,6 @@ # ============================== TEST_BACKBONE = "sentence-transformers/all-MiniLM-L6-v2" # super small model for testing -model = TextEmbedder(backbone=TEST_BACKBONE) @pytest.mark.skipif(os.name == "nt", reason="Huggingface timing out on Windows") @@ -39,6 +38,8 @@ def test_predict(tmpdir): datamodule = TextClassificationData.from_lists( predict_data=predict_data ) + model = TextEmbedder(backbone=TEST_BACKBONE) + trainer = flash.Trainer(gpus=torch.cuda.device_count()) predictions = trainer.predict(model, datamodule=datamodule) assert [t.size() for t in predictions[0]] == [torch.Size([384]), torch.Size([384]), torch.Size([384])] From 14a5e27ee12cc7101be4b39525f4297224677bd9 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Thu, 9 Dec 2021 14:26:00 +0000 Subject: [PATCH 22/23] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- tests/text/embedding/test_model.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/tests/text/embedding/test_model.py b/tests/text/embedding/test_model.py index 827cbb2fe9..0a712b3b3a 100644 --- a/tests/text/embedding/test_model.py +++ b/tests/text/embedding/test_model.py @@ -35,9 +35,7 @@ @pytest.mark.skipif(os.name == "nt", reason="Huggingface timing out on Windows") @pytest.mark.skipif(not _TEXT_TESTING, reason="text libraries aren't installed.") def test_predict(tmpdir): - datamodule = TextClassificationData.from_lists( - predict_data=predict_data - ) + datamodule = TextClassificationData.from_lists(predict_data=predict_data) model = TextEmbedder(backbone=TEST_BACKBONE) trainer = flash.Trainer(gpus=torch.cuda.device_count()) From 3d14659926e4afb9e09aceba86b6dcb3a4205892 Mon Sep 17 00:00:00 2001 From: Abhijith Neil Abraham Date: Thu, 9 Dec 2021 20:19:50 +0530 Subject: [PATCH 23/23] Update CHANGELOG.md --- CHANGELOG.md | 1 + 1 file changed, 1 insertion(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 94aff3f61d..89d200eeff 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,7 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). 
## [Unreleased] - YYYY-MM-DD ### Added +- Added `TextEmbedder` task ([#996](https://github.com/PyTorchLightning/lightning-flash/pull/996)) +- Added `predict_kwargs` in `ObjectDetector`, `InstanceSegmentation`, `KeypointDetector` ([#990](https://github.com/PyTorchLightning/lightning-flash/pull/990))
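For quick reference, the end-to-end usage this series converges on (after the `SentenceEmbedder` to `TextEmbedder` rename in PATCH 16 and the later cleanups) is sketched below. This is a minimal sketch assembled from flash_examples/text_embedder.py and tests/text/embedding/test_model.py as they stand at the end of the series; the 384-dimensional output shape is the one asserted in test_predict for the all-MiniLM-L6-v2 backbone, and the GPU-count handling simply mirrors the example script.

import torch

import flash
from flash.text import TextClassificationData, TextEmbedder

# Only predict_data is needed: TextEmbedder deliberately raises
# NotImplementedError from training_step/validation_step/test_step.
datamodule = TextClassificationData.from_lists(
    predict_data=[
        "The worst movie in the history of cinema.",
        "I come from Bulgaria where it 's almost impossible to have a tornado.",
    ]
)

# Any sentence-transformers checkpoint should work as the backbone.
model = TextEmbedder(backbone="sentence-transformers/all-MiniLM-L6-v2")

# Predicting through the Trainer yields one embedding tensor per input sentence.
trainer = flash.Trainer(gpus=torch.cuda.device_count())
predictions = trainer.predict(model, datamodule=datamodule)
print([t.size() for t in predictions[0]])  # expected: [torch.Size([384]), torch.Size([384])]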