From 1054949b5702aeb2e59da59e2f2b4bb9ee4181b0 Mon Sep 17 00:00:00 2001 From: Ethan Harris Date: Fri, 28 May 2021 19:09:30 +0100 Subject: [PATCH] Fix translation (#332) * Fix translation * Fix example * Fix * Add metric test * Fixes * Fix pickle bug for classification * Update CHANGELOG.md * Add tests --- CHANGELOG.md | 6 ++ flash/text/classification/data.py | 46 ++++++++++ flash/text/seq2seq/core/data.py | 90 ++++++++++++++++++- flash/text/seq2seq/summarization/data.py | 41 +-------- flash/text/seq2seq/translation/data.py | 3 +- flash/text/seq2seq/translation/metric.py | 4 +- flash/text/seq2seq/translation/model.py | 10 +-- flash_examples/finetuning/translation.py | 2 +- tests/text/classification/test_data.py | 39 ++++++++ tests/text/seq2seq/core/test_data.py | 74 +++++++++++++++ .../{ => seq2seq}/summarization/__init__.py | 0 .../{ => seq2seq}/summarization/test_data.py | 0 .../{ => seq2seq}/summarization/test_model.py | 0 .../{ => seq2seq}/translation/__init__.py | 0 .../{ => seq2seq}/translation/test_data.py | 0 tests/text/seq2seq/translation/test_metric.py | 25 ++++++ .../{ => seq2seq}/translation/test_model.py | 0 17 files changed, 289 insertions(+), 51 deletions(-) create mode 100644 tests/text/seq2seq/core/test_data.py rename tests/text/{ => seq2seq}/summarization/__init__.py (100%) rename tests/text/{ => seq2seq}/summarization/test_data.py (100%) rename tests/text/{ => seq2seq}/summarization/test_model.py (100%) rename tests/text/{ => seq2seq}/translation/__init__.py (100%) rename tests/text/{ => seq2seq}/translation/test_data.py (100%) create mode 100644 tests/text/seq2seq/translation/test_metric.py rename tests/text/{ => seq2seq}/translation/test_model.py (100%) diff --git a/CHANGELOG.md b/CHANGELOG.md index 58dfd5cb70..99c916b7a0 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -11,6 +11,12 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). 
- Fixed `flash.Trainer.add_argparse_args` not adding any arguments ([#343](https://github.com/PyTorchLightning/lightning-flash/pull/343)) +- Fixed a bug where the translation task wasn't decoding tokens properly ([#332](https://github.com/PyTorchLightning/lightning-flash/pull/332)) + + +- Fixed a bug where huggingface tokenizers were sometimes being pickled ([#332](https://github.com/PyTorchLightning/lightning-flash/pull/332)) + + ## [0.3.0] - 2021-05-20 ### Added diff --git a/flash/text/classification/data.py b/flash/text/classification/data.py index d0acf1bda7..249af45105 100644 --- a/flash/text/classification/data.py +++ b/flash/text/classification/data.py @@ -38,6 +38,7 @@ def __init__(self, backbone: str, max_length: int = 128): if not _TEXT_AVAILABLE: raise ModuleNotFoundError("Please, pip install -e '.[text]'") + self.backbone = backbone self.tokenizer = AutoTokenizer.from_pretrained(backbone, use_fast=True) self.max_length = max_length @@ -55,6 +56,15 @@ def _transform_label(self, label_to_class_mapping: Dict[str, int], target: str, ex[target] = label_to_class_mapping[ex[target]] return ex + def __getstate__(self): # TODO: Find out why this is being pickled + state = self.__dict__.copy() + state.pop("tokenizer") + return state + + def __setstate__(self, state): + self.__dict__.update(state) + self.tokenizer = AutoTokenizer.from_pretrained(self.backbone, use_fast=True) + class TextFileDataSource(TextDataSource): @@ -115,18 +125,45 @@ def load_data( def predict_load_data(self, data: Any, dataset: AutoDataset): return self.load_data(data, dataset, columns=["input_ids", "attention_mask"]) + def __getstate__(self): # TODO: Find out why this is being pickled + state = self.__dict__.copy() + state.pop("tokenizer") + return state + + def __setstate__(self, state): + self.__dict__.update(state) + self.tokenizer = AutoTokenizer.from_pretrained(self.backbone, use_fast=True) + class TextCSVDataSource(TextFileDataSource): def __init__(self, backbone: str, max_length: int = 128): super().__init__("csv", backbone, max_length=max_length) + def __getstate__(self): # TODO: Find out why this is being pickled + state = self.__dict__.copy() + state.pop("tokenizer") + return state + + def __setstate__(self, state): + self.__dict__.update(state) + self.tokenizer = AutoTokenizer.from_pretrained(self.backbone, use_fast=True) + class TextJSONDataSource(TextFileDataSource): def __init__(self, backbone: str, max_length: int = 128): super().__init__("json", backbone, max_length=max_length) + def __getstate__(self): # TODO: Find out why this is being pickled + state = self.__dict__.copy() + state.pop("tokenizer") + return state + + def __setstate__(self, state): + self.__dict__.update(state) + self.tokenizer = AutoTokenizer.from_pretrained(self.backbone, use_fast=True) + class TextSentencesDataSource(TextDataSource): @@ -143,6 +180,15 @@ def load_data( data = [data] return [self._tokenize_fn(s, ) for s in data] + def __getstate__(self): # TODO: Find out why this is being pickled + state = self.__dict__.copy() + state.pop("tokenizer") + return state + + def __setstate__(self, state): + self.__dict__.update(state) + self.tokenizer = AutoTokenizer.from_pretrained(self.backbone, use_fast=True) + class TextClassificationPreprocess(Preprocess): diff --git a/flash/text/seq2seq/core/data.py b/flash/text/seq2seq/core/data.py index 08a3151a50..28deb8d987 100644 --- a/flash/text/seq2seq/core/data.py +++ b/flash/text/seq2seq/core/data.py @@ -21,7 +21,7 @@ import flash from flash.core.data.data_module import DataModule from 
flash.core.data.data_source import DataSource, DefaultDataSources -from flash.core.data.process import Preprocess +from flash.core.data.process import Postprocess, Preprocess from flash.core.data.properties import ProcessState from flash.core.utilities.imports import _TEXT_AVAILABLE @@ -45,7 +45,8 @@ def __init__( if not _TEXT_AVAILABLE: raise ModuleNotFoundError("Please, pip install -e '.[text]'") - self.tokenizer = AutoTokenizer.from_pretrained(backbone, use_fast=True) + self.backbone = backbone + self.tokenizer = AutoTokenizer.from_pretrained(self.backbone, use_fast=True) self.max_source_length = max_source_length self.max_target_length = max_target_length self.padding = padding @@ -71,6 +72,15 @@ def _tokenize_fn( padding=self.padding, ) + def __getstate__(self): # TODO: Find out why this is being pickled + state = self.__dict__.copy() + state.pop("tokenizer") + return state + + def __setstate__(self, state): + self.__dict__.update(state) + self.tokenizer = AutoTokenizer.from_pretrained(self.backbone, use_fast=True) + class Seq2SeqFileDataSource(Seq2SeqDataSource): @@ -112,6 +122,15 @@ def load_data( def predict_load_data(self, data: Any) -> Union['datasets.Dataset', List[Dict[str, torch.Tensor]]]: return self.load_data(data, columns=["input_ids", "attention_mask"]) + def __getstate__(self): # TODO: Find out why this is being pickled + state = self.__dict__.copy() + state.pop("tokenizer") + return state + + def __setstate__(self, state): + self.__dict__.update(state) + self.tokenizer = AutoTokenizer.from_pretrained(self.backbone, use_fast=True) + class Seq2SeqCSVDataSource(Seq2SeqFileDataSource): @@ -130,6 +149,15 @@ def __init__( padding=padding, ) + def __getstate__(self): # TODO: Find out why this is being pickled + state = self.__dict__.copy() + state.pop("tokenizer") + return state + + def __setstate__(self, state): + self.__dict__.update(state) + self.tokenizer = AutoTokenizer.from_pretrained(self.backbone, use_fast=True) + class Seq2SeqJSONDataSource(Seq2SeqFileDataSource): @@ -148,6 +176,15 @@ def __init__( padding=padding, ) + def __getstate__(self): # TODO: Find out why this is being pickled + state = self.__dict__.copy() + state.pop("tokenizer") + return state + + def __setstate__(self, state): + self.__dict__.update(state) + self.tokenizer = AutoTokenizer.from_pretrained(self.backbone, use_fast=True) + class Seq2SeqSentencesDataSource(Seq2SeqDataSource): @@ -161,6 +198,15 @@ def load_data( data = [data] return [self._tokenize_fn(s) for s in data] + def __getstate__(self): # TODO: Find out why this is being pickled + state = self.__dict__.copy() + state.pop("tokenizer") + return state + + def __setstate__(self, state): + self.__dict__.update(state) + self.tokenizer = AutoTokenizer.from_pretrained(self.backbone, use_fast=True) + @dataclass(unsafe_hash=True, frozen=True) class Seq2SeqBackboneState(ProcessState): @@ -240,7 +286,47 @@ def collate(self, samples: Any) -> Tensor: return default_data_collator(samples) +class Seq2SeqPostprocess(Postprocess): + + def __init__(self): + super().__init__() + + if not _TEXT_AVAILABLE: + raise ModuleNotFoundError("Please, pip install -e '.[text]'") + + self._backbone = None + self._tokenizer = None + + @property + def backbone(self): + backbone_state = self.get_state(Seq2SeqBackboneState) + if backbone_state is not None: + return backbone_state.backbone + + @property + def tokenizer(self): + if self.backbone is not None and self.backbone != self._backbone: + self._tokenizer = AutoTokenizer.from_pretrained(self.backbone, use_fast=True) + 
self._backbone = self.backbone + return self._tokenizer + + def uncollate(self, generated_tokens: Any) -> Any: + pred_str = self.tokenizer.batch_decode(generated_tokens, skip_special_tokens=True) + pred_str = [str.strip(s) for s in pred_str] + return pred_str + + def __getstate__(self): # TODO: Find out why this is being pickled + state = self.__dict__.copy() + state.pop("_tokenizer") + return state + + def __setstate__(self, state): + self.__dict__.update(state) + self._tokenizer = AutoTokenizer.from_pretrained(self.backbone, use_fast=True) + + class Seq2SeqData(DataModule): """Data module for Seq2Seq tasks.""" preprocess_cls = Seq2SeqPreprocess + postprocess_cls = Seq2SeqPostprocess diff --git a/flash/text/seq2seq/summarization/data.py b/flash/text/seq2seq/summarization/data.py index fd6823a0f4..31f81f9ffa 100644 --- a/flash/text/seq2seq/summarization/data.py +++ b/flash/text/seq2seq/summarization/data.py @@ -11,47 +11,10 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -from typing import Any - -from flash.core.data.process import Postprocess -from flash.core.utilities.imports import _TEXT_AVAILABLE -from flash.text.seq2seq.core.data import Seq2SeqBackboneState, Seq2SeqData, Seq2SeqPreprocess - -if _TEXT_AVAILABLE: - from transformers import AutoTokenizer - - -class SummarizationPostprocess(Postprocess): - - def __init__(self): - super().__init__() - - if not _TEXT_AVAILABLE: - raise ModuleNotFoundError("Please, pip install -e '.[text]'") - - self._backbone = None - self._tokenizer = None - - @property - def backbone(self): - backbone_state = self.get_state(Seq2SeqBackboneState) - if backbone_state is not None: - return backbone_state.backbone - - @property - def tokenizer(self): - if self.backbone is not None and self.backbone != self._backbone: - self._tokenizer = AutoTokenizer.from_pretrained(self.backbone, use_fast=True) - self._backbone = self.backbone - return self._tokenizer - - def uncollate(self, generated_tokens: Any) -> Any: - pred_str = self.tokenizer.batch_decode(generated_tokens, skip_special_tokens=True) - pred_str = [str.strip(s) for s in pred_str] - return pred_str +from flash.text.seq2seq.core.data import Seq2SeqData, Seq2SeqPostprocess, Seq2SeqPreprocess class SummarizationData(Seq2SeqData): preprocess_cls = Seq2SeqPreprocess - postprocess_cls = SummarizationPostprocess + postprocess_cls = Seq2SeqPostprocess diff --git a/flash/text/seq2seq/translation/data.py b/flash/text/seq2seq/translation/data.py index 057ce41869..0b9e7a3ce7 100644 --- a/flash/text/seq2seq/translation/data.py +++ b/flash/text/seq2seq/translation/data.py @@ -13,7 +13,7 @@ # limitations under the License. 
from typing import Callable, Dict, Optional, Union -from flash.text.seq2seq.core.data import Seq2SeqData, Seq2SeqPreprocess +from flash.text.seq2seq.core.data import Seq2SeqData, Seq2SeqPostprocess, Seq2SeqPreprocess class TranslationPreprocess(Seq2SeqPreprocess): @@ -45,3 +45,4 @@ class TranslationData(Seq2SeqData): """Data module for Translation tasks.""" preprocess_cls = TranslationPreprocess + postprocess_cls = Seq2SeqPostprocess diff --git a/flash/text/seq2seq/translation/metric.py b/flash/text/seq2seq/translation/metric.py index 615b34ddf3..bd3e4fe872 100644 --- a/flash/text/seq2seq/translation/metric.py +++ b/flash/text/seq2seq/translation/metric.py @@ -81,9 +81,7 @@ def compute(self): return tensor(0.0, device=self.r.device) if self.smooth: - precision_scores = torch.add(self.numerator, torch.ones( - self.n_gram - )) / torch.add(self.denominator, torch.ones(self.n_gram)) + precision_scores = (self.numerator + 1.0) / (self.denominator + 1.0) else: precision_scores = self.numerator / self.denominator diff --git a/flash/text/seq2seq/translation/model.py b/flash/text/seq2seq/translation/model.py index da943ec1f4..e9734cfa8f 100644 --- a/flash/text/seq2seq/translation/model.py +++ b/flash/text/seq2seq/translation/model.py @@ -28,7 +28,7 @@ class TranslationTask(Seq2SeqTask): loss_fn: Loss function for training. optimizer: Optimizer to use for training, defaults to `torch.optim.Adam`. metrics: Metrics to compute for training and evaluation. - learning_rate: Learning rate to use for training, defaults to `3e-4` + learning_rate: Learning rate to use for training, defaults to `1e-5` val_target_max_length: Maximum length of targets in validation. Defaults to `128` num_beams: Number of beams to use in validation when generating predictions. Defaults to `4` n_gram: Maximum n_grams to use in metric calculation. Defaults to `4` @@ -41,11 +41,11 @@ def __init__( loss_fn: Optional[Union[Callable, Mapping, Sequence]] = None, optimizer: Type[torch.optim.Optimizer] = torch.optim.Adam, metrics: Union[pl.metrics.Metric, Mapping, Sequence, None] = None, - learning_rate: float = 3e-4, + learning_rate: float = 1e-5, val_target_max_length: Optional[int] = 128, num_beams: Optional[int] = 4, n_gram: bool = 4, - smooth: bool = False, + smooth: bool = True, ): self.save_hyperparameters() super().__init__( @@ -70,11 +70,11 @@ def compute_metrics(self, generated_tokens, batch, prefix): tgt_lns = self.tokenize_labels(batch["labels"]) # wrap targets in list as score expects a list of potential references tgt_lns = [[reference] for reference in tgt_lns] - result = self.bleu(generated_tokens, tgt_lns) + result = self.bleu(self._postprocess.uncollate(generated_tokens), tgt_lns) self.log(f"{prefix}_bleu_score", result, on_step=False, on_epoch=True, prog_bar=True) def _ci_benchmark_fn(self, history: List[Dict[str, Any]]): """ This function is used only for debugging usage with CI """ - # assert history[-1]["val_bleu_score"] + assert history[-1]["val_bleu_score"] > 0.6 diff --git a/flash_examples/finetuning/translation.py b/flash_examples/finetuning/translation.py index 5965b3d9fc..69b789fa7a 100644 --- a/flash_examples/finetuning/translation.py +++ b/flash_examples/finetuning/translation.py @@ -20,7 +20,7 @@ # 1. Download the data download_data("https://pl-flash-data.s3.amazonaws.com/wmt_en_ro.zip", "data/") -backbone = "t5-small" +backbone = "Helsinki-NLP/opus-mt-en-ro" # 2. 
Load the data datamodule = TranslationData.from_csv( diff --git a/tests/text/classification/test_data.py b/tests/text/classification/test_data.py index 19a4186389..0564355cd6 100644 --- a/tests/text/classification/test_data.py +++ b/tests/text/classification/test_data.py @@ -18,6 +18,16 @@ from flash.core.utilities.imports import _TEXT_AVAILABLE from flash.text import TextClassificationData +from flash.text.classification.data import ( + TextCSVDataSource, + TextDataSource, + TextFileDataSource, + TextJSONDataSource, + TextSentencesDataSource, +) + +if _TEXT_AVAILABLE: + from transformers.tokenization_utils_base import PreTrainedTokenizerBase TEST_BACKBONE = "prajjwal1/bert-tiny" # super small model for testing @@ -92,3 +102,32 @@ def test_from_json(tmpdir): def test_text_module_not_found_error(): with pytest.raises(ModuleNotFoundError, match="[text]"): TextClassificationData.from_json("sentence", "lab", backbone=TEST_BACKBONE, train_file="", batch_size=1) + + +@pytest.mark.skipif(os.name == "nt", reason="Huggingface timing out on Windows") +@pytest.mark.skipif(not _TEXT_AVAILABLE, reason="text libraries aren't installed.") +@pytest.mark.parametrize( + "cls, kwargs", + [ + (TextDataSource, {}), + (TextFileDataSource, { + "filetype": "csv" + }), + (TextCSVDataSource, {}), + (TextJSONDataSource, {}), + (TextSentencesDataSource, {}), + ], +) +def test_tokenizer_state(cls, kwargs): + """Tests that the tokenizer is not in __getstate__""" + instance = cls(backbone="sshleifer/tiny-mbart", **kwargs) + state = instance.__getstate__() + tokenizers = [] + for name, attribute in instance.__dict__.items(): + if isinstance(attribute, PreTrainedTokenizerBase): + assert name not in state + setattr(instance, name, None) + tokenizers.append(name) + instance.__setstate__(state) + for name in tokenizers: + assert getattr(instance, name, None) is not None diff --git a/tests/text/seq2seq/core/test_data.py b/tests/text/seq2seq/core/test_data.py new file mode 100644 index 0000000000..bf63bb86fb --- /dev/null +++ b/tests/text/seq2seq/core/test_data.py @@ -0,0 +1,74 @@ +# Copyright The PyTorch Lightning team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+import os +from pathlib import Path + +import pytest + +from flash.core.utilities.imports import _TEXT_AVAILABLE +from flash.text import TextClassificationData +from flash.text.seq2seq.core.data import ( + Seq2SeqBackboneState, + Seq2SeqCSVDataSource, + Seq2SeqDataSource, + Seq2SeqFileDataSource, + Seq2SeqJSONDataSource, + Seq2SeqPostprocess, + Seq2SeqSentencesDataSource, +) + +if _TEXT_AVAILABLE: + from transformers.tokenization_utils_base import PreTrainedTokenizerBase + + +@pytest.mark.skipif(os.name == "nt", reason="Huggingface timing out on Windows") +@pytest.mark.skipif(not _TEXT_AVAILABLE, reason="text libraries aren't installed.") +@pytest.mark.parametrize( + "cls, kwargs", + [ + (Seq2SeqDataSource, { + "backbone": "sshleifer/tiny-mbart" + }), + (Seq2SeqFileDataSource, { + "backbone": "sshleifer/tiny-mbart", + "filetype": "csv" + }), + (Seq2SeqCSVDataSource, { + "backbone": "sshleifer/tiny-mbart" + }), + (Seq2SeqJSONDataSource, { + "backbone": "sshleifer/tiny-mbart" + }), + (Seq2SeqSentencesDataSource, { + "backbone": "sshleifer/tiny-mbart" + }), + (Seq2SeqPostprocess, {}), + ], +) +def test_tokenizer_state(cls, kwargs): + """Tests that the tokenizer is not in __getstate__""" + process_state = Seq2SeqBackboneState(backbone="sshleifer/tiny-mbart") + instance = cls(**kwargs) + instance.set_state(process_state) + getattr(instance, "tokenizer", None) + state = instance.__getstate__() + tokenizers = [] + for name, attribute in instance.__dict__.items(): + if isinstance(attribute, PreTrainedTokenizerBase): + assert name not in state + setattr(instance, name, None) + tokenizers.append(name) + instance.__setstate__(state) + for name in tokenizers: + assert getattr(instance, name, None) is not None diff --git a/tests/text/summarization/__init__.py b/tests/text/seq2seq/summarization/__init__.py similarity index 100% rename from tests/text/summarization/__init__.py rename to tests/text/seq2seq/summarization/__init__.py diff --git a/tests/text/summarization/test_data.py b/tests/text/seq2seq/summarization/test_data.py similarity index 100% rename from tests/text/summarization/test_data.py rename to tests/text/seq2seq/summarization/test_data.py diff --git a/tests/text/summarization/test_model.py b/tests/text/seq2seq/summarization/test_model.py similarity index 100% rename from tests/text/summarization/test_model.py rename to tests/text/seq2seq/summarization/test_model.py diff --git a/tests/text/translation/__init__.py b/tests/text/seq2seq/translation/__init__.py similarity index 100% rename from tests/text/translation/__init__.py rename to tests/text/seq2seq/translation/__init__.py diff --git a/tests/text/translation/test_data.py b/tests/text/seq2seq/translation/test_data.py similarity index 100% rename from tests/text/translation/test_data.py rename to tests/text/seq2seq/translation/test_data.py diff --git a/tests/text/seq2seq/translation/test_metric.py b/tests/text/seq2seq/translation/test_metric.py new file mode 100644 index 0000000000..86b5784745 --- /dev/null +++ b/tests/text/seq2seq/translation/test_metric.py @@ -0,0 +1,25 @@ +# Copyright The PyTorch Lightning team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import pytest +import torch + +from flash.text.seq2seq.translation.metric import BLEUScore + + +@pytest.mark.parametrize("smooth, expected", [(False, 0.7598), (True, 0.8091)]) +def test_bleu_score(smooth, expected): + translate_corpus = ['the cat is on the mat'.split()] + reference_corpus = [['there is a cat on the mat'.split(), 'a cat is on the mat'.split()]] + metric = BLEUScore(smooth=smooth) + assert torch.allclose(metric(translate_corpus, reference_corpus), torch.tensor(expected), 1e-4) diff --git a/tests/text/translation/test_model.py b/tests/text/seq2seq/translation/test_model.py similarity index 100% rename from tests/text/translation/test_model.py rename to tests/text/seq2seq/translation/test_model.py
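-- 
Everything below this "-- " signature delimiter is ignored by `git am`, so the
patch above still applies cleanly; the sketches here only illustrate the two
fixes.

The pickle fix repeats a single pattern across every data source and the new
`Seq2SeqPostprocess`: keep the backbone name on the instance, drop the
huggingface tokenizer from the pickled state, and rebuild it from that name on
unpickle. A minimal sketch of the pattern, assuming `transformers` is installed
and the tiny test backbone can be fetched or is cached -- `DemoDataSource` is a
hypothetical stand-in, not a class from this patch:

    import pickle

    from transformers import AutoTokenizer


    class DemoDataSource:

        def __init__(self, backbone: str):
            # Storing the backbone name is what makes the rebuild possible.
            self.backbone = backbone
            self.tokenizer = AutoTokenizer.from_pretrained(backbone, use_fast=True)

        def __getstate__(self):
            # Copy the instance dict and deliberately exclude the tokenizer
            # from the pickled state.
            state = self.__dict__.copy()
            state.pop("tokenizer")
            return state

        def __setstate__(self, state):
            # Restore the remaining attributes, then reconstruct the tokenizer
            # from the stored backbone name.
            self.__dict__.update(state)
            self.tokenizer = AutoTokenizer.from_pretrained(self.backbone, use_fast=True)


    source = DemoDataSource("sshleifer/tiny-mbart")
    restored = pickle.loads(pickle.dumps(source))
    assert restored.tokenizer is not None  # rebuilt on unpickle, never pickled

The new `test_tokenizer_state` tests exercise exactly this round trip against
the real classes.

The `metric.py` change rewrites the add-one smoothing of the modified n-gram
precisions using tensor broadcasting in place of `torch.add` with explicit ones
tensors; combined with `smooth` now defaulting to `True` in `TranslationTask`,
a zero count for any n-gram order no longer collapses the geometric mean -- and
therefore the BLEU score -- to zero. An illustrative example with made-up
counts for n = 1..4:

    import torch

    numerator = torch.tensor([5.0, 3.0, 1.0, 0.0])    # clipped n-gram matches
    denominator = torch.tensor([6.0, 5.0, 4.0, 3.0])  # n-grams in the candidate

    unsmoothed = numerator / denominator                # 4-gram precision is 0.0,
                                                        # so unsmoothed BLEU is 0.0
    smoothed = (numerator + 1.0) / (denominator + 1.0)  # every precision stays > 0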