From 19ee3d5305d65f87c792b1f0a9cc5e5ad9963f53 Mon Sep 17 00:00:00 2001 From: Giovanni Zappella Date: Tue, 25 Jul 2023 14:53:03 +0200 Subject: [PATCH 01/37] WIP implementation of multi-dataset NLP benchmark --- src/renate/benchmark/datasets/nlp_datasets.py | 126 ++++++++++++++++++ 1 file changed, 126 insertions(+) diff --git a/src/renate/benchmark/datasets/nlp_datasets.py b/src/renate/benchmark/datasets/nlp_datasets.py index 906cdaae..698329ca 100644 --- a/src/renate/benchmark/datasets/nlp_datasets.py +++ b/src/renate/benchmark/datasets/nlp_datasets.py @@ -1,11 +1,13 @@ # Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. # SPDX-License-Identifier: Apache-2.0 +import functools import logging from typing import Any, Dict, Optional import datasets import torch import transformers +from datasets import load_dataset from renate import defaults from renate.data.data_module import RenateDataModule @@ -134,3 +136,127 @@ def tokenize_fn(batch): self._val_data = _InputTargetWrapper(self._val_data, self._target_column) else: self._train_data, self._val_data = self._split_train_val_data(self._train_data) + + +class MultiTextDataModule(RenateDataModule): + """ + Inspired by the dataset used in "Episodic Memory in Lifelong Language Learning" by d’Autume et al. + it is a collection of five different datasets: AGNews, Yelp, Amazon reviews, DBPedia, Yahoo Answers. + + The output space if the union of the output space of all the datasets. + The dataset has 33 classes: 4 from AGNews, 5 from Yelp, 14 from DBPedia, 5 from Amazon reviews, + 10 from Yahoo. Amazon and Yelp have similar semantics and the classes have been merged. + + The maximum allowed training set size is 115k (the size of the smallest dataset). + The dataset is balanced across datasets by construction and each data batch contains data from + a single dataset. + + Args: + data_path: the path to the folder where the data files will be downloaded to. + tokenizer: Tokenizer to apply to the dataset. See https://huggingface.co/docs/tokenizers/ + for more information on tokenizers. + tokenizer_kwargs: Keyword arguments passed when calling the tokenizer's ``__call__`` + function. Typical options are `max_length`, `padding` and `truncation`. + See https://huggingface.co/docs/tokenizers/ + for more information on tokenizers. If `None` is passed, this defaults to + `{"padding": "max_length", max_length: 128, truncation: True}`. + train_size: the size of the data batch, must be smaller than 115k. + val_size: Fraction of the training data to be used for validation. + seed: Seed used to fix random number generation. 
+ """ + + def __init__( + self, + data_path: str, + tokenizer: transformers.PreTrainedTokenizer, + tokenizer_kwargs: Optional[Dict[str, Any]] = None, + train_size: int = 115000, + val_size: float = defaults.VALIDATION_SIZE, + seed: int = defaults.SEED, + ): + super().__init__( + data_path=data_path, + val_size=val_size, + seed=seed, + ) + self._tokenizer = tokenizer + self._tokenizer_kwargs = tokenizer_kwargs or defaults.TOKENIZER_KWARGS + self._multi_dataset_info = { + "ag_news": ["text", "label"], + "yelp_review_full": ["text", "label"], + "amazon_reviews_multi": ["review_body", "stars"], + "dbpedia_14": ["content", "label"], + "yahoo_answers_topics": ["question_title", "topic"], + } + self._observed_labels = {} + + def prepare_data(self) -> None: + """Download dataset.""" + + for dataset in self._multi_dataset_info.keys(): + for split in ["train", "test"] + (["validation"] if self._val_size > 0 else []): + if dataset == "amazon_reviews_multi": + load_dataset(dataset, name="en", split=split, cache_dir=self._data_path) + else: + load_dataset(dataset, split=split, cache_dir=self._data_path) + + def setup(self) -> None: + """Set up train, test and val datasets.""" + + def preprocess(example, dataset_name, text_field_name, label_field_name): + return { + **self._tokenizer(example[text_field_name], **self._tokenizer_kwargs), + "label": get_label(dataset_name + str(example[label_field_name])), + } + + def get_split(split_name, dataset_name): + dataset = load_dataset(dataset_name, split=split_name, cache_dir=self._data_path) + + new_features = dataset.features.copy() + new_features["label"] = datasets.ClassLabel(num_classes=35) + + dataset = dataset.cast(new_features) + + dataset = dataset.map( + functools.partial( + preprocess, + dataset_name=dataset_name, + text_field_name=self._multi_dataset_info[dataset_name][0], + label_field_name=self._multi_dataset_info[dataset_name][1], + ), + remove_columns=list(dataset.features), + num_proc=4, + ) + + print(dataset) + print(dataset.features) + dataset.set_format(type="torch") + + return _InputTargetWrapper(dataset) + + def get_label(label: str): + if "yelp_review_full" in label: + label = label.replace("yelp_review_full", "amazon_reviews_multi") + + if len(self._observed_labels.keys()) == 0: + self._observed_labels[label] = 0 + return 0 + + elif label not in self._observed_labels.keys(): + max_val = max(self._observed_labels.values()) + self._observed_labels[label] = max_val + 1 + return max_val + 1 + + else: + return self._observed_labels[label] + + self._train_data = [] + self._test_data = [] + if self._val_size > 0: + self._val_data = [] + + for dataset in self._multi_dataset_info: + self._train_data.append(get_split("train", dataset)) + self._test_data.append(get_split("test", dataset)) + if self._val_size > 0: + self._val_data.append(get_split("validation", dataset)) From a00ed47d6e71b85016f3bff861bd552b6bcd24ba Mon Sep 17 00:00:00 2001 From: Giovanni Zappella Date: Tue, 25 Jul 2023 16:51:05 +0200 Subject: [PATCH 02/37] First implementation of 5 datasets NLP benchmark --- src/renate/benchmark/datasets/nlp_datasets.py | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/src/renate/benchmark/datasets/nlp_datasets.py b/src/renate/benchmark/datasets/nlp_datasets.py index 698329ca..0935f336 100644 --- a/src/renate/benchmark/datasets/nlp_datasets.py +++ b/src/renate/benchmark/datasets/nlp_datasets.py @@ -170,7 +170,7 @@ def __init__( data_path: str, tokenizer: transformers.PreTrainedTokenizer, tokenizer_kwargs: 
Optional[Dict[str, Any]] = None, - train_size: int = 115000, + train_size: int = 1000, val_size: float = defaults.VALIDATION_SIZE, seed: int = defaults.SEED, ): @@ -181,6 +181,7 @@ def __init__( ) self._tokenizer = tokenizer self._tokenizer_kwargs = tokenizer_kwargs or defaults.TOKENIZER_KWARGS + self._train_size = train_size self._multi_dataset_info = { "ag_news": ["text", "label"], "yelp_review_full": ["text", "label"], @@ -213,10 +214,15 @@ def get_split(split_name, dataset_name): dataset = load_dataset(dataset_name, split=split_name, cache_dir=self._data_path) new_features = dataset.features.copy() - new_features["label"] = datasets.ClassLabel(num_classes=35) + # the following is hack needed because the output space of the new dataset is + # the union of the output spaces of the single datasets + new_features[self._multi_dataset_info[dataset_name][1]] = datasets.ClassLabel( + num_classes=33 + ) dataset = dataset.cast(new_features) - + rnd_idx = torch.randint(low=0, high=len(dataset), size=(self._train_size,)).tolist() + dataset = dataset.select(indices=rnd_idx) dataset = dataset.map( functools.partial( preprocess, @@ -228,8 +234,6 @@ def get_split(split_name, dataset_name): num_proc=4, ) - print(dataset) - print(dataset.features) dataset.set_format(type="torch") return _InputTargetWrapper(dataset) From fe325e849543de17ab65b4216d4fb8d99b479ac0 Mon Sep 17 00:00:00 2001 From: Giovanni Zappella Date: Tue, 25 Jul 2023 16:51:38 +0200 Subject: [PATCH 03/37] add test for new data module --- .../benchmark/datasets/test_multi_data_nlp.py | 26 +++++++++++++++++++ 1 file changed, 26 insertions(+) create mode 100644 test/renate/benchmark/datasets/test_multi_data_nlp.py diff --git a/test/renate/benchmark/datasets/test_multi_data_nlp.py b/test/renate/benchmark/datasets/test_multi_data_nlp.py new file mode 100644 index 00000000..b6b0403a --- /dev/null +++ b/test/renate/benchmark/datasets/test_multi_data_nlp.py @@ -0,0 +1,26 @@ +# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +# SPDX-License-Identifier: Apache-2.0 +import pytest +import transformers as transformers + +from renate.benchmark.datasets.nlp_datasets import MultiTextDataModule + + +@pytest.mark.skip(reason="This test requires downloading five datasets.") +def test_multi_data_nlp(): + TRAIN_SIZE = 100 + + data = MultiTextDataModule( + "./remove_folder/", + train_size=TRAIN_SIZE, + tokenizer=transformers.DistilBertTokenizer.from_pretrained("distilbert-base-uncased"), + ) + + data.prepare_data() + data.setup() + + assert len(data.train_data()) == 5 + assert len(data.test_data()) == 5 + + for i in range(len(data.train_data())): + assert len(data.train_data()[i]._dataset) == TRAIN_SIZE From b21c0b40c00e9820ae25693c15c71b5461d04620 Mon Sep 17 00:00:00 2001 From: Giovanni Zappella Date: Thu, 27 Jul 2023 11:38:29 +0200 Subject: [PATCH 04/37] Made implementation thread safe. Add dataset selection to speedup loading. --- src/renate/benchmark/datasets/nlp_datasets.py | 137 ++++++++++++------ .../benchmark/datasets/test_multi_data_nlp.py | 38 +++-- 2 files changed, 114 insertions(+), 61 deletions(-) diff --git a/src/renate/benchmark/datasets/nlp_datasets.py b/src/renate/benchmark/datasets/nlp_datasets.py index 0935f336..e0f5c760 100644 --- a/src/renate/benchmark/datasets/nlp_datasets.py +++ b/src/renate/benchmark/datasets/nlp_datasets.py @@ -140,16 +140,16 @@ def tokenize_fn(batch): class MultiTextDataModule(RenateDataModule): """ - Inspired by the dataset used in "Episodic Memory in Lifelong Language Learning" by d’Autume et al. 
- it is a collection of five different datasets: AGNews, Yelp, Amazon reviews, DBPedia, Yahoo Answers. + Inspired by the dataset used in "Episodic Memory in Lifelong Language Learning" + by d’Autume et al. it is a collection of five different datasets that we call domains: + AGNews, Yelp, Amazon reviews, DBPedia and Yahoo Answers. - The output space if the union of the output space of all the datasets. + The output space if the union of the output space of all the domains. The dataset has 33 classes: 4 from AGNews, 5 from Yelp, 14 from DBPedia, 5 from Amazon reviews, 10 from Yahoo. Amazon and Yelp have similar semantics and the classes have been merged. - The maximum allowed training set size is 115k (the size of the smallest dataset). - The dataset is balanced across datasets by construction and each data batch contains data from - a single dataset. + The maximum allowed size for the training set is 115k and for the test set is 7600. + Each domain will have the same fixed size. Args: data_path: the path to the folder where the data files will be downloaded to. @@ -160,7 +160,9 @@ class MultiTextDataModule(RenateDataModule): See https://huggingface.co/docs/tokenizers/ for more information on tokenizers. If `None` is passed, this defaults to `{"padding": "max_length", max_length: 128, truncation: True}`. - train_size: the size of the data batch, must be smaller than 115k. + domain: the dataset to be used + train_size: the size of the data stored as training set, must be smaller than 115000. + test_size: the size of the data stored as test set, must be smaller than 7600. val_size: Fraction of the training data to be used for validation. seed: Seed used to fix random number generation. """ @@ -169,8 +171,10 @@ def __init__( self, data_path: str, tokenizer: transformers.PreTrainedTokenizer, + domain: str, tokenizer_kwargs: Optional[Dict[str, Any]] = None, train_size: int = 1000, + test_size: int = 1000, val_size: float = defaults.VALIDATION_SIZE, seed: int = defaults.SEED, ): @@ -179,9 +183,17 @@ def __init__( val_size=val_size, seed=seed, ) + + if train_size > 115000: + raise ValueError("The `train_size` must be smaller than 115000") + self._train_size = train_size + + if test_size > 7600: + raise ValueError("The `test_size` must be smaller than 7600") + self._test_size = test_size + self._tokenizer = tokenizer self._tokenizer_kwargs = tokenizer_kwargs or defaults.TOKENIZER_KWARGS - self._train_size = train_size self._multi_dataset_info = { "ag_news": ["text", "label"], "yelp_review_full": ["text", "label"], @@ -189,46 +201,98 @@ def __init__( "dbpedia_14": ["content", "label"], "yahoo_answers_topics": ["question_title", "topic"], } - self._observed_labels = {} + self._labels_map = { + "ag_news0": 0, + "ag_news1": 1, + "ag_news3": 2, + "ag_news2": 3, + "amazon_reviews_multi1": 4, + "amazon_reviews_multi4": 5, + "amazon_reviews_multi5": 6, + "amazon_reviews_multi3": 7, + "amazon_reviews_multi2": 8, + "dbpedia_140": 9, + "dbpedia_141": 10, + "dbpedia_142": 11, + "dbpedia_143": 12, + "dbpedia_144": 13, + "dbpedia_145": 14, + "dbpedia_146": 15, + "dbpedia_147": 16, + "dbpedia_148": 17, + "dbpedia_149": 18, + "dbpedia_1410": 19, + "dbpedia_1411": 20, + "dbpedia_1412": 21, + "dbpedia_1413": 22, + "yahoo_answers_topics0": 23, + "yahoo_answers_topics1": 24, + "yahoo_answers_topics2": 25, + "yahoo_answers_topics3": 26, + "yahoo_answers_topics4": 27, + "yahoo_answers_topics5": 28, + "yahoo_answers_topics6": 29, + "yahoo_answers_topics7": 30, + "yahoo_answers_topics8": 31, + "yahoo_answers_topics9": 32, 
+ # yelp gets the same label ids as Amazon reviews + "yelp_review_full0": 4, + "yelp_review_full1": 5, + "yelp_review_full2": 6, + "yelp_review_full3": 7, + "yelp_review_full4": 8, + } + + if domain not in self._multi_dataset_info.keys(): + raise ValueError( + f"The selected domain is not available. Select one among {self._multi_dataset_info.keys()}" + ) + + self._domain = domain def prepare_data(self) -> None: """Download dataset.""" - for dataset in self._multi_dataset_info.keys(): - for split in ["train", "test"] + (["validation"] if self._val_size > 0 else []): - if dataset == "amazon_reviews_multi": - load_dataset(dataset, name="en", split=split, cache_dir=self._data_path) - else: - load_dataset(dataset, split=split, cache_dir=self._data_path) + for split in ["train", "test"] + (["validation"] if self._val_size > 0 else []): + if self._domain == "amazon_reviews_multi": + load_dataset(self._domain, name="en", split=split, cache_dir=self._data_path) + else: + load_dataset(self._domain, split=split, cache_dir=self._data_path) def setup(self) -> None: """Set up train, test and val datasets.""" - def preprocess(example, dataset_name, text_field_name, label_field_name): + def preprocess(example, text_field_name, label_field_name): return { **self._tokenizer(example[text_field_name], **self._tokenizer_kwargs), - "label": get_label(dataset_name + str(example[label_field_name])), + "label": self._labels_map[f"{self._domain}{example[label_field_name]}"], } - def get_split(split_name, dataset_name): - dataset = load_dataset(dataset_name, split=split_name, cache_dir=self._data_path) + def get_split(split_name): + dataset = load_dataset(self._domain, split=split_name, cache_dir=self._data_path) new_features = dataset.features.copy() # the following is hack needed because the output space of the new dataset is # the union of the output spaces of the single datasets - new_features[self._multi_dataset_info[dataset_name][1]] = datasets.ClassLabel( + new_features[self._multi_dataset_info[self._domain][1]] = datasets.ClassLabel( num_classes=33 ) dataset = dataset.cast(new_features) - rnd_idx = torch.randint(low=0, high=len(dataset), size=(self._train_size,)).tolist() + + if "train" in split_name: + set_size = self._train_size + else: + set_size = self._test_size + + rnd_idx = torch.randint(low=0, high=len(dataset), size=(set_size,)).tolist() + dataset = dataset.select(indices=rnd_idx) dataset = dataset.map( functools.partial( preprocess, - dataset_name=dataset_name, - text_field_name=self._multi_dataset_info[dataset_name][0], - label_field_name=self._multi_dataset_info[dataset_name][1], + text_field_name=self._multi_dataset_info[self._domain][0], + label_field_name=self._multi_dataset_info[self._domain][1], ), remove_columns=list(dataset.features), num_proc=4, @@ -238,29 +302,12 @@ def get_split(split_name, dataset_name): return _InputTargetWrapper(dataset) - def get_label(label: str): - if "yelp_review_full" in label: - label = label.replace("yelp_review_full", "amazon_reviews_multi") - - if len(self._observed_labels.keys()) == 0: - self._observed_labels[label] = 0 - return 0 - - elif label not in self._observed_labels.keys(): - max_val = max(self._observed_labels.values()) - self._observed_labels[label] = max_val + 1 - return max_val + 1 - - else: - return self._observed_labels[label] - self._train_data = [] self._test_data = [] if self._val_size > 0: self._val_data = [] - for dataset in self._multi_dataset_info: - self._train_data.append(get_split("train", dataset)) - 
self._test_data.append(get_split("test", dataset)) - if self._val_size > 0: - self._val_data.append(get_split("validation", dataset)) + self._train_data = get_split("train") + self._test_data = get_split("test") + if self._val_size > 0: + self._val_data = get_split("validation") diff --git a/test/renate/benchmark/datasets/test_multi_data_nlp.py b/test/renate/benchmark/datasets/test_multi_data_nlp.py index b6b0403a..d3ccfb9d 100644 --- a/test/renate/benchmark/datasets/test_multi_data_nlp.py +++ b/test/renate/benchmark/datasets/test_multi_data_nlp.py @@ -1,26 +1,32 @@ # Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. # SPDX-License-Identifier: Apache-2.0 -import pytest import transformers as transformers from renate.benchmark.datasets.nlp_datasets import MultiTextDataModule -@pytest.mark.skip(reason="This test requires downloading five datasets.") -def test_multi_data_nlp(): - TRAIN_SIZE = 100 +# @pytest.mark.skip(reason="This test requires downloading and processing five datasets.") +def test_multi_data_nlp_full(): + TRAIN_SIZE = 115000 + TEST_SIZE = 7600 - data = MultiTextDataModule( - "./remove_folder/", - train_size=TRAIN_SIZE, - tokenizer=transformers.DistilBertTokenizer.from_pretrained("distilbert-base-uncased"), - ) + for d in [ + "ag_news", + "yelp_review_full", + "amazon_reviews_multi", + "dbpedia_14", + "yahoo_answers_topics", + ]: + data = MultiTextDataModule( + "./remove_folder/", + train_size=TRAIN_SIZE, + test_size=TEST_SIZE, + domain=d, + tokenizer=transformers.DistilBertTokenizer.from_pretrained("distilbert-base-uncased"), + ) - data.prepare_data() - data.setup() + data.prepare_data() + data.setup() - assert len(data.train_data()) == 5 - assert len(data.test_data()) == 5 - - for i in range(len(data.train_data())): - assert len(data.train_data()[i]._dataset) == TRAIN_SIZE + assert len(data.train_data()) == TRAIN_SIZE + assert len(data.test_data()) == TEST_SIZE From 0dd1618688acb509d537a57a8b41e21ba90ca634 Mon Sep 17 00:00:00 2001 From: Giovanni Zappella Date: Thu, 27 Jul 2023 11:41:28 +0200 Subject: [PATCH 05/37] fix labels map --- src/renate/benchmark/datasets/nlp_datasets.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/renate/benchmark/datasets/nlp_datasets.py b/src/renate/benchmark/datasets/nlp_datasets.py index e0f5c760..722326f9 100644 --- a/src/renate/benchmark/datasets/nlp_datasets.py +++ b/src/renate/benchmark/datasets/nlp_datasets.py @@ -207,10 +207,10 @@ def __init__( "ag_news3": 2, "ag_news2": 3, "amazon_reviews_multi1": 4, - "amazon_reviews_multi4": 5, - "amazon_reviews_multi5": 6, - "amazon_reviews_multi3": 7, - "amazon_reviews_multi2": 8, + "amazon_reviews_multi2": 5, + "amazon_reviews_multi3": 6, + "amazon_reviews_multi4": 7, + "amazon_reviews_multi5": 8, "dbpedia_140": 9, "dbpedia_141": 10, "dbpedia_142": 11, From 2198d2aa2bad90290f81ace45d2417c977c9cb9a Mon Sep 17 00:00:00 2001 From: Giovanni Zappella Date: Thu, 27 Jul 2023 11:44:51 +0200 Subject: [PATCH 06/37] Add quick test --- .../benchmark/datasets/test_multi_data_nlp.py | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/test/renate/benchmark/datasets/test_multi_data_nlp.py b/test/renate/benchmark/datasets/test_multi_data_nlp.py index d3ccfb9d..d2602e8a 100644 --- a/test/renate/benchmark/datasets/test_multi_data_nlp.py +++ b/test/renate/benchmark/datasets/test_multi_data_nlp.py @@ -5,6 +5,25 @@ from renate.benchmark.datasets.nlp_datasets import MultiTextDataModule +def test_multi_data_nlp_small(): + TRAIN_SIZE = 100 + TEST_SIZE = 100 + 
+ data = MultiTextDataModule( + "./remove_folder/", + train_size=TRAIN_SIZE, + test_size=TEST_SIZE, + domain="ag_news", + tokenizer=transformers.DistilBertTokenizer.from_pretrained("distilbert-base-uncased"), + ) + + data.prepare_data() + data.setup() + + assert len(data.train_data()) == TRAIN_SIZE + assert len(data.test_data()) == TEST_SIZE + + # @pytest.mark.skip(reason="This test requires downloading and processing five datasets.") def test_multi_data_nlp_full(): TRAIN_SIZE = 115000 From c4404a91c275a043b33a4bb3d1e4118e089d4b2d Mon Sep 17 00:00:00 2001 From: Giovanni Zappella Date: Thu, 27 Jul 2023 11:54:21 +0200 Subject: [PATCH 07/37] fix docstring and defaults. remove unused code. --- src/renate/benchmark/datasets/nlp_datasets.py | 25 ++++++++----------- src/renate/defaults.py | 3 +++ 2 files changed, 13 insertions(+), 15 deletions(-) diff --git a/src/renate/benchmark/datasets/nlp_datasets.py b/src/renate/benchmark/datasets/nlp_datasets.py index 722326f9..7f23bf78 100644 --- a/src/renate/benchmark/datasets/nlp_datasets.py +++ b/src/renate/benchmark/datasets/nlp_datasets.py @@ -141,18 +141,18 @@ def tokenize_fn(batch): class MultiTextDataModule(RenateDataModule): """ Inspired by the dataset used in "Episodic Memory in Lifelong Language Learning" - by d’Autume et al. it is a collection of five different datasets that we call domains: + by d’Autume et al. this is a collection of five different datasets that we call domains: AGNews, Yelp, Amazon reviews, DBPedia and Yahoo Answers. The output space if the union of the output space of all the domains. The dataset has 33 classes: 4 from AGNews, 5 from Yelp, 14 from DBPedia, 5 from Amazon reviews, - 10 from Yahoo. Amazon and Yelp have similar semantics and the classes have been merged. + and 10 from Yahoo. Amazon and Yelp have similar semantics and the classes have been merged. - The maximum allowed size for the training set is 115k and for the test set is 7600. + The maximum allowed size for the training set is 115000 and for the test set is 7600. Each domain will have the same fixed size. Args: - data_path: the path to the folder where the data files will be downloaded to. + data_path: The path to the folder where the data files will be downloaded to. tokenizer: Tokenizer to apply to the dataset. See https://huggingface.co/docs/tokenizers/ for more information on tokenizers. tokenizer_kwargs: Keyword arguments passed when calling the tokenizer's ``__call__`` @@ -160,9 +160,9 @@ class MultiTextDataModule(RenateDataModule): See https://huggingface.co/docs/tokenizers/ for more information on tokenizers. If `None` is passed, this defaults to `{"padding": "max_length", max_length: 128, truncation: True}`. - domain: the dataset to be used - train_size: the size of the data stored as training set, must be smaller than 115000. - test_size: the size of the data stored as test set, must be smaller than 7600. + domain: The dataset to be used + train_size: The size of the data stored as training set, must be smaller than 115000. + test_size: The size of the data stored as test set, must be smaller than 7600. val_size: Fraction of the training data to be used for validation. seed: Seed used to fix random number generation. 
""" @@ -173,8 +173,8 @@ def __init__( tokenizer: transformers.PreTrainedTokenizer, domain: str, tokenizer_kwargs: Optional[Dict[str, Any]] = None, - train_size: int = 1000, - test_size: int = 1000, + train_size: int = defaults.TRAIN_SET_SIZE, + test_size: int = defaults.TEST_SET_SIZE, val_size: float = defaults.VALIDATION_SIZE, seed: int = defaults.SEED, ): @@ -286,8 +286,8 @@ def get_split(split_name): set_size = self._test_size rnd_idx = torch.randint(low=0, high=len(dataset), size=(set_size,)).tolist() - dataset = dataset.select(indices=rnd_idx) + dataset = dataset.map( functools.partial( preprocess, @@ -302,11 +302,6 @@ def get_split(split_name): return _InputTargetWrapper(dataset) - self._train_data = [] - self._test_data = [] - if self._val_size > 0: - self._val_data = [] - self._train_data = get_split("train") self._test_data = get_split("test") if self._val_size > 0: diff --git a/src/renate/defaults.py b/src/renate/defaults.py index 8e193270..26cf9952 100644 --- a/src/renate/defaults.py +++ b/src/renate/defaults.py @@ -104,6 +104,9 @@ # Benchmark datasets/models TOKENIZER_KWARGS = {"padding": "max_length", "max_length": 128, "truncation": True} +TRAIN_SET_SIZE = 1000 +TEST_SET_SIZE = 1000 +s def scheduler(config_space: Dict[str, Any], mode: str, metric: str): From 15c07451b4ad1d3c117bec1ff7f9b7832c99489d Mon Sep 17 00:00:00 2001 From: Giovanni Zappella Date: Thu, 27 Jul 2023 11:55:11 +0200 Subject: [PATCH 08/37] fix typo --- src/renate/defaults.py | 1 - 1 file changed, 1 deletion(-) diff --git a/src/renate/defaults.py b/src/renate/defaults.py index 26cf9952..700a67e4 100644 --- a/src/renate/defaults.py +++ b/src/renate/defaults.py @@ -106,7 +106,6 @@ TOKENIZER_KWARGS = {"padding": "max_length", "max_length": 128, "truncation": True} TRAIN_SET_SIZE = 1000 TEST_SET_SIZE = 1000 -s def scheduler(config_space: Dict[str, Any], mode: str, metric: str): From cb4d723f60f1f455c68cbfdf22189e5b5fc79587 Mon Sep 17 00:00:00 2001 From: Giovanni Zappella Date: Thu, 27 Jul 2023 12:00:06 +0200 Subject: [PATCH 09/37] make flake happy --- src/renate/benchmark/datasets/nlp_datasets.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/renate/benchmark/datasets/nlp_datasets.py b/src/renate/benchmark/datasets/nlp_datasets.py index 7f23bf78..271ba317 100644 --- a/src/renate/benchmark/datasets/nlp_datasets.py +++ b/src/renate/benchmark/datasets/nlp_datasets.py @@ -245,7 +245,8 @@ def __init__( if domain not in self._multi_dataset_info.keys(): raise ValueError( - f"The selected domain is not available. Select one among {self._multi_dataset_info.keys()}" + f"The selected domain is not available. 
Select one among " + f"{self._multi_dataset_info.keys()}" ) self._domain = domain From ef956b38f8ad8958164cb99dc8a7ebf6897ee3bb Mon Sep 17 00:00:00 2001 From: Giovanni Zappella Date: Thu, 27 Jul 2023 13:31:05 +0200 Subject: [PATCH 10/37] skip long test --- test/renate/benchmark/datasets/test_multi_data_nlp.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/renate/benchmark/datasets/test_multi_data_nlp.py b/test/renate/benchmark/datasets/test_multi_data_nlp.py index d2602e8a..d1a2b281 100644 --- a/test/renate/benchmark/datasets/test_multi_data_nlp.py +++ b/test/renate/benchmark/datasets/test_multi_data_nlp.py @@ -24,7 +24,7 @@ def test_multi_data_nlp_small(): assert len(data.test_data()) == TEST_SIZE -# @pytest.mark.skip(reason="This test requires downloading and processing five datasets.") +@pytest.mark.skip(reason="This test requires downloading and processing five datasets.") def test_multi_data_nlp_full(): TRAIN_SIZE = 115000 TEST_SIZE = 7600 From 2d3de75d99ab3776c3c9dc7d59d454f9722cadf4 Mon Sep 17 00:00:00 2001 From: Giovanni Zappella Date: Thu, 27 Jul 2023 13:54:35 +0200 Subject: [PATCH 11/37] import pytest --- test/renate/benchmark/datasets/test_multi_data_nlp.py | 1 + 1 file changed, 1 insertion(+) diff --git a/test/renate/benchmark/datasets/test_multi_data_nlp.py b/test/renate/benchmark/datasets/test_multi_data_nlp.py index d1a2b281..c318f562 100644 --- a/test/renate/benchmark/datasets/test_multi_data_nlp.py +++ b/test/renate/benchmark/datasets/test_multi_data_nlp.py @@ -1,5 +1,6 @@ # Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. # SPDX-License-Identifier: Apache-2.0 +import pytest import transformers as transformers from renate.benchmark.datasets.nlp_datasets import MultiTextDataModule From 5eac7fc0427f7ebd5d5f5bbd3135fb5de6d5d2c2 Mon Sep 17 00:00:00 2001 From: Giovanni Zappella Date: Mon, 31 Jul 2023 10:39:50 +0200 Subject: [PATCH 12/37] skip test --- test/renate/benchmark/datasets/test_multi_data_nlp.py | 1 + 1 file changed, 1 insertion(+) diff --git a/test/renate/benchmark/datasets/test_multi_data_nlp.py b/test/renate/benchmark/datasets/test_multi_data_nlp.py index c318f562..f4dcb24e 100644 --- a/test/renate/benchmark/datasets/test_multi_data_nlp.py +++ b/test/renate/benchmark/datasets/test_multi_data_nlp.py @@ -6,6 +6,7 @@ from renate.benchmark.datasets.nlp_datasets import MultiTextDataModule +@pytest.mark.skip(reason="This test create problems with the syne-tune redirect test") def test_multi_data_nlp_small(): TRAIN_SIZE = 100 TEST_SIZE = 100 From d5c83335e959ae87098a2572434957a031ee5d5e Mon Sep 17 00:00:00 2001 From: Giovanni Zappella Date: Mon, 31 Jul 2023 15:53:45 +0200 Subject: [PATCH 13/37] rename domain --- src/renate/benchmark/datasets/nlp_datasets.py | 19 +++++++------- .../benchmark/datasets/test_multi_data_nlp.py | 26 +++++++++++++------ 2 files changed, 28 insertions(+), 17 deletions(-) diff --git a/src/renate/benchmark/datasets/nlp_datasets.py b/src/renate/benchmark/datasets/nlp_datasets.py index 271ba317..8ded5358 100644 --- a/src/renate/benchmark/datasets/nlp_datasets.py +++ b/src/renate/benchmark/datasets/nlp_datasets.py @@ -249,16 +249,17 @@ def __init__( f"{self._multi_dataset_info.keys()}" ) - self._domain = domain + self.domain = domain + self.available_domains = self._multi_dataset_info.keys() def prepare_data(self) -> None: """Download dataset.""" for split in ["train", "test"] + (["validation"] if self._val_size > 0 else []): - if self._domain == "amazon_reviews_multi": - load_dataset(self._domain, name="en", 
split=split, cache_dir=self._data_path) + if "amazon" in self.domain: + load_dataset(self.domain, name="en", split=split, cache_dir=self._data_path) else: - load_dataset(self._domain, split=split, cache_dir=self._data_path) + load_dataset(self.domain, split=split, cache_dir=self._data_path) def setup(self) -> None: """Set up train, test and val datasets.""" @@ -266,16 +267,16 @@ def setup(self) -> None: def preprocess(example, text_field_name, label_field_name): return { **self._tokenizer(example[text_field_name], **self._tokenizer_kwargs), - "label": self._labels_map[f"{self._domain}{example[label_field_name]}"], + "label": self._labels_map[f"{self.domain}{example[label_field_name]}"], } def get_split(split_name): - dataset = load_dataset(self._domain, split=split_name, cache_dir=self._data_path) + dataset = load_dataset(self.domain, split=split_name, cache_dir=self._data_path) new_features = dataset.features.copy() # the following is hack needed because the output space of the new dataset is # the union of the output spaces of the single datasets - new_features[self._multi_dataset_info[self._domain][1]] = datasets.ClassLabel( + new_features[self._multi_dataset_info[self.domain][1]] = datasets.ClassLabel( num_classes=33 ) @@ -292,8 +293,8 @@ def get_split(split_name): dataset = dataset.map( functools.partial( preprocess, - text_field_name=self._multi_dataset_info[self._domain][0], - label_field_name=self._multi_dataset_info[self._domain][1], + text_field_name=self._multi_dataset_info[self.domain][0], + label_field_name=self._multi_dataset_info[self.domain][1], ), remove_columns=list(dataset.features), num_proc=4, diff --git a/test/renate/benchmark/datasets/test_multi_data_nlp.py b/test/renate/benchmark/datasets/test_multi_data_nlp.py index f4dcb24e..cd14a307 100644 --- a/test/renate/benchmark/datasets/test_multi_data_nlp.py +++ b/test/renate/benchmark/datasets/test_multi_data_nlp.py @@ -1,21 +1,20 @@ # Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
# SPDX-License-Identifier: Apache-2.0 -import pytest import transformers as transformers from renate.benchmark.datasets.nlp_datasets import MultiTextDataModule @pytest.mark.skip(reason="This test create problems with the syne-tune redirect test") -def test_multi_data_nlp_small(): +def test_multi_data_nlp_small(tmpdir): TRAIN_SIZE = 100 TEST_SIZE = 100 data = MultiTextDataModule( - "./remove_folder/", + tmpdir, train_size=TRAIN_SIZE, test_size=TEST_SIZE, - domain="ag_news", + domain="amazon_reviews_multi", tokenizer=transformers.DistilBertTokenizer.from_pretrained("distilbert-base-uncased"), ) @@ -25,9 +24,18 @@ def test_multi_data_nlp_small(): assert len(data.train_data()) == TRAIN_SIZE assert len(data.test_data()) == TEST_SIZE + data.domain = "dbpedia_14" + data.prepare_data() + data.setup() + + tr_data_dbpedia = data.train_data() + te_data_dbpedia = data.test_data() + assert len(tr_data_dbpedia) == TRAIN_SIZE + assert len(te_data_dbpedia) == TEST_SIZE + @pytest.mark.skip(reason="This test requires downloading and processing five datasets.") -def test_multi_data_nlp_full(): +def test_multi_data_nlp_full(tmpdir): TRAIN_SIZE = 115000 TEST_SIZE = 7600 @@ -39,7 +47,7 @@ def test_multi_data_nlp_full(): "yahoo_answers_topics", ]: data = MultiTextDataModule( - "./remove_folder/", + tmpdir, train_size=TRAIN_SIZE, test_size=TEST_SIZE, domain=d, @@ -49,5 +57,7 @@ def test_multi_data_nlp_full(): data.prepare_data() data.setup() - assert len(data.train_data()) == TRAIN_SIZE - assert len(data.test_data()) == TEST_SIZE + tr_data_agnews = data.train_data() + te_data_agnews = data.test_data() + assert len(tr_data_agnews) == TRAIN_SIZE + assert len(te_data_agnews) == TEST_SIZE From 40483508197ad5d2f4c77d6af5026d0389276645 Mon Sep 17 00:00:00 2001 From: Giovanni Zappella Date: Mon, 31 Jul 2023 15:54:39 +0200 Subject: [PATCH 14/37] import pytest --- test/renate/benchmark/datasets/test_multi_data_nlp.py | 1 + 1 file changed, 1 insertion(+) diff --git a/test/renate/benchmark/datasets/test_multi_data_nlp.py b/test/renate/benchmark/datasets/test_multi_data_nlp.py index cd14a307..f69040be 100644 --- a/test/renate/benchmark/datasets/test_multi_data_nlp.py +++ b/test/renate/benchmark/datasets/test_multi_data_nlp.py @@ -1,5 +1,6 @@ # Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. # SPDX-License-Identifier: Apache-2.0 +import pytest import transformers as transformers from renate.benchmark.datasets.nlp_datasets import MultiTextDataModule From 6f81901a2e969fc2a8fa6446253ec57ac165d404 Mon Sep 17 00:00:00 2001 From: Giovanni Zappella Date: Tue, 1 Aug 2023 16:22:23 +0200 Subject: [PATCH 15/37] remove amazon reviews --- src/renate/benchmark/datasets/nlp_datasets.py | 27 ++++++------------- .../benchmark/datasets/test_multi_data_nlp.py | 3 +-- 2 files changed, 9 insertions(+), 21 deletions(-) diff --git a/src/renate/benchmark/datasets/nlp_datasets.py b/src/renate/benchmark/datasets/nlp_datasets.py index 8ded5358..9ee04fd4 100644 --- a/src/renate/benchmark/datasets/nlp_datasets.py +++ b/src/renate/benchmark/datasets/nlp_datasets.py @@ -142,11 +142,10 @@ class MultiTextDataModule(RenateDataModule): """ Inspired by the dataset used in "Episodic Memory in Lifelong Language Learning" by d’Autume et al. this is a collection of five different datasets that we call domains: - AGNews, Yelp, Amazon reviews, DBPedia and Yahoo Answers. + AGNews, Yelp, DBPedia and Yahoo Answers. The output space if the union of the output space of all the domains. 
- The dataset has 33 classes: 4 from AGNews, 5 from Yelp, 14 from DBPedia, 5 from Amazon reviews, - and 10 from Yahoo. Amazon and Yelp have similar semantics and the classes have been merged. + The dataset has 33 classes: 4 from AGNews, 5 from Yelp, 14 from DBPedia, and 10 from Yahoo. The maximum allowed size for the training set is 115000 and for the test set is 7600. Each domain will have the same fixed size. @@ -197,7 +196,6 @@ def __init__( self._multi_dataset_info = { "ag_news": ["text", "label"], "yelp_review_full": ["text", "label"], - "amazon_reviews_multi": ["review_body", "stars"], "dbpedia_14": ["content", "label"], "yahoo_answers_topics": ["question_title", "topic"], } @@ -206,11 +204,11 @@ def __init__( "ag_news1": 1, "ag_news3": 2, "ag_news2": 3, - "amazon_reviews_multi1": 4, - "amazon_reviews_multi2": 5, - "amazon_reviews_multi3": 6, - "amazon_reviews_multi4": 7, - "amazon_reviews_multi5": 8, + "yelp_review_full0": 4, + "yelp_review_full1": 5, + "yelp_review_full2": 6, + "yelp_review_full3": 7, + "yelp_review_full4": 8, "dbpedia_140": 9, "dbpedia_141": 10, "dbpedia_142": 11, @@ -235,12 +233,6 @@ def __init__( "yahoo_answers_topics7": 30, "yahoo_answers_topics8": 31, "yahoo_answers_topics9": 32, - # yelp gets the same label ids as Amazon reviews - "yelp_review_full0": 4, - "yelp_review_full1": 5, - "yelp_review_full2": 6, - "yelp_review_full3": 7, - "yelp_review_full4": 8, } if domain not in self._multi_dataset_info.keys(): @@ -256,10 +248,7 @@ def prepare_data(self) -> None: """Download dataset.""" for split in ["train", "test"] + (["validation"] if self._val_size > 0 else []): - if "amazon" in self.domain: - load_dataset(self.domain, name="en", split=split, cache_dir=self._data_path) - else: - load_dataset(self.domain, split=split, cache_dir=self._data_path) + load_dataset(self.domain, split=split, cache_dir=self._data_path) def setup(self) -> None: """Set up train, test and val datasets.""" diff --git a/test/renate/benchmark/datasets/test_multi_data_nlp.py b/test/renate/benchmark/datasets/test_multi_data_nlp.py index f69040be..58677c73 100644 --- a/test/renate/benchmark/datasets/test_multi_data_nlp.py +++ b/test/renate/benchmark/datasets/test_multi_data_nlp.py @@ -15,7 +15,7 @@ def test_multi_data_nlp_small(tmpdir): tmpdir, train_size=TRAIN_SIZE, test_size=TEST_SIZE, - domain="amazon_reviews_multi", + domain="ag_news", tokenizer=transformers.DistilBertTokenizer.from_pretrained("distilbert-base-uncased"), ) @@ -43,7 +43,6 @@ def test_multi_data_nlp_full(tmpdir): for d in [ "ag_news", "yelp_review_full", - "amazon_reviews_multi", "dbpedia_14", "yahoo_answers_topics", ]:
Select one among " + f"{self._multi_dataset_info.keys()}" ) self._domain = domain From bf2f8f2a9411de32f5359f9a761e41d714ec1ae4 Mon Sep 17 00:00:00 2001 From: Giovanni Zappella Date: Thu, 27 Jul 2023 13:31:05 +0200 Subject: [PATCH 25/37] skip long test --- test/renate/benchmark/datasets/test_multi_data_nlp.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/renate/benchmark/datasets/test_multi_data_nlp.py b/test/renate/benchmark/datasets/test_multi_data_nlp.py index d2602e8a..d1a2b281 100644 --- a/test/renate/benchmark/datasets/test_multi_data_nlp.py +++ b/test/renate/benchmark/datasets/test_multi_data_nlp.py @@ -24,7 +24,7 @@ def test_multi_data_nlp_small(): assert len(data.test_data()) == TEST_SIZE -# @pytest.mark.skip(reason="This test requires downloading and processing five datasets.") +@pytest.mark.skip(reason="This test requires downloading and processing five datasets.") def test_multi_data_nlp_full(): TRAIN_SIZE = 115000 TEST_SIZE = 7600 From cb6fe3b7f346f8fd6bf87aa63b2a7c15f154df60 Mon Sep 17 00:00:00 2001 From: Giovanni Zappella Date: Thu, 27 Jul 2023 13:54:35 +0200 Subject: [PATCH 26/37] import pytest --- test/renate/benchmark/datasets/test_multi_data_nlp.py | 1 + 1 file changed, 1 insertion(+) diff --git a/test/renate/benchmark/datasets/test_multi_data_nlp.py b/test/renate/benchmark/datasets/test_multi_data_nlp.py index d1a2b281..c318f562 100644 --- a/test/renate/benchmark/datasets/test_multi_data_nlp.py +++ b/test/renate/benchmark/datasets/test_multi_data_nlp.py @@ -1,5 +1,6 @@ # Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. # SPDX-License-Identifier: Apache-2.0 +import pytest import transformers as transformers from renate.benchmark.datasets.nlp_datasets import MultiTextDataModule From df4d172fa69e7662f3e0533eb5c69d6c85141549 Mon Sep 17 00:00:00 2001 From: Giovanni Zappella Date: Mon, 31 Jul 2023 10:39:50 +0200 Subject: [PATCH 27/37] skip test --- test/renate/benchmark/datasets/test_multi_data_nlp.py | 1 + 1 file changed, 1 insertion(+) diff --git a/test/renate/benchmark/datasets/test_multi_data_nlp.py b/test/renate/benchmark/datasets/test_multi_data_nlp.py index c318f562..f4dcb24e 100644 --- a/test/renate/benchmark/datasets/test_multi_data_nlp.py +++ b/test/renate/benchmark/datasets/test_multi_data_nlp.py @@ -6,6 +6,7 @@ from renate.benchmark.datasets.nlp_datasets import MultiTextDataModule +@pytest.mark.skip(reason="This test create problems with the syne-tune redirect test") def test_multi_data_nlp_small(): TRAIN_SIZE = 100 TEST_SIZE = 100 From 5060e79881a715319c3774b625c5d2a1884a6604 Mon Sep 17 00:00:00 2001 From: Giovanni Zappella Date: Mon, 31 Jul 2023 15:53:45 +0200 Subject: [PATCH 28/37] rename domain --- src/renate/benchmark/datasets/nlp_datasets.py | 19 +++++++------- .../benchmark/datasets/test_multi_data_nlp.py | 26 +++++++++++++------ 2 files changed, 28 insertions(+), 17 deletions(-) diff --git a/src/renate/benchmark/datasets/nlp_datasets.py b/src/renate/benchmark/datasets/nlp_datasets.py index 271ba317..8ded5358 100644 --- a/src/renate/benchmark/datasets/nlp_datasets.py +++ b/src/renate/benchmark/datasets/nlp_datasets.py @@ -249,16 +249,17 @@ def __init__( f"{self._multi_dataset_info.keys()}" ) - self._domain = domain + self.domain = domain + self.available_domains = self._multi_dataset_info.keys() def prepare_data(self) -> None: """Download dataset.""" for split in ["train", "test"] + (["validation"] if self._val_size > 0 else []): - if self._domain == "amazon_reviews_multi": - load_dataset(self._domain, name="en", 
split=split, cache_dir=self._data_path) + if "amazon" in self.domain: + load_dataset(self.domain, name="en", split=split, cache_dir=self._data_path) else: - load_dataset(self._domain, split=split, cache_dir=self._data_path) + load_dataset(self.domain, split=split, cache_dir=self._data_path) def setup(self) -> None: """Set up train, test and val datasets.""" @@ -266,16 +267,16 @@ def setup(self) -> None: def preprocess(example, text_field_name, label_field_name): return { **self._tokenizer(example[text_field_name], **self._tokenizer_kwargs), - "label": self._labels_map[f"{self._domain}{example[label_field_name]}"], + "label": self._labels_map[f"{self.domain}{example[label_field_name]}"], } def get_split(split_name): - dataset = load_dataset(self._domain, split=split_name, cache_dir=self._data_path) + dataset = load_dataset(self.domain, split=split_name, cache_dir=self._data_path) new_features = dataset.features.copy() # the following is hack needed because the output space of the new dataset is # the union of the output spaces of the single datasets - new_features[self._multi_dataset_info[self._domain][1]] = datasets.ClassLabel( + new_features[self._multi_dataset_info[self.domain][1]] = datasets.ClassLabel( num_classes=33 ) @@ -292,8 +293,8 @@ def get_split(split_name): dataset = dataset.map( functools.partial( preprocess, - text_field_name=self._multi_dataset_info[self._domain][0], - label_field_name=self._multi_dataset_info[self._domain][1], + text_field_name=self._multi_dataset_info[self.domain][0], + label_field_name=self._multi_dataset_info[self.domain][1], ), remove_columns=list(dataset.features), num_proc=4, diff --git a/test/renate/benchmark/datasets/test_multi_data_nlp.py b/test/renate/benchmark/datasets/test_multi_data_nlp.py index f4dcb24e..cd14a307 100644 --- a/test/renate/benchmark/datasets/test_multi_data_nlp.py +++ b/test/renate/benchmark/datasets/test_multi_data_nlp.py @@ -1,21 +1,20 @@ # Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
# SPDX-License-Identifier: Apache-2.0 -import pytest import transformers as transformers from renate.benchmark.datasets.nlp_datasets import MultiTextDataModule @pytest.mark.skip(reason="This test create problems with the syne-tune redirect test") -def test_multi_data_nlp_small(): +def test_multi_data_nlp_small(tmpdir): TRAIN_SIZE = 100 TEST_SIZE = 100 data = MultiTextDataModule( - "./remove_folder/", + tmpdir, train_size=TRAIN_SIZE, test_size=TEST_SIZE, - domain="ag_news", + domain="amazon_reviews_multi", tokenizer=transformers.DistilBertTokenizer.from_pretrained("distilbert-base-uncased"), ) @@ -25,9 +24,18 @@ def test_multi_data_nlp_small(): assert len(data.train_data()) == TRAIN_SIZE assert len(data.test_data()) == TEST_SIZE + data.domain = "dbpedia_14" + data.prepare_data() + data.setup() + + tr_data_dbpedia = data.train_data() + te_data_dbpedia = data.test_data() + assert len(tr_data_dbpedia) == TRAIN_SIZE + assert len(te_data_dbpedia) == TEST_SIZE + @pytest.mark.skip(reason="This test requires downloading and processing five datasets.") -def test_multi_data_nlp_full(): +def test_multi_data_nlp_full(tmpdir): TRAIN_SIZE = 115000 TEST_SIZE = 7600 @@ -39,7 +47,7 @@ def test_multi_data_nlp_full(): "yahoo_answers_topics", ]: data = MultiTextDataModule( - "./remove_folder/", + tmpdir, train_size=TRAIN_SIZE, test_size=TEST_SIZE, domain=d, @@ -49,5 +57,7 @@ def test_multi_data_nlp_full(): data.prepare_data() data.setup() - assert len(data.train_data()) == TRAIN_SIZE - assert len(data.test_data()) == TEST_SIZE + tr_data_agnews = data.train_data() + te_data_agnews = data.test_data() + assert len(tr_data_agnews) == TRAIN_SIZE + assert len(te_data_agnews) == TEST_SIZE From d00115114d671ff00319cc802c4bc606390c8cd1 Mon Sep 17 00:00:00 2001 From: Giovanni Zappella Date: Mon, 31 Jul 2023 15:54:39 +0200 Subject: [PATCH 29/37] import pytest --- test/renate/benchmark/datasets/test_multi_data_nlp.py | 1 + 1 file changed, 1 insertion(+) diff --git a/test/renate/benchmark/datasets/test_multi_data_nlp.py b/test/renate/benchmark/datasets/test_multi_data_nlp.py index cd14a307..f69040be 100644 --- a/test/renate/benchmark/datasets/test_multi_data_nlp.py +++ b/test/renate/benchmark/datasets/test_multi_data_nlp.py @@ -1,5 +1,6 @@ # Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. # SPDX-License-Identifier: Apache-2.0 +import pytest import transformers as transformers from renate.benchmark.datasets.nlp_datasets import MultiTextDataModule From e7985ec43c3b5279ba8a5685a2be96d2d746f57c Mon Sep 17 00:00:00 2001 From: Giovanni Zappella Date: Tue, 1 Aug 2023 16:22:23 +0200 Subject: [PATCH 30/37] remove amazon reviews --- src/renate/benchmark/datasets/nlp_datasets.py | 27 ++++++------------- .../benchmark/datasets/test_multi_data_nlp.py | 3 +-- 2 files changed, 9 insertions(+), 21 deletions(-) diff --git a/src/renate/benchmark/datasets/nlp_datasets.py b/src/renate/benchmark/datasets/nlp_datasets.py index 8ded5358..9ee04fd4 100644 --- a/src/renate/benchmark/datasets/nlp_datasets.py +++ b/src/renate/benchmark/datasets/nlp_datasets.py @@ -142,11 +142,10 @@ class MultiTextDataModule(RenateDataModule): """ Inspired by the dataset used in "Episodic Memory in Lifelong Language Learning" by d’Autume et al. this is a collection of five different datasets that we call domains: - AGNews, Yelp, Amazon reviews, DBPedia and Yahoo Answers. + AGNews, Yelp, DBPedia and Yahoo Answers. The output space if the union of the output space of all the domains. 
- The dataset has 33 classes: 4 from AGNews, 5 from Yelp, 14 from DBPedia, 5 from Amazon reviews, - and 10 from Yahoo. Amazon and Yelp have similar semantics and the classes have been merged. + The dataset has 33 classes: 4 from AGNews, 5 from Yelp, 14 from DBPedia, and 10 from Yahoo. The maximum allowed size for the training set is 115000 and for the test set is 7600. Each domain will have the same fixed size. @@ -197,7 +196,6 @@ def __init__( self._multi_dataset_info = { "ag_news": ["text", "label"], "yelp_review_full": ["text", "label"], - "amazon_reviews_multi": ["review_body", "stars"], "dbpedia_14": ["content", "label"], "yahoo_answers_topics": ["question_title", "topic"], } @@ -206,11 +204,11 @@ def __init__( "ag_news1": 1, "ag_news3": 2, "ag_news2": 3, - "amazon_reviews_multi1": 4, - "amazon_reviews_multi2": 5, - "amazon_reviews_multi3": 6, - "amazon_reviews_multi4": 7, - "amazon_reviews_multi5": 8, + "yelp_review_full0": 4, + "yelp_review_full1": 5, + "yelp_review_full2": 6, + "yelp_review_full3": 7, + "yelp_review_full4": 8, "dbpedia_140": 9, "dbpedia_141": 10, "dbpedia_142": 11, @@ -235,12 +233,6 @@ def __init__( "yahoo_answers_topics7": 30, "yahoo_answers_topics8": 31, "yahoo_answers_topics9": 32, - # yelp gets the same label ids as Amazon reviews - "yelp_review_full0": 4, - "yelp_review_full1": 5, - "yelp_review_full2": 6, - "yelp_review_full3": 7, - "yelp_review_full4": 8, } if domain not in self._multi_dataset_info.keys(): @@ -256,10 +248,7 @@ def prepare_data(self) -> None: """Download dataset.""" for split in ["train", "test"] + (["validation"] if self._val_size > 0 else []): - if "amazon" in self.domain: - load_dataset(self.domain, name="en", split=split, cache_dir=self._data_path) - else: - load_dataset(self.domain, split=split, cache_dir=self._data_path) + load_dataset(self.domain, split=split, cache_dir=self._data_path) def setup(self) -> None: """Set up train, test and val datasets.""" diff --git a/test/renate/benchmark/datasets/test_multi_data_nlp.py b/test/renate/benchmark/datasets/test_multi_data_nlp.py index f69040be..58677c73 100644 --- a/test/renate/benchmark/datasets/test_multi_data_nlp.py +++ b/test/renate/benchmark/datasets/test_multi_data_nlp.py @@ -15,7 +15,7 @@ def test_multi_data_nlp_small(tmpdir): tmpdir, train_size=TRAIN_SIZE, test_size=TEST_SIZE, - domain="amazon_reviews_multi", + domain="ag_news", tokenizer=transformers.DistilBertTokenizer.from_pretrained("distilbert-base-uncased"), ) @@ -43,7 +43,6 @@ def test_multi_data_nlp_full(tmpdir): for d in [ "ag_news", "yelp_review_full", - "amazon_reviews_multi", "dbpedia_14", "yahoo_answers_topics", ]: From ca1c526f0781849661eb30a0e05249fe4e8c772d Mon Sep 17 00:00:00 2001 From: Giovanni Zappella Date: Tue, 1 Aug 2023 17:53:10 +0200 Subject: [PATCH 31/37] fix order agnews labels --- src/renate/benchmark/datasets/nlp_datasets.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/renate/benchmark/datasets/nlp_datasets.py b/src/renate/benchmark/datasets/nlp_datasets.py index 9ee04fd4..e8f7be7b 100644 --- a/src/renate/benchmark/datasets/nlp_datasets.py +++ b/src/renate/benchmark/datasets/nlp_datasets.py @@ -202,8 +202,8 @@ def __init__( self._labels_map = { "ag_news0": 0, "ag_news1": 1, - "ag_news3": 2, - "ag_news2": 3, + "ag_news2": 2, + "ag_news3": 3, "yelp_review_full0": 4, "yelp_review_full1": 5, "yelp_review_full2": 6, From 9f11751f8a8d44b5bbe126ce5a038d5f6bf7ff84 Mon Sep 17 00:00:00 2001 From: Giovanni Zappella Date: Tue, 1 Aug 2023 18:57:59 +0200 Subject: [PATCH 32/37] fix test skip 
reason --- test/renate/benchmark/datasets/test_multi_data_nlp.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/renate/benchmark/datasets/test_multi_data_nlp.py b/test/renate/benchmark/datasets/test_multi_data_nlp.py index 58677c73..ffea44b1 100644 --- a/test/renate/benchmark/datasets/test_multi_data_nlp.py +++ b/test/renate/benchmark/datasets/test_multi_data_nlp.py @@ -35,7 +35,7 @@ def test_multi_data_nlp_small(tmpdir): assert len(te_data_dbpedia) == TEST_SIZE -@pytest.mark.skip(reason="This test requires downloading and processing five datasets.") +@pytest.mark.skip(reason="This test requires downloading and processing four datasets.") def test_multi_data_nlp_full(tmpdir): TRAIN_SIZE = 115000 TEST_SIZE = 7600 From 56bb55a30417d4ddd1bf4c7f7b608ecff8991119 Mon Sep 17 00:00:00 2001 From: Giovanni Zappella Date: Thu, 3 Aug 2023 14:00:05 +0200 Subject: [PATCH 33/37] improve tests, adapt to data incremental module --- src/renate/benchmark/datasets/nlp_datasets.py | 124 +++++++++--------- src/renate/defaults.py | 4 +- .../benchmark/datasets/test_multi_data_nlp.py | 30 +++-- 3 files changed, 79 insertions(+), 79 deletions(-) diff --git a/src/renate/benchmark/datasets/nlp_datasets.py b/src/renate/benchmark/datasets/nlp_datasets.py index e8f7be7b..08d46b34 100644 --- a/src/renate/benchmark/datasets/nlp_datasets.py +++ b/src/renate/benchmark/datasets/nlp_datasets.py @@ -10,6 +10,7 @@ from datasets import load_dataset from renate import defaults +from renate.benchmark.datasets.base import DataIncrementalDataModule from renate.data.data_module import RenateDataModule @@ -138,7 +139,7 @@ def tokenize_fn(batch): self._train_data, self._val_data = self._split_train_val_data(self._train_data) -class MultiTextDataModule(RenateDataModule): +class MultiTextDataModule(DataIncrementalDataModule): """ Inspired by the dataset used in "Episodic Memory in Lifelong Language Learning" by d’Autume et al. this is a collection of five different datasets that we call domains: @@ -159,29 +160,69 @@ class MultiTextDataModule(RenateDataModule): See https://huggingface.co/docs/tokenizers/ for more information on tokenizers. If `None` is passed, this defaults to `{"padding": "max_length", max_length: 128, truncation: True}`. - domain: The dataset to be used + data_id: The dataset to be used train_size: The size of the data stored as training set, must be smaller than 115000. test_size: The size of the data stored as test set, must be smaller than 7600. val_size: Fraction of the training data to be used for validation. seed: Seed used to fix random number generation. 
""" + _multi_dataset_info = { + "ag_news": ["text", "label"], + "yelp_review_full": ["text", "label"], + "dbpedia_14": ["content", "label"], + "yahoo_answers_topics": ["question_title", "topic"], + } + _labels_map = { + "ag_news0": 0, + "ag_news1": 1, + "ag_news2": 2, + "ag_news3": 3, + "yelp_review_full0": 4, + "yelp_review_full1": 5, + "yelp_review_full2": 6, + "yelp_review_full3": 7, + "yelp_review_full4": 8, + "dbpedia_140": 9, + "dbpedia_141": 10, + "dbpedia_142": 11, + "dbpedia_143": 12, + "dbpedia_144": 13, + "dbpedia_145": 14, + "dbpedia_146": 15, + "dbpedia_147": 16, + "dbpedia_148": 17, + "dbpedia_149": 18, + "dbpedia_1410": 19, + "dbpedia_1411": 20, + "dbpedia_1412": 21, + "dbpedia_1413": 22, + "yahoo_answers_topics0": 23, + "yahoo_answers_topics1": 24, + "yahoo_answers_topics2": 25, + "yahoo_answers_topics3": 26, + "yahoo_answers_topics4": 27, + "yahoo_answers_topics5": 28, + "yahoo_answers_topics6": 29, + "yahoo_answers_topics7": 30, + "yahoo_answers_topics8": 31, + "yahoo_answers_topics9": 32, + } + + domains = _multi_dataset_info.keys() + def __init__( self, data_path: str, tokenizer: transformers.PreTrainedTokenizer, - domain: str, + data_id: str, tokenizer_kwargs: Optional[Dict[str, Any]] = None, - train_size: int = defaults.TRAIN_SET_SIZE, - test_size: int = defaults.TEST_SET_SIZE, + train_size: int = defaults.SMALL_TRAIN_SET_SIZE, + test_size: int = defaults.SMALL_TEST_SET_SIZE, val_size: float = defaults.VALIDATION_SIZE, seed: int = defaults.SEED, ): - super().__init__( - data_path=data_path, - val_size=val_size, - seed=seed, - ) + super().__init__(data_path=data_path, data_id=data_id, val_size=val_size, seed=seed) if train_size > 115000: raise ValueError("The `train_size` must be smaller than 115000") @@ -193,62 +234,19 @@ def __init__( self._tokenizer = tokenizer self._tokenizer_kwargs = tokenizer_kwargs or defaults.TOKENIZER_KWARGS - self._multi_dataset_info = { - "ag_news": ["text", "label"], - "yelp_review_full": ["text", "label"], - "dbpedia_14": ["content", "label"], - "yahoo_answers_topics": ["question_title", "topic"], - } - self._labels_map = { - "ag_news0": 0, - "ag_news1": 1, - "ag_news2": 2, - "ag_news3": 3, - "yelp_review_full0": 4, - "yelp_review_full1": 5, - "yelp_review_full2": 6, - "yelp_review_full3": 7, - "yelp_review_full4": 8, - "dbpedia_140": 9, - "dbpedia_141": 10, - "dbpedia_142": 11, - "dbpedia_143": 12, - "dbpedia_144": 13, - "dbpedia_145": 14, - "dbpedia_146": 15, - "dbpedia_147": 16, - "dbpedia_148": 17, - "dbpedia_149": 18, - "dbpedia_1410": 19, - "dbpedia_1411": 20, - "dbpedia_1412": 21, - "dbpedia_1413": 22, - "yahoo_answers_topics0": 23, - "yahoo_answers_topics1": 24, - "yahoo_answers_topics2": 25, - "yahoo_answers_topics3": 26, - "yahoo_answers_topics4": 27, - "yahoo_answers_topics5": 28, - "yahoo_answers_topics6": 29, - "yahoo_answers_topics7": 30, - "yahoo_answers_topics8": 31, - "yahoo_answers_topics9": 32, - } - - if domain not in self._multi_dataset_info.keys(): + + if data_id not in self.domains: raise ValueError( - f"The selected domain is not available. Select one among " - f"{self._multi_dataset_info.keys()}" + f"The selected domain is not available. 
Select one among " f"{self.domains}" ) - self.domain = domain - self.available_domains = self._multi_dataset_info.keys() + self.data_id = data_id def prepare_data(self) -> None: """Download dataset.""" for split in ["train", "test"] + (["validation"] if self._val_size > 0 else []): - load_dataset(self.domain, split=split, cache_dir=self._data_path) + load_dataset(self.data_id, split=split, cache_dir=self._data_path) def setup(self) -> None: """Set up train, test and val datasets.""" @@ -256,16 +254,16 @@ def setup(self) -> None: def preprocess(example, text_field_name, label_field_name): return { **self._tokenizer(example[text_field_name], **self._tokenizer_kwargs), - "label": self._labels_map[f"{self.domain}{example[label_field_name]}"], + "label": self._labels_map[f"{self.data_id}{example[label_field_name]}"], } def get_split(split_name): - dataset = load_dataset(self.domain, split=split_name, cache_dir=self._data_path) + dataset = load_dataset(self.data_id, split=split_name, cache_dir=self._data_path) new_features = dataset.features.copy() # the following is hack needed because the output space of the new dataset is # the union of the output spaces of the single datasets - new_features[self._multi_dataset_info[self.domain][1]] = datasets.ClassLabel( + new_features[self._multi_dataset_info[self.data_id][1]] = datasets.ClassLabel( num_classes=33 ) @@ -282,8 +280,8 @@ def get_split(split_name): dataset = dataset.map( functools.partial( preprocess, - text_field_name=self._multi_dataset_info[self.domain][0], - label_field_name=self._multi_dataset_info[self.domain][1], + text_field_name=self._multi_dataset_info[self.data_id][0], + label_field_name=self._multi_dataset_info[self.data_id][1], ), remove_columns=list(dataset.features), num_proc=4, diff --git a/src/renate/defaults.py b/src/renate/defaults.py index fb6566d3..7b31583d 100644 --- a/src/renate/defaults.py +++ b/src/renate/defaults.py @@ -104,8 +104,8 @@ # Benchmark datasets/models TOKENIZER_KWARGS = {"padding": "max_length", "max_length": 128, "truncation": True} -TRAIN_SET_SIZE = 1000 -TEST_SET_SIZE = 1000 +SMALL_TRAIN_SET_SIZE = 1000 +SMALL_TEST_SET_SIZE = 1000 def scheduler(config_space: Dict[str, Any], mode: str, metric: str): diff --git a/test/renate/benchmark/datasets/test_multi_data_nlp.py b/test/renate/benchmark/datasets/test_multi_data_nlp.py index ffea44b1..f91765c2 100644 --- a/test/renate/benchmark/datasets/test_multi_data_nlp.py +++ b/test/renate/benchmark/datasets/test_multi_data_nlp.py @@ -1,6 +1,7 @@ # Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
# SPDX-License-Identifier: Apache-2.0 import pytest +import torch import transformers as transformers from renate.benchmark.datasets.nlp_datasets import MultiTextDataModule @@ -15,8 +16,9 @@ def test_multi_data_nlp_small(tmpdir): tmpdir, train_size=TRAIN_SIZE, test_size=TEST_SIZE, - domain="ag_news", + data_id="ag_news", tokenizer=transformers.DistilBertTokenizer.from_pretrained("distilbert-base-uncased"), + seed=42, ) data.prepare_data() @@ -25,8 +27,9 @@ def test_multi_data_nlp_small(tmpdir): assert len(data.train_data()) == TRAIN_SIZE assert len(data.test_data()) == TEST_SIZE - data.domain = "dbpedia_14" - data.prepare_data() + first_input_agnews = data.train_data()[0][0]["input_ids"] + + data.data_id = "dbpedia_14" data.setup() tr_data_dbpedia = data.train_data() @@ -34,30 +37,29 @@ def test_multi_data_nlp_small(tmpdir): assert len(tr_data_dbpedia) == TRAIN_SIZE assert len(te_data_dbpedia) == TEST_SIZE + first_input_dbpedia = data.train_data()[0][0]["input_ids"] + + assert not torch.all(torch.eq(first_input_dbpedia, first_input_agnews)) + @pytest.mark.skip(reason="This test requires downloading and processing four datasets.") def test_multi_data_nlp_full(tmpdir): TRAIN_SIZE = 115000 TEST_SIZE = 7600 - for d in [ - "ag_news", - "yelp_review_full", - "dbpedia_14", - "yahoo_answers_topics", - ]: + for d in MultiTextDataModule.domains: data = MultiTextDataModule( tmpdir, train_size=TRAIN_SIZE, test_size=TEST_SIZE, - domain=d, + data_id=d, tokenizer=transformers.DistilBertTokenizer.from_pretrained("distilbert-base-uncased"), ) data.prepare_data() data.setup() - tr_data_agnews = data.train_data() - te_data_agnews = data.test_data() - assert len(tr_data_agnews) == TRAIN_SIZE - assert len(te_data_agnews) == TEST_SIZE + tr_data = data.train_data() + te_data = data.test_data() + assert len(tr_data) == TRAIN_SIZE + assert len(te_data) == TEST_SIZE From 9c26c1aff2a87730de26e3e4686be2bbe956dc7f Mon Sep 17 00:00:00 2001 From: Giovanni Zappella Date: Thu, 3 Aug 2023 15:13:13 +0200 Subject: [PATCH 34/37] add seed randint --- src/renate/benchmark/datasets/nlp_datasets.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/renate/benchmark/datasets/nlp_datasets.py b/src/renate/benchmark/datasets/nlp_datasets.py index 3dd4f3ef..b1cd4230 100644 --- a/src/renate/benchmark/datasets/nlp_datasets.py +++ b/src/renate/benchmark/datasets/nlp_datasets.py @@ -241,6 +241,7 @@ def __init__( ) self.data_id = data_id + self._rnd_gen = torch.Generator().manual_seed(self._seed) def prepare_data(self) -> None: """Download dataset.""" @@ -273,7 +274,9 @@ def get_split(split_name): else: set_size = self._test_size - rnd_idx = torch.randint(low=0, high=len(dataset), size=(set_size,)).tolist() + rnd_idx = torch.randint( + low=0, high=len(dataset), size=(set_size,), generator=self._rnd_gen + ).tolist() dataset = dataset.select(indices=rnd_idx) dataset = dataset.map( From c82285e9a1765933017370dac58f544a07fa8673 Mon Sep 17 00:00:00 2001 From: Giovanni Zappella Date: Thu, 3 Aug 2023 15:14:24 +0200 Subject: [PATCH 35/37] fix exception message --- src/renate/benchmark/datasets/nlp_datasets.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/renate/benchmark/datasets/nlp_datasets.py b/src/renate/benchmark/datasets/nlp_datasets.py index b1cd4230..a721accf 100644 --- a/src/renate/benchmark/datasets/nlp_datasets.py +++ b/src/renate/benchmark/datasets/nlp_datasets.py @@ -225,7 +225,7 @@ def __init__( super().__init__(data_path=data_path, data_id=data_id, val_size=val_size, seed=seed) if 
train_size > 115000: - raise ValueError("The `train_size` must be smaller than 115000") + raise ValueError("The `train_size` must be smaller than or equal to 115000") self._train_size = train_size if test_size > 7600: From 3b009a7b9c82d6f185b0cc82772cb4b77cb4ff8a Mon Sep 17 00:00:00 2001 From: Giovanni Zappella Date: Thu, 3 Aug 2023 15:19:31 +0200 Subject: [PATCH 36/37] avoid copying metadata to change num classes --- src/renate/benchmark/datasets/nlp_datasets.py | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/src/renate/benchmark/datasets/nlp_datasets.py b/src/renate/benchmark/datasets/nlp_datasets.py index a721accf..82c05474 100644 --- a/src/renate/benchmark/datasets/nlp_datasets.py +++ b/src/renate/benchmark/datasets/nlp_datasets.py @@ -260,16 +260,13 @@ def preprocess(example, text_field_name, label_field_name): def get_split(split_name): dataset = load_dataset(self.data_id, split=split_name, cache_dir=self._data_path) - new_features = dataset.features.copy() # the following is hack needed because the output space of the new dataset is # the union of the output spaces of the single datasets - new_features[self._multi_dataset_info[self.data_id][1]] = datasets.ClassLabel( + dataset.features[self._multi_dataset_info[self.data_id][1]] = datasets.ClassLabel( num_classes=33 ) - dataset = dataset.cast(new_features) - - if "train" in split_name: + if "train" == split_name: set_size = self._train_size else: set_size = self._test_size From 1ad978ec0759696ea374b28e4d6a6fa77b91e5dc Mon Sep 17 00:00:00 2001 From: Giovanni Zappella Date: Thu, 3 Aug 2023 15:57:16 +0200 Subject: [PATCH 37/37] fix generator and features --- src/renate/benchmark/datasets/nlp_datasets.py | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/src/renate/benchmark/datasets/nlp_datasets.py b/src/renate/benchmark/datasets/nlp_datasets.py index 82c05474..5abe1fc8 100644 --- a/src/renate/benchmark/datasets/nlp_datasets.py +++ b/src/renate/benchmark/datasets/nlp_datasets.py @@ -241,7 +241,6 @@ def __init__( ) self.data_id = data_id - self._rnd_gen = torch.Generator().manual_seed(self._seed) def prepare_data(self) -> None: """Download dataset.""" @@ -252,6 +251,8 @@ def prepare_data(self) -> None: def setup(self) -> None: """Set up train, test and val datasets.""" + rnd_gen = torch.Generator().manual_seed(self._seed) + def preprocess(example, text_field_name, label_field_name): return { **self._tokenizer(example[text_field_name], **self._tokenizer_kwargs), @@ -262,17 +263,25 @@ def get_split(split_name): dataset = load_dataset(self.data_id, split=split_name, cache_dir=self._data_path) # the following is hack needed because the output space of the new dataset is # the union of the output spaces of the single datasets - dataset.features[self._multi_dataset_info[self.data_id][1]] = datasets.ClassLabel( + # HF datasets check for the max label id and we need to make sure we update that + # without this change the setup will fail with a value error (label id > max labels) + new_features = dataset.features.copy() + new_features[self._multi_dataset_info[self.data_id][1]] = datasets.ClassLabel( num_classes=33 ) + dataset = dataset.cast(new_features) + if "train" == split_name: set_size = self._train_size else: set_size = self._test_size rnd_idx = torch.randint( - low=0, high=len(dataset), size=(set_size,), generator=self._rnd_gen + low=0, + high=len(dataset), + size=(set_size,), + generator=rnd_gen, ).tolist() dataset = dataset.select(indices=rnd_idx)
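
A minimal usage sketch of the MultiTextDataModule API introduced by the patches above. Only the constructor arguments, the `domains` class attribute, and the `prepare_data()`/`setup()`/`train_data()`/`test_data()` accessors are taken from these diffs; the DistilBERT tokenizer, the local data path, the batch size, and the DataLoader loop are illustrative assumptions and are not part of the patch series.

import torch
import transformers

from renate.benchmark.datasets.nlp_datasets import MultiTextDataModule

# Assumed tokenizer choice; the tests in this series use the same one.
tokenizer = transformers.DistilBertTokenizer.from_pretrained("distilbert-base-uncased")

for domain in MultiTextDataModule.domains:
    # One data module per domain; all domains share the joint 33-class label space.
    data = MultiTextDataModule(
        "./multi_text_data/",  # assumed local cache directory
        tokenizer=tokenizer,
        data_id=domain,
        train_size=1000,   # must not exceed 115000
        test_size=1000,    # must not exceed 7600
        seed=42,
    )
    data.prepare_data()  # downloads the Hugging Face dataset for this domain
    data.setup()         # subsamples and tokenizes the train/test splits

    # Assumes the wrapped dataset yields (tokenized_inputs, label) pairs, as in the tests,
    # so the default collate function can batch the input dict and the integer labels.
    loader = torch.utils.data.DataLoader(data.train_data(), batch_size=32, shuffle=True)
    inputs, labels = next(iter(loader))
    print(domain, inputs["input_ids"].shape, labels.shape)

Because the module is data-incremental, an experiment that consumes one domain at a time can instead keep a single instance and switch domains by assigning `data.data_id = ...` and calling `setup()` again, as the small test added in this series does.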