diff --git a/CHANGELOG.md b/CHANGELOG.md
index 87c5d6f9cb8..46dca6f253c 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -8,7 +8,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 ## Unreleased
 
 ### Added
-
+- Added `HuggingfaceDatasetSplitReader` for using huggingface datasets in AllenNLP, with limited feature support.
 - Ported the following Huggingface `LambdaLR`-based schedulers: `ConstantLearningRateScheduler`, `ConstantWithWarmupLearningRateScheduler`, `CosineWithWarmupLearningRateScheduler`, `CosineHardRestartsWithWarmupLearningRateScheduler`.
 
 ### Changed
@@ -264,7 +264,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 - Added sampler class and parameter in beam search for non-deterministic search, with several implementations, including `MultinomialSampler`, `TopKSampler`, `TopPSampler`, and `GumbelSampler`. Utilizing `GumbelSampler` will give [Stochastic Beam Search](https://api.semanticscholar.org/CorpusID:76662039).
-
+
 ### Changed
 
 - Pass batch metrics to `BatchCallback`.
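For context on the new changelog entry, the sketch below shows how the reader added in this PR is intended to be used. It is an illustration only, not part of the diff, and it assumes the `glue`/`cola` dataset can be downloaded by the `datasets` library.

```python
from allennlp.data.dataset_readers.huggingface_datasets_reader import HuggingfaceDatasetSplitReader

# One reader instance handles one dataset (and config); the argument passed to
# `read()` is interpreted as the split name rather than a file path.
reader = HuggingfaceDatasetSplitReader(dataset_name="glue", config_name="cola")
for instance in reader.read("train"):
    print(instance)
    break
```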
diff --git a/allennlp/data/dataset_readers/huggingface_datasets_reader.py b/allennlp/data/dataset_readers/huggingface_datasets_reader.py
new file mode 100644
index 00000000000..b1cba0a3471
--- /dev/null
+++ b/allennlp/data/dataset_readers/huggingface_datasets_reader.py
@@ -0,0 +1,222 @@
+from typing import Iterable, Optional
+
+from allennlp.data import DatasetReader, Token, Field
+from allennlp.data.fields import TextField, LabelField, ListField
+from allennlp.data.instance import Instance
+from datasets import load_dataset, Dataset, DatasetDict
+from datasets.features import ClassLabel, Sequence, Translation, TranslationVariableLanguages
+from datasets.features import Value
+
+
+# TODO pab complete the documentation comments
+class HuggingfaceDatasetSplitReader(DatasetReader):
+    """
+    This reader implementation wraps the huggingface `datasets` package to utilize its dataset
+    management functionality and load the information in AllenNLP-friendly formats.
+    Note: the reader works with only one split of the dataset at a time,
+    i.e. you would need to create a separate reader for each split.
+
+    The following datasets and configurations have been verified to work with this reader:
+
+    Dataset                     Dataset Configuration
+    `xnli`                      `ar`
+    `xnli`                      `en`
+    `xnli`                      `de`
+    `xnli`                      `all_languages`
+    `glue`                      `cola`
+    `glue`                      `mrpc`
+    `glue`                      `sst2`
+    `glue`                      `qqp`
+    `glue`                      `mnli`
+    `glue`                      `mnli_matched`
+    `universal_dependencies`    `en_lines`
+    `universal_dependencies`    `ko_kaist`
+    `universal_dependencies`    `af_afribooms`
+    `afrikaans_ner_corpus`      `NA`
+    `swahili`                   `NA`
+    `conll2003`                 `NA`
+    `dbpedia_14`                `NA`
+    `trec`                      `NA`
+    `emotion`                   `NA`
+
+    # Parameters
+
+    dataset_name : `str`
+        Name of the dataset to load from the huggingface datasets hub.
+    config_name : `str`, optional (default=`None`)
+        Configuration of the dataset, if the dataset requires one.
+    pre_load : `bool`, optional (default=`False`)
+        If `True`, all splits of the dataset are loaded eagerly at construction time.
+    """
+
+    def __init__(
+        self,
+        max_instances: Optional[int] = None,
+        manual_distributed_sharding: bool = False,
+        manual_multiprocess_sharding: bool = False,
+        serialization_dir: Optional[str] = None,
+        dataset_name: Optional[str] = None,
+        config_name: Optional[str] = None,
+        pre_load: bool = False,
+    ) -> None:
+        super().__init__(
+            max_instances,
+            manual_distributed_sharding,
+            manual_multiprocess_sharding,
+            serialization_dir,
+        )
+
+        # It would be cleaner to create a separate reader object for each dataset
+        self.dataset: Dataset = None
+        self.datasets: DatasetDict = DatasetDict()
+        self.dataset_name = dataset_name
+        self.config_name = config_name
+        self.index = -1
+
+        if pre_load:
+            self.load_dataset()
+
+    def load_dataset(self):
+        if self.config_name is not None:
+            self.datasets = load_dataset(self.dataset_name, self.config_name)
+        else:
+            self.datasets = load_dataset(self.dataset_name)
+
+    def load_dataset_split(self, split):
+        if self.config_name is not None:
+            self.datasets[split] = load_dataset(self.dataset_name, self.config_name, split=split)
+        else:
+            self.datasets[split] = load_dataset(self.dataset_name, split=split)
+
+    def _read(self, file_path) -> Iterable[Instance]:
+        """
+        Reads the dataset and converts each entry into an AllenNLP-friendly instance.
+        For this reader, `file_path` is interpreted as the name of the split to read.
+        """
+        if file_path not in self.datasets:
+            self.load_dataset_split(file_path)
+
+        if self.datasets is not None and self.datasets[file_path] is not None:
+            # Keep a handle on the split being read so that `text_to_instance`
+            # can look up its feature types.
+            self.dataset = self.datasets[file_path]
+            for entry in self.datasets[file_path]:
+                yield self.text_to_instance(entry)
+
+    def raise_feature_not_supported_value_error(self, value):
+        raise ValueError(f"Datasets feature type {type(value)} is not supported yet.")
+
+    def text_to_instance(self, *inputs) -> Instance:
+        """
+        Takes care of converting a dataset entry into an AllenNLP-friendly instance.
+
+        Currently this is implemented in a somewhat ad hoc fashion: it only converts the
+        `datasets.features` types that are required for the supported datasets listed above.
+        Ideally we would first decide on a clean mapping from each `datasets` feature type to an
+        `allennlp.data.fields` type and then implement the conversion from that mapping, which
+        would give the best chance of covering as many datasets as possible.
+
+        Currently `datasets.features` types are mapped to AllenNLP fields as follows:
+
+        dataset.feature type            allennlp.data.fields
+        `ClassLabel`                    `LabelField` in the feature-name namespace
+        `Value.string`                  `TextField` with the value as a single `Token`
+        `Value.*`                       `LabelField` with the value as label, in the feature-name namespace
+        `Sequence.string`               `ListField` of `TextField`s, each string as a single token
+        `Sequence.ClassLabel`           `ListField` of `LabelField`s in the feature-name namespace
+        `Translation`                   `ListField` of 2 `ListField`s (`LabelField`s and `TextField`s)
+        `TranslationVariableLanguages`  `ListField` of 2 `ListField`s (`LabelField`s and `TextField`s)
+        """
+
+        # Features indicate the different pieces of information available in each entry of the
+        # dataset, and the feature types describe what kind of information each of them holds.
+        # E.g. in a sentiment dataset an entry could have one feature (of type string) holding
+        # the text and another (of type int32/ClassLabel) holding the sentiment.
+        fields = dict()
+
+        if self.dataset is not None and self.dataset.features is not None:
+            features = self.dataset.features
+
+            # TODO we need to support all the different datasets features described
+            # in https://huggingface.co/docs/datasets/features.html
+            for feature in features:
+                field: Field
+                item_field: Field
+                field_list: list
+                value = features[feature]
+
+                # datasets ClassLabel maps to LabelField
+                if isinstance(value, ClassLabel):
+                    field = LabelField(inputs[0][feature], label_namespace=feature, skip_indexing=True)
+
+                # datasets Value can be of different types
+                elif isinstance(value, Value):
+
+                    # String value maps to TextField
+                    if value.dtype == "string":
+                        # Since a TextField has to be made of Tokens, add the whole text as a single token
+                        # TODO Should we use simple heuristics to identify what is a token and what is not?
+                        field = TextField([Token(inputs[0][feature])])
+
+                    else:
+                        field = LabelField(
+                            inputs[0][feature], label_namespace=feature, skip_indexing=True
+                        )
+
+                elif isinstance(value, Sequence):
+                    # datasets Sequence of strings maps to a ListField of TextField
+                    if value.feature.dtype == "string":
+                        field_list = list()
+                        for item in inputs[0][feature]:
+                            item_field = TextField([Token(item)])
+                            field_list.append(item_field)
+                        if len(field_list) == 0:
+                            continue
+                        field = ListField(field_list)
+
+                    # datasets Sequence of ClassLabel maps to a ListField of LabelField
+                    elif isinstance(value.feature, ClassLabel):
+                        field_list = list()
+                        for item in inputs[0][feature]:
+                            item_field = LabelField(
+                                label=item, label_namespace=feature, skip_indexing=True
+                            )
+                            field_list.append(item_field)
+                        if len(field_list) == 0:
+                            continue
+                        field = ListField(field_list)
+
+                    else:
+                        self.raise_feature_not_supported_value_error(value)
+
+                # datasets.Translation cannot be mapped directly,
+                # but its dict structure can be mapped to a ListField of 2 ListFields
+                elif isinstance(value, Translation):
+                    if value.dtype == "dict":
+                        input_dict = inputs[0][feature]
+                        langs = list(input_dict.keys())
+                        field_langs = [LabelField(lang, label_namespace="languages") for lang in langs]
+                        langs_field = ListField(field_langs)
+                        texts = list()
+                        for lang in langs:
+                            texts.append(TextField([Token(input_dict[lang])]))
+                        field = ListField([langs_field, ListField(texts)])
+
+                    else:
+                        self.raise_feature_not_supported_value_error(value)
+
+                # datasets.TranslationVariableLanguages is functionally a pair of lists
+                # and hence maps to a ListField of 2 ListFields
+                elif isinstance(value, TranslationVariableLanguages):
+                    if value.dtype == "dict":
+                        input_dict = inputs[0][feature]
+                        langs = input_dict["language"]
+                        field_langs = [LabelField(lang, label_namespace="languages") for lang in langs]
+                        langs_field = ListField(field_langs)
+                        texts = list()
+                        for lang in langs:
+                            index = langs.index(lang)
+                            texts.append(TextField([Token(input_dict["translation"][index])]))
+                        field = ListField([langs_field, ListField(texts)])
+
+                    else:
+                        self.raise_feature_not_supported_value_error(value)
+
+                else:
+                    self.raise_feature_not_supported_value_error(value)
+
+                if field:
+                    fields[feature] = field
+
+        return Instance(fields)
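To make the mapping table above more concrete, this is roughly what the underlying `datasets` features look like for one of the verified datasets, and the fields an entry would end up with. This is an illustrative sketch only; the exact `repr` of the feature objects can differ between `datasets` versions.

```python
from datasets import load_dataset

cola_train = load_dataset("glue", "cola", split="train")
print(cola_train.features)
# Roughly: {'sentence': Value(dtype='string'), 'label': ClassLabel(num_classes=2, ...), 'idx': Value(dtype='int32')}
#
# Under the mapping above, each entry would therefore be converted to an Instance with:
#   "sentence" -> TextField([Token(entry["sentence"])])
#   "label"    -> LabelField(entry["label"], label_namespace="label", skip_indexing=True)
#   "idx"      -> LabelField(entry["idx"], label_namespace="idx", skip_indexing=True)
```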
diff --git a/setup.py b/setup.py
index 40655093c83..2e764348398 100644
--- a/setup.py
+++ b/setup.py
@@ -73,6 +73,7 @@
         "lmdb",
         "more-itertools",
         "wandb>=0.10.0,<0.11.0",
+        "datasets>=1.5.0,<1.6.0",
     ],
     entry_points={"console_scripts": ["allennlp=allennlp.__main__:run"]},
     include_package_data=True,
diff --git a/tests/data/dataset_readers/huggingface_datasets_test.py b/tests/data/dataset_readers/huggingface_datasets_test.py
new file mode 100644
index 00000000000..688dc2d110a
--- /dev/null
+++ b/tests/data/dataset_readers/huggingface_datasets_test.py
@@ -0,0 +1,21 @@
+import logging
+
+import pytest
+
+from allennlp.data.dataset_readers.huggingface_datasets_reader import HuggingfaceDatasetSplitReader
+
+logger = logging.getLogger(__name__)
+
+
+# TODO these UTs are actually downloading the datasets and will be very slow
+# TODO add a UT where we compare the huggingface wrapped reader with an explicitly coded builder
+class HuggingfaceDatasetSplitReaderTest:
+    """
+    Runs the tests for supported datasets which require a config name to be specified.
+    """
+    @pytest.mark.parametrize("dataset, config, split", (("glue", "cola", "train"), ("glue", "cola", "test")))
+    def test_read_for_datasets_requiring_config(self, dataset, config, split):
+        huggingface_reader = HuggingfaceDatasetSplitReader(dataset_name=dataset, config_name=config)
+        instances = list(huggingface_reader.read(split))
+        assert len(instances) == len(huggingface_reader.datasets[split])
+        print(instances[0], huggingface_reader.datasets[split][0])
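The test above only exercises datasets that require a config name. A possible follow-up test, sketched below and not part of this PR, would cover the no-config path using `trec` and `emotion` from the verified list, assuming those datasets download successfully.

```python
import pytest

from allennlp.data.dataset_readers.huggingface_datasets_reader import HuggingfaceDatasetSplitReader


# Sketch of a follow-up test for datasets from the verified list that need no config name.
@pytest.mark.parametrize("dataset, split", (("trec", "train"), ("emotion", "train")))
def test_read_for_datasets_not_requiring_config(dataset, split):
    huggingface_reader = HuggingfaceDatasetSplitReader(dataset_name=dataset)
    instances = list(huggingface_reader.read(split))
    # Every entry of the split should be converted to exactly one AllenNLP Instance.
    assert len(instances) == len(huggingface_reader.datasets[split])
```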