From 977c0b2b0c67075bc96e4c061e71825b7b616b99 Mon Sep 17 00:00:00 2001 From: "Abhishek P (VMware)" Date: Wed, 7 Apr 2021 21:38:27 +0530 Subject: [PATCH] Signed-off-by: Abhishek P (VMware) Converted HFDatasetSplitReader to HFDatasetReader. Now all splits can be used in the same reader. Support for either pre-loading all splits or on-demand loading of a split. Reduced tests to the glue-cola dataset:config, which is a ~0.36MB download. Updated dataset dep to the range >=1.5.0 and <1.6.0. --- allennlp/data/dataset_readers/huggingface_datasets_reader.py | 2 +- tests/data/dataset_readers/huggingface_datasets_test.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/allennlp/data/dataset_readers/huggingface_datasets_reader.py b/allennlp/data/dataset_readers/huggingface_datasets_reader.py index b1cba0a3471..a1d4d5a02da 100644 --- a/allennlp/data/dataset_readers/huggingface_datasets_reader.py +++ b/allennlp/data/dataset_readers/huggingface_datasets_reader.py @@ -8,7 +8,7 @@ from datasets.features import Value # TODO pab complete the documentation comments -class HuggingfaceDatasetSplitReader(DatasetReader): +class HuggingfaceDatasetReader(DatasetReader): """ This reader implementation wraps the huggingface datasets package to utilize it's dataset management functionality and load the information in AllenNLP friendly formats diff --git a/tests/data/dataset_readers/huggingface_datasets_test.py b/tests/data/dataset_readers/huggingface_datasets_test.py index 688dc2d110a..7c47557ceac 100644 --- a/tests/data/dataset_readers/huggingface_datasets_test.py +++ b/tests/data/dataset_readers/huggingface_datasets_test.py @@ -1,6 +1,6 @@ import pytest -from allennlp.data.dataset_readers.huggingface_datasets_reader import HuggingfaceDatasetSplitReader +from allennlp.data.dataset_readers.huggingface_datasets_reader import HuggingfaceDatasetReader import logging logger = logging.getLogger(__name__) @@ -15,7 +15,7 @@ class HuggingfaceDatasetSplitReaderTest: """ 
@pytest.mark.parametrize("dataset, config, split", (("glue", "cola", "train"), ("glue", "cola", "test"))) def test_read_for_datasets_requiring_config(self, dataset, config, split): - huggingface_reader = HuggingfaceDatasetSplitReader(dataset_name=dataset, config_name=config) + huggingface_reader = HuggingfaceDatasetReader(dataset_name=dataset, config_name=config) instances = list(huggingface_reader.read(split)) assert len(instances) == len(huggingface_reader.datasets[split]) print(instances[0], print(huggingface_reader.datasets[split][0]))