-
Notifications
You must be signed in to change notification settings - Fork 0
/
pluggable_datasets.py
158 lines (127 loc) · 6.41 KB
/
pluggable_datasets.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
"""
Copyright (C) 2024 Instituto Andaluz Interuniversitario en Ciencia de Datos e Inteligencia Computacional (DaSCI).
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU Affero General Public License as published
by the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU Affero General Public License for more details.
You should have received a copy of the GNU Affero General Public License
along with this program. If not, see <https://www.gnu.org/licenses/>.
"""
from enum import Enum, EnumMeta
class PluggableDataset(EnumMeta):
    """Enum metaclass that answers ``item in SomeEnum`` from ``SomeEnum.members()``.

    Classes using this metaclass are expected to expose a ``members``
    callable that takes no arguments and returns an iterable of the
    accepted values.
    """

    def __contains__(self, item):
        """Return True when *item* is one of the values produced by ``members()``."""
        # ``self`` is the enum class itself here, because this is a metaclass.
        # The iterable is fully materialised before the membership test.
        accepted = list(self.members())
        return item in accepted
class PluggableDatasetString(EnumMeta):
    """Enum metaclass that matches ``item in SomeEnum`` against member values.

    Each member value is expected to be an indexable whose first element
    is the dataset name; only that first element is compared.
    """

    def __contains__(self, item):
        """Return True when *item* equals the first element of any member's value."""
        # ``__members__`` also lists aliases, so every declared name is visited.
        dataset_names = [member.value[0] for member in self.__members__.values()]
        return item in dataset_names
class PluggableTorchtext(Enum, metaclass=PluggableDataset):
    """TorchText datasets that can be plugged into a Dataset without preprocessing.

    Any other dataset from the TorchText library will need further
    preprocessing.

    The enum declares no members of its own. The accepted dataset names are
    produced by ``members()``, which the ``PluggableDataset`` metaclass
    consumes to answer ``name in PluggableTorchtext``.
    """

    # Static because the function takes no arguments and is called on the
    # class (``cls.members()``) by the metaclass; as a descriptor it is not
    # turned into an enum member.
    @staticmethod
    def members():
        """Yield the ``__name__`` of every supported TorchText dataset.

        ``torchtext`` is imported inside the generator, so the import only
        happens once the generator is first iterated.

        Yields:
        -------
            str: The ``__name__`` of each supported ``torchtext.datasets`` entry.
        """
        from torchtext import datasets

        yield datasets.AG_NEWS.__name__
        yield datasets.AmazonReviewFull.__name__
        yield datasets.AmazonReviewPolarity.__name__
        yield datasets.DBpedia.__name__
        yield datasets.YahooAnswers.__name__
        yield datasets.YelpReviewFull.__name__
        yield datasets.YelpReviewPolarity.__name__
class PluggableTorchvision(Enum, metaclass=PluggableDataset):
    """Torchvision datasets that can be plugged into a Dataset without preprocessing.

    Any other dataset from the Torchvision library will need further
    preprocessing.

    The enum declares no members of its own. The accepted dataset names are
    produced by ``members()``, which the ``PluggableDataset`` metaclass
    consumes to answer ``name in PluggableTorchvision``.
    """

    # Static because the function takes no arguments and is called on the
    # class (``cls.members()``) by the metaclass; as a descriptor it is not
    # turned into an enum member.
    @staticmethod
    def members():
        """Yield the ``__name__`` of every supported Torchvision dataset class.

        ``torchvision`` is imported inside the generator, so the import only
        happens once the generator is first iterated.

        Yields:
        -------
            str: The ``__name__`` of each supported ``torchvision.datasets`` entry.
        """
        from torchvision import datasets

        yield datasets.WIDERFace.__name__
        yield datasets.Food101.__name__
        yield datasets.CelebA.__name__
        yield datasets.CLEVRClassification.__name__
        yield datasets.Country211.__name__
        yield datasets.FGVCAircraft.__name__
        yield datasets.GTSRB.__name__
        yield datasets.Kitti.__name__
        yield datasets.Flowers102.__name__
        yield datasets.StanfordCars.__name__
        yield datasets.LFWPeople.__name__
        yield datasets.Caltech256.__name__
        yield datasets.EuroSAT.__name__
        yield datasets.CIFAR10.__name__
        yield datasets.CIFAR100.__name__
        yield datasets.MNIST.__name__
        yield datasets.SUN397.__name__
        yield datasets.SEMEION.__name__
        yield datasets.Omniglot.__name__
        yield datasets.KMNIST.__name__
        yield datasets.FashionMNIST.__name__
        yield datasets.OxfordIIITPet.__name__
        yield datasets.STL10.__name__
        yield datasets.PCAM.__name__
        yield datasets.Caltech101.__name__
        yield datasets.QMNIST.__name__
        yield datasets.SVHN.__name__
        yield datasets.DTD.__name__
        yield datasets.USPS.__name__
        yield datasets.RenderedSST2.__name__
        yield datasets.INaturalist.__name__
        yield datasets.EMNIST.__name__
class PluggableHuggingFace(Enum, metaclass=PluggableDatasetString):
    """Example HuggingFace datasets that can be loaded into FLEXible.

    These are example datasets that can be loaded with
    FedDataDistribution.from_config_with_huggingface_dataset by giving just a
    config and the string associated with the dataset in this Enum. They were
    selected because their loading can be automated as
    dataset = load_dataset(name, split='train').

    Other datasets can be plugged in too, but they require a special
    configuration (e.g. glue-cola). That is a matter of the user passing the
    right arguments to the load_dataset function from huggingface datasets
    rather than a limitation of the platform. Datasets that need extra
    parameters (such as a dataset version) or that have no split must be
    loaded by the user beforehand; the framework supports almost all datasets
    as long as they can be provided as numpy arrays, so the user only needs
    to select X_train/y_train as np.arrays.

    Each member value is a tuple ``(name, X_columns, y_columns)`` to use with
    the load_dataset function. X_columns is either a single column name or a
    list of column names.
    """

    IMDB_HF = ("imdb", "text", "label")
    AG_NEWS_HF = ("ag_news", "text", "label")
    # We support all subsets
    TWEET_EVAL_EMOJI_HF = ("tweet_eval", "text", "label")
    ROTTEN_TOMATOES_HF = ("rotten_tomatoes", "text", "label")
    GLUE_COLA_HF = ("glue", "sentence", "label")
    # We support all subsets
    FINANCIAL_PHRASEBANK_HF = ("financial_phrasebank", "sentence", "label")
    SQUAD_HF = ("squad", ["context", "question"], "answers")
    APPREVIEWS_HF = ("app_reviews", "review", "star")
    AMAZON_POLARITY_HF = ("amazon_polarity", ["title", "content"], "label")
# class PluggableDatasetsTensorFlowText(Enum, metaclass=PluggableDataset):
# """Class containing some datasets that can be loaded to FLEXible. Other datasets
# can be plugged in, but it requires a special configuration, i.e., glue-cola. This
# is more about the user using correctly the arguments on the load_dataset function
# from huggingface datasets than a problem of our platform, so the user can easy-use
# other datasets.
# Args:
# Enum (enum): Tuple containing name, X_columns and y_columns to use in the
# load_dataset function.
# """
# AG_NEWS_TF = ("ag_news_subset", ["title", "description"], ["label"])
# GLUE_TF = ("glue", ["sentence"], ["label"])
# ASSET_TF = ("asset", ["original"], ["simplifications"])
# SQUAD_TF = ("squad", ["title", "question", "context"], ["answers"]) # Won't work
# COQA_TF = ("coqa", ["questions", "source", "story"], ["answers"]) # Won't work