-
Notifications
You must be signed in to change notification settings - Fork 0
/
federated_datasets.py
102 lines (87 loc) · 3.69 KB
/
federated_datasets.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
"""
Copyright (C) 2024 Instituto Andaluz Interuniversitario en Ciencia de Datos e Inteligencia Computacional (DaSCI).
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU Affero General Public License as published
by the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU Affero General Public License for more details.
You should have received a copy of the GNU Affero General Public License
along with this program. If not, see <https://www.gnu.org/licenses/>.
"""
import numpy as np
from flex.data import Dataset, FedDataDistribution, FedDatasetConfig
from flex.datasets import standard_datasets
def federated_emnist(out_dir: str = ".", split="digits", return_test=False):
    """Build a federated version of the EMNIST training data.

    The training data is loaded with authors included, so each label is a
    tuple ``(class, writer_id)``; the distribution is configured with
    ``group_by_label_index=1``, i.e. the writer id.

    Args:
        out_dir: Directory handed to ``standard_datasets.emnist``.
        split: EMNIST split name forwarded to ``standard_datasets.emnist``.
        return_test: When truthy, also load and return the test data
            (loaded with ``include_authors=False``).

    Returns:
        The federated training data, or a ``(federated_data, test_data)``
        tuple when ``return_test`` is truthy.
    """
    authored_train, _ = standard_datasets.emnist(
        out_dir, split=split, include_authors=True
    )
    # Index 1 of each (class, writer_id) label is the writer id.
    by_writer = FedDatasetConfig(group_by_label_index=1)
    partitioned = FedDataDistribution.from_config(authored_train, by_writer)

    if not return_test:
        return partitioned

    # The test data is requested separately, without author information.
    _, held_out = standard_datasets.emnist(
        out_dir, split=split, include_authors=False
    )
    return (partitioned, held_out)
def federated_celeba(out_dir: str = ".", return_test=False):
    """Build a federated version of the CelebA training split.

    Targets are requested as ``["identity", "attr"]`` and the distribution
    is configured with ``group_by_label_index=0``, i.e. the identity.
    Images and targets are converted to NumPy arrays by a local transform.

    Args:
        out_dir: Root directory passed to torchvision's ``CelebA``
            (which is created with ``download=True``).
        return_test: When truthy, also load the ``"test"`` split and
            return it as a ``Dataset``.

    Returns:
        The federated training data, or a ``(federated_data, test_data)``
        tuple when ``return_test`` is truthy.
    """
    # Imported lazily so torchvision is only needed when this loader is used.
    from torchvision.datasets import CelebA

    class ToNumpy:
        """Callable transform turning images/targets into NumPy arrays."""

        def __call__(self, data):
            if not isinstance(data, tuple):
                return np.asarray(data)  # Images
            # Labels arrive as a tuple; convert every component.
            return tuple(np.asarray(part) for part in data)

    def _load_split(split_name):
        # Both splits share every argument except the split name.
        return CelebA(
            root=out_dir,
            split=split_name,
            transform=ToNumpy(),
            target_transform=ToNumpy(),
            target_type=["identity", "attr"],
            download=True,
        )

    train_ds = _load_split("train")
    by_identity = FedDatasetConfig(group_by_label_index=0)  # identity
    partitioned = FedDataDistribution.from_config_with_torchvision_dataset(
        train_ds, by_identity
    )

    if not return_test:
        return partitioned

    held_out = Dataset.from_torchvision_dataset(_load_split("test"))
    return (partitioned, held_out)
def federated_sentiment140(out_dir: str = ".", return_test=False, **kwargs):
    """Build a federated version of the Sentiment140 training split.

    The dataset is fetched through Hugging Face ``load_dataset``. The
    ``"text"`` column is used as the feature and ``["user", "sentiment"]``
    as labels; the distribution is configured with
    ``group_by_label_index=0``, i.e. the ``"user"`` label.

    Args:
        out_dir: Accepted for signature parity; not used by this function.
        return_test: When truthy, also convert and return the ``"test"``
            split as a ``Dataset``.
        **kwargs: Accepted but not used by this function.

    Returns:
        The federated training data, or a ``(federated_data, test_data)``
        tuple when ``return_test`` is truthy.
    """
    # Imported lazily so `datasets` is only needed when this loader is used.
    from datasets import load_dataset

    splits = load_dataset("sentiment140")
    feature_columns = ["text"]
    label_columns = ["user", "sentiment"]

    by_user = FedDatasetConfig(group_by_label_index=0)  # Label "user"
    partitioned = FedDataDistribution.from_config_with_huggingface_dataset(
        splits["train"], by_user, feature_columns, label_columns
    )

    if not return_test:
        return partitioned

    held_out = Dataset.from_huggingface_dataset(
        splits["test"], feature_columns, label_columns
    )
    return (partitioned, held_out)
def federated_shakespeare(out_dir: str = ".", return_test=False):
    """Build a federated version of the Shakespeare training data.

    The training data is loaded with actors included, so each label is a
    pair ``(class, actor_id)``; the distribution is configured with
    ``group_by_label_index=1``, i.e. the actor id.

    Args:
        out_dir: Directory handed to ``standard_datasets.shakespeare``.
        return_test: When truthy, also load and return the test data
            (loaded with ``include_actors=False``).

    Returns:
        The federated training data, or a ``(federated_data, test_data)``
        tuple when ``return_test`` is truthy.
    """
    acted_train, _ = standard_datasets.shakespeare(out_dir, include_actors=True)
    # Index 1 of each (class, actor_id) label is the actor id.
    by_actor = FedDatasetConfig(group_by_label_index=1)
    partitioned = FedDataDistribution.from_config(acted_train, by_actor)

    if not return_test:
        return partitioned

    # The test data is requested separately, without actor information.
    _, held_out = standard_datasets.shakespeare(out_dir, include_actors=False)
    return (partitioned, held_out)