From a726b7f02ee48e92e3c784a03670ee4c2446cf5a Mon Sep 17 00:00:00 2001 From: cuiboyuan Date: Sat, 7 May 2022 19:49:53 -0400 Subject: [PATCH 1/8] Add CelebA dataset and tests --- celeba_test.py | 22 ++++++++++++++ plato/datasources/celeba.py | 59 +++++++++++++++++++++++++++++++++++++ 2 files changed, 81 insertions(+) create mode 100644 celeba_test.py create mode 100644 plato/datasources/celeba.py diff --git a/celeba_test.py b/celeba_test.py new file mode 100644 index 000000000..f926e2a42 --- /dev/null +++ b/celeba_test.py @@ -0,0 +1,22 @@ +import torch +from plato.datasources import celeba + +if __name__ == '__main__': + ds = celeba.DataSource() + total_num = 0 + train_loader = torch.utils.data.DataLoader(ds.trainset, batch_size=1) + test_loader = torch.utils.data.DataLoader(ds.testset, batch_size=1) + + all_data = None + for batch_id, (examples, labels) in enumerate(train_loader): + if all_data is None: + all_data = examples + else: + all_data = torch.cat((all_data, examples), 0) + + for batch_id, (examples, labels) in enumerate(test_loader): + all_data = torch.cat((all_data, examples), 0) + + print(all_data.shape) + print(torch.mean(all_data, [0, 2, 3])) + print(torch.std(all_data, [0, 2, 3])) diff --git a/plato/datasources/celeba.py b/plato/datasources/celeba.py new file mode 100644 index 000000000..b91a74c4a --- /dev/null +++ b/plato/datasources/celeba.py @@ -0,0 +1,59 @@ +""" +The CelebA dataset from the torchvision package. +""" + +from torchvision import datasets, transforms + +import zipfile +import os +import logging +from plato.config import Config +from plato.datasources import base + + +class CelebA(datasets.CelebA): + + def _check_integrity(self): + return True + + +class DataSource(base.DataSource): + """The CelebA dataset.""" + + def __init__(self): + super().__init__() + _path = Config().data.data_path + + DataSource.download_celeba(_path) + + _transform = transforms.Compose([transforms.ToTensor()]) + self.trainset = CelebA(root=_path, + split='train', + target_type=['attr', 'identity'], + download=False, + transform=_transform) + self.testset = CelebA(root=_path, + split='test', + target_type=['attr', 'identity'], + download=False, + transform=_transform) + + @staticmethod + def download_celeba(root_path): + """ Download and unzip all CelebA data points. """ + datapath = os.path.join(root_path, "celeba") + filename = os.path.join(datapath, "img_align_celeba.zip") + extracted_path, _ = os.path.splitext(filename) + if not os.path.exists(extracted_path): + logging.info("Extracting all images in %s to %s.", + "img_align_celeba.zip", extracted_path) + with zipfile.ZipFile(filename, 'r') as zip_ref: + zip_ref.extractall(datapath) + else: + logging.info("Path %s already exists.", extracted_path) + + def num_train_examples(self): + return 162770 + + def num_test_examples(self): + return 19962 From 324c86945d6902f039c0141f0fabd3685c070557 Mon Sep 17 00:00:00 2001 From: cuiboyuan Date: Sat, 7 May 2022 21:46:43 -0400 Subject: [PATCH 2/8] Update CelebA impl and remove temp tests --- celeba_test.py | 22 ------------- plato/datasources/celeba.py | 62 +++++++++++++++++-------------------- 2 files changed, 29 insertions(+), 55 deletions(-) delete mode 100644 celeba_test.py diff --git a/celeba_test.py b/celeba_test.py deleted file mode 100644 index f926e2a42..000000000 --- a/celeba_test.py +++ /dev/null @@ -1,22 +0,0 @@ -import torch -from plato.datasources import celeba - -if __name__ == '__main__': - ds = celeba.DataSource() - total_num = 0 - train_loader = torch.utils.data.DataLoader(ds.trainset, batch_size=1) - test_loader = torch.utils.data.DataLoader(ds.testset, batch_size=1) - - all_data = None - for batch_id, (examples, labels) in enumerate(train_loader): - if all_data is None: - all_data = examples - else: - all_data = torch.cat((all_data, examples), 0) - - for batch_id, (examples, labels) in enumerate(test_loader): - all_data = torch.cat((all_data, examples), 0) - - print(all_data.shape) - print(torch.mean(all_data, [0, 2, 3])) - print(torch.std(all_data, [0, 2, 3])) diff --git a/plato/datasources/celeba.py b/plato/datasources/celeba.py index b91a74c4a..588c8c012 100644 --- a/plato/datasources/celeba.py +++ b/plato/datasources/celeba.py @@ -2,19 +2,25 @@ The CelebA dataset from the torchvision package. """ +from typing import Callable, List, Optional, Union from torchvision import datasets, transforms - -import zipfile -import os -import logging from plato.config import Config from plato.datasources import base class CelebA(datasets.CelebA): - def _check_integrity(self): - return True + def __init__(self, + root: str, + split: str = "train", + target_type: Union[List[str], str] = "attr", + transform: Optional[Callable] = None, + target_transform: Optional[Callable] = None, + download: bool = False) -> None: + super().__init__(root, split, target_type, transform, target_transform, + download) + self.targets = self.identity.flatten().tolist() + self.classes = [f'Celebrity #{i}' for i in range(10177 + 1)] class DataSource(base.DataSource): @@ -24,33 +30,23 @@ def __init__(self): super().__init__() _path = Config().data.data_path - DataSource.download_celeba(_path) - - _transform = transforms.Compose([transforms.ToTensor()]) - self.trainset = CelebA(root=_path, - split='train', - target_type=['attr', 'identity'], - download=False, - transform=_transform) - self.testset = CelebA(root=_path, - split='test', - target_type=['attr', 'identity'], - download=False, - transform=_transform) - - @staticmethod - def download_celeba(root_path): - """ Download and unzip all CelebA data points. """ - datapath = os.path.join(root_path, "celeba") - filename = os.path.join(datapath, "img_align_celeba.zip") - extracted_path, _ = os.path.splitext(filename) - if not os.path.exists(extracted_path): - logging.info("Extracting all images in %s to %s.", - "img_align_celeba.zip", extracted_path) - with zipfile.ZipFile(filename, 'r') as zip_ref: - zip_ref.extractall(datapath) - else: - logging.info("Path %s already exists.", extracted_path) + _transform = transforms.Compose([ + transforms.ToTensor(), + transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5)), + ]) + + self.trainset = CelebA( + root=_path, + split='train', + target_type=['attr', 'identity', 'bbox', 'landmarks'], + download=True, + transform=_transform) + self.testset = CelebA( + root=_path, + split='test', + target_type=['attr', 'identity', 'bbox', 'landmarks'], + download=True, + transform=_transform) def num_train_examples(self): return 162770 From d3cf45934785a433d172b3fcf2c2848243fe9f6c Mon Sep 17 00:00:00 2001 From: cuiboyuan Date: Sun, 8 May 2022 13:12:16 -0400 Subject: [PATCH 3/8] Update CelebA transforms and add some docs --- plato/datasources/celeba.py | 30 ++++++++++++++++++------------ 1 file changed, 18 insertions(+), 12 deletions(-) diff --git a/plato/datasources/celeba.py b/plato/datasources/celeba.py index 588c8c012..20aaa7435 100644 --- a/plato/datasources/celeba.py +++ b/plato/datasources/celeba.py @@ -9,6 +9,11 @@ class CelebA(datasets.CelebA): + """ + A wrapper class of torchvision's CelebA dataset class + to add and attributes as celebrity + identity, which is used for non-IID samplers. + """ def __init__(self, root: str, @@ -30,23 +35,24 @@ def __init__(self): super().__init__() _path = Config().data.data_path + image_size = 64 _transform = transforms.Compose([ + transforms.Resize(image_size), + transforms.CenterCrop(image_size), transforms.ToTensor(), transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5)), ]) - self.trainset = CelebA( - root=_path, - split='train', - target_type=['attr', 'identity', 'bbox', 'landmarks'], - download=True, - transform=_transform) - self.testset = CelebA( - root=_path, - split='test', - target_type=['attr', 'identity', 'bbox', 'landmarks'], - download=True, - transform=_transform) + self.trainset = CelebA(root=_path, + split='train', + target_type=['attr', 'identity'], + download=True, + transform=_transform) + self.testset = CelebA(root=_path, + split='test', + target_type=['attr', 'identity'], + download=True, + transform=_transform) def num_train_examples(self): return 162770 From 59111b540e33ac13b0cb814a9edbed5f4ea6d5d6 Mon Sep 17 00:00:00 2001 From: cuiboyuan Date: Sun, 8 May 2022 14:38:09 -0400 Subject: [PATCH 4/8] Add CelebA to registry and add target transform to CelebA dataset --- plato/datasources/celeba.py | 26 ++++++++++++++++++++++++-- plato/datasources/registry.py | 4 +++- 2 files changed, 27 insertions(+), 3 deletions(-) diff --git a/plato/datasources/celeba.py b/plato/datasources/celeba.py index 20aaa7435..500503e1c 100644 --- a/plato/datasources/celeba.py +++ b/plato/datasources/celeba.py @@ -2,6 +2,7 @@ The CelebA dataset from the torchvision package. """ +import torch from typing import Callable, List, Optional, Union from torchvision import datasets, transforms from plato.config import Config @@ -47,12 +48,33 @@ def __init__(self): split='train', target_type=['attr', 'identity'], download=True, - transform=_transform) + transform=_transform, + target_transform=DataSource._target_transform) self.testset = CelebA(root=_path, split='test', target_type=['attr', 'identity'], download=True, - transform=_transform) + transform=_transform, + target_transform=DataSource._target_transform) + + @staticmethod + def _target_transform(label): + """ + Output labels are in a tuple of tensors if specified more + than one target types, so we need to convert the tuple to + tensors. Here, we just merge two tensors by adding identity + as the 41st attribute + """ + attr, identity = label + return torch.cat((attr.reshape([ + -1, + ]), identity.reshape([ + -1, + ]))) + + @staticmethod + def input_shape(): + return [162770, 2, 64, 64] def num_train_examples(self): return 162770 diff --git a/plato/datasources/registry.py b/plato/datasources/registry.py index 0b97642fd..d1e3498fd 100644 --- a/plato/datasources/registry.py +++ b/plato/datasources/registry.py @@ -29,7 +29,8 @@ else: from plato.datasources import (mnist, fashion_mnist, emnist, cifar10, cinic10, huggingface, pascal_voc, - tiny_imagenet, femnist, feature, qoenflx) + tiny_imagenet, femnist, feature, qoenflx, + celeba) registered_datasources = OrderedDict([ ('MNIST', mnist), @@ -42,6 +43,7 @@ ('TinyImageNet', tiny_imagenet), ('Feature', feature), ('QoENFLX', qoenflx), + ('CelebA', celeba), ]) registered_partitioned_datasources = OrderedDict([('FEMNIST', femnist)]) From 96df41e86267bc66226aaf1fbcfad45713eadc05 Mon Sep 17 00:00:00 2001 From: cuiboyuan Date: Sun, 8 May 2022 21:48:02 -0400 Subject: [PATCH 5/8] Add yaml file for CelebA testing and update ResNet to support custom number of classes --- configs/CelebA/fedavg_resnet18.yml | 74 ++++++++++++++++++++++++++++++ plato/datasources/celeba.py | 36 +++++++++++---- plato/models/resnet.py | 16 +++++-- 3 files changed, 111 insertions(+), 15 deletions(-) create mode 100644 configs/CelebA/fedavg_resnet18.yml diff --git a/configs/CelebA/fedavg_resnet18.yml b/configs/CelebA/fedavg_resnet18.yml new file mode 100644 index 000000000..d5160c2d2 --- /dev/null +++ b/configs/CelebA/fedavg_resnet18.yml @@ -0,0 +1,74 @@ +clients: + # Type + type: simple + + # The total number of clients + total_clients: 3 + + # The number of clients selected in each round + per_round: 1 + + # Should the clients compute test accuracy locally? + do_test: false + +server: + address: 127.0.0.1 + port: 8000 + +data: + # The training and testing dataset + datasource: CelebA + + # Only add face identity as labels for training + celeba_targets: + attr: false + identity: true + + # Number of identity in CelebA + num_classes: 10178 + + # Where the dataset is located + data_path: ./data + + # Number of samples in each partition + partition_size: 20000 + + # IID or non-IID? + sampler: iid + + # The concentration parameter for the Dirichlet distribution + concentration: 0.5 + + # The random seed for sampling data + random_seed: 1 + +trainer: + # The type of the trainer + type: basic + + # The maximum number of training rounds + rounds: 5 + + # Whether the training should use multiple GPUs if available + parallelized: false + + # The maximum number of clients running concurrently + max_concurrency: 3 + + # The target accuracy + target_accuracy: 0.94 + + # Number of epoches for local training in each communication round + epochs: 5 + batch_size: 32 + optimizer: SGD + learning_rate: 0.01 + momentum: 0.9 + weight_decay: 0.0 + + # The machine learning model + model_name: resnet_18 + +algorithm: + # Aggregation algorithm + type: fedavg diff --git a/plato/datasources/celeba.py b/plato/datasources/celeba.py index 500503e1c..3640dde9d 100644 --- a/plato/datasources/celeba.py +++ b/plato/datasources/celeba.py @@ -36,7 +36,17 @@ def __init__(self): super().__init__() _path = Config().data.data_path - image_size = 64 + target_types = [] + if hasattr(Config().data, "celeba_targets"): + targets = Config().data.celeba_targets + if hasattr(targets, "attr") and targets.attr: + target_types.append("attr") + if hasattr(targets, "identity") and targets.identity: + target_types.append("identity") + else: + target_types = ['attr', 'identity'] + + image_size = 32 _transform = transforms.Compose([ transforms.Resize(image_size), transforms.CenterCrop(image_size), @@ -46,13 +56,13 @@ def __init__(self): self.trainset = CelebA(root=_path, split='train', - target_type=['attr', 'identity'], + target_type=target_types, download=True, transform=_transform, target_transform=DataSource._target_transform) self.testset = CelebA(root=_path, split='test', - target_type=['attr', 'identity'], + target_type=target_types, download=True, transform=_transform, target_transform=DataSource._target_transform) @@ -65,16 +75,22 @@ def _target_transform(label): tensors. Here, we just merge two tensors by adding identity as the 41st attribute """ - attr, identity = label - return torch.cat((attr.reshape([ - -1, - ]), identity.reshape([ - -1, - ]))) + if isinstance(label, tuple): + if len(label) == 1: + return label[0] + elif len(label) == 2: + attr, identity = label + return torch.cat((attr.reshape([ + -1, + ]), identity.reshape([ + -1, + ]))) + else: + return label @staticmethod def input_shape(): - return [162770, 2, 64, 64] + return [162770, 3, 32, 32] def num_train_examples(self): return 162770 diff --git a/plato/models/resnet.py b/plato/models/resnet.py index 7d712d6b8..f15b7df76 100644 --- a/plato/models/resnet.py +++ b/plato/models/resnet.py @@ -9,6 +9,8 @@ import torch.nn as nn import torch.nn.functional as F +from plato.config import Config + class BasicBlock(nn.Module): expansion = 1 @@ -176,13 +178,17 @@ def get_model(model_type): resnet_type = int(model_type.split('_')[1]) + num_classes = 10 + if hasattr(Config().data, 'num_classes'): + num_classes = Config().data.num_classes + if resnet_type == 18: - return Model(BasicBlock, [2, 2, 2, 2]) + return Model(BasicBlock, [2, 2, 2, 2], num_classes) elif resnet_type == 34: - return Model(BasicBlock, [3, 4, 6, 3]) + return Model(BasicBlock, [3, 4, 6, 3], num_classes) elif resnet_type == 50: - return Model(Bottleneck, [3, 4, 6, 3]) + return Model(Bottleneck, [3, 4, 6, 3], num_classes) elif resnet_type == 101: - return Model(Bottleneck, [3, 4, 23, 3]) + return Model(Bottleneck, [3, 4, 23, 3], num_classes) elif resnet_type == 152: - return Model(Bottleneck, [3, 8, 36, 3]) + return Model(Bottleneck, [3, 8, 36, 3], num_classes) From af07466e7d99824bc8d27e1d9c6e74602ab80814 Mon Sep 17 00:00:00 2001 From: cuiboyuan Date: Thu, 12 May 2022 12:05:21 -0400 Subject: [PATCH 6/8] Download celeba from our own web server --- plato/datasources/celeba.py | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) diff --git a/plato/datasources/celeba.py b/plato/datasources/celeba.py index 3640dde9d..521b2c076 100644 --- a/plato/datasources/celeba.py +++ b/plato/datasources/celeba.py @@ -1,10 +1,13 @@ """ The CelebA dataset from the torchvision package. """ +import logging +import os +from typing import Callable, List, Optional, Union import torch -from typing import Callable, List, Optional, Union from torchvision import datasets, transforms + from plato.config import Config from plato.datasources import base @@ -36,6 +39,13 @@ def __init__(self): super().__init__() _path = Config().data.data_path + if not os.path.exists(os.path.join(_path, 'celeba')): + celeba_url = '' + DataSource.download(celeba_url, _path) + else: + logging.info("CelebA data already decompressed under %s", + os.path.join(_path, 'celeba')) + target_types = [] if hasattr(Config().data, "celeba_targets"): targets = Config().data.celeba_targets @@ -57,13 +67,13 @@ def __init__(self): self.trainset = CelebA(root=_path, split='train', target_type=target_types, - download=True, + download=False, transform=_transform, target_transform=DataSource._target_transform) self.testset = CelebA(root=_path, split='test', target_type=target_types, - download=True, + download=False, transform=_transform, target_transform=DataSource._target_transform) From 0d9c713501148288e060f8a39866f4155b79686f Mon Sep 17 00:00:00 2001 From: cuiboyuan Date: Thu, 12 May 2022 12:34:55 -0400 Subject: [PATCH 7/8] Provide working CelebA download URL --- configs/CelebA/fedavg_resnet18.yml | 5 +++-- plato/datasources/celeba.py | 2 +- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/configs/CelebA/fedavg_resnet18.yml b/configs/CelebA/fedavg_resnet18.yml index d5160c2d2..ee3db1e3c 100644 --- a/configs/CelebA/fedavg_resnet18.yml +++ b/configs/CelebA/fedavg_resnet18.yml @@ -21,6 +21,7 @@ data: # Only add face identity as labels for training celeba_targets: + # For ResNet, do not set to True since it does not match the expected output of ResNet attr: false identity: true @@ -34,7 +35,7 @@ data: partition_size: 20000 # IID or non-IID? - sampler: iid + sampler: noniid # The concentration parameter for the Dirichlet distribution concentration: 0.5 @@ -50,7 +51,7 @@ trainer: rounds: 5 # Whether the training should use multiple GPUs if available - parallelized: false + parallelized: true # The maximum number of clients running concurrently max_concurrency: 3 diff --git a/plato/datasources/celeba.py b/plato/datasources/celeba.py index 521b2c076..d94f23173 100644 --- a/plato/datasources/celeba.py +++ b/plato/datasources/celeba.py @@ -40,7 +40,7 @@ def __init__(self): _path = Config().data.data_path if not os.path.exists(os.path.join(_path, 'celeba')): - celeba_url = '' + celeba_url = 'http://iqua.ece.toronto.edu/baochun/celeba.tar.gz' DataSource.download(celeba_url, _path) else: logging.info("CelebA data already decompressed under %s", From 1e8d332fbe4851ce4ca854a5993d19f1896bd97f Mon Sep 17 00:00:00 2001 From: cuiboyuan Date: Thu, 12 May 2022 12:39:52 -0400 Subject: [PATCH 8/8] Update CINIC-10 download url --- configs/CINIC10/fedavg_vgg16.yml | 2 +- plato/datasources/cinic10.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/configs/CINIC10/fedavg_vgg16.yml b/configs/CINIC10/fedavg_vgg16.yml index 575c6791e..05bcd77b4 100644 --- a/configs/CINIC10/fedavg_vgg16.yml +++ b/configs/CINIC10/fedavg_vgg16.yml @@ -23,7 +23,7 @@ data: data_path: ./data/CINIC-10 # - download_url: https://iqua.ece.toronto.edu/~bli/CINIC-10.tar.gz + download_url: http://iqua.ece.toronto.edu/baochun/CINIC-10.tar.gz # Number of samples in each partition partition_size: 20000 diff --git a/plato/datasources/cinic10.py b/plato/datasources/cinic10.py index 1d654f832..4c04c82d4 100644 --- a/plato/datasources/cinic10.py +++ b/plato/datasources/cinic10.py @@ -26,7 +26,7 @@ def __init__(self): "Downloading the CINIC-10 dataset. This may take a while.") url = Config().data.download_url if hasattr( Config().data, 'download_url' - ) else 'https://iqua.ece.toronto.edu/~bli/CINIC-10.tar.gz' + ) else 'http://iqua.ece.toronto.edu/baochun/CINIC-10.tar.gz' DataSource.download(url, _path) _transform = transforms.Compose([