diff --git a/.signatures/cla.json b/.signatures/cla.json index 8bae9528c2..95f676600c 100644 --- a/.signatures/cla.json +++ b/.signatures/cla.json @@ -63,6 +63,14 @@ "created_at": "2021-08-11T21:53:41Z", "repoId": 329117231, "pullRequestNo": 145 + }, + { + "name": "katerina-merkulova", + "id": 87072230, + "comment_id": 900903309, + "created_at": "2021-08-18T07:58:20Z", + "repoId": 329117231, + "pullRequestNo": 156 } ] } \ No newline at end of file diff --git a/docs/.gitignore b/docs/.gitignore index a78ec1c59e..05b595d74c 100644 --- a/docs/.gitignore +++ b/docs/.gitignore @@ -1,5 +1,4 @@ openfl* models* -data* /_build **/.ipynb_checkpoints \ No newline at end of file diff --git a/docs/advanced_topics.rst b/docs/advanced_topics.rst index dafa8b5bc0..d5377c6d68 100644 --- a/docs/advanced_topics.rst +++ b/docs/advanced_topics.rst @@ -15,5 +15,4 @@ Advanced Topics overriding_agg_fn bash_autocomplete_activation log_metric_callback - - + data_splitting diff --git a/docs/data_splitting.rst b/docs/data_splitting.rst new file mode 100644 index 0000000000..33f0b326ef --- /dev/null +++ b/docs/data_splitting.rst @@ -0,0 +1,57 @@ +.. # Copyright (C) 2020-2021 Intel Corporation +.. # SPDX-License-Identifier: Apache-2.0 + +.. _data_splitting: =============================== Specifying custom data splits =============================== + ------------------------------- Usage ------------------------------- +|productName| allows developers to use custom data splits **for single-node simulation**. +To do this, use one of the following approaches. + +Python API +========== + +Choose from the predefined |productName| data splitters: + +- ``openfl.plugins.data_splitters.EqualNumPyDataSplitter`` (default) +- ``openfl.plugins.data_splitters.RandomNumPyDataSplitter`` +- ``openfl.plugins.data_splitters.LogNormalNumPyDataSplitter`` - expects the ``data`` argument to be an ``np.ndarray`` of integer labels +- ``openfl.plugins.data_splitters.DirichletNumPyDataSplitter`` - expects the ``data`` argument to be an ``np.ndarray`` of integer labels +Or create an implementation of :class:`openfl.plugins.data_splitters.NumPyDataSplitter` +and pass it to the ``FederatedDataSet`` constructor as the ``train_splitter`` or ``valid_splitter`` keyword argument. + + +CLI +==== + +Choose from the predefined |productName| data splitters: + +- ``openfl.plugins.data_splitters.EqualNumPyDataSplitter`` (default) +- ``openfl.plugins.data_splitters.RandomNumPyDataSplitter`` +- ``openfl.plugins.data_splitters.LogNormalNumPyDataSplitter`` - expects the ``data`` argument to be an np.ndarray of integer labels +- ``openfl.plugins.data_splitters.DirichletNumPyDataSplitter`` - expects the ``data`` argument to be an np.ndarray of integer labels +Or create your own implementation of :class:`openfl.plugins.data_splitters.NumPyDataSplitter`. +After defining the splitting behavior, apply it to your data to run a simulation. + +``NumPyDataSplitter`` requires implementing a single ``split`` method. +This method receives ``data`` - a NumPy array used to build the subsets of data indices (see the definition of :meth:`openfl.plugins.data_splitters.NumPyDataSplitter.split`). It can be the whole dataset, the labels only, or anything else. +``split`` returns a list of lists of indices, one group of indices per collaborator. + + .. code-block:: python X_train, y_train = ... # train set X_valid, y_valid = ...
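+    # Illustrative sketch (not part of the original example): the splitters defined
+    # below can also be passed to the FederatedDataSet constructor through its
+    # train_splitter / valid_splitter keyword arguments, e.g.:
+    # fl_data = FederatedDataSet(X_train, y_train, X_valid, y_valid,
+    #                            train_splitter=RandomNumPyDataSplitter(),
+    #                            valid_splitter=RandomNumPyDataSplitter())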
# valid set + train_splitter = RandomNumPyDataSplitter() + valid_splitter = RandomNumPyDataSplitter() + # collaborator_count value is passed to DataLoader constructor + # shard_num can be evaluated from data_path + train_idx = train_splitter.split(y_train, collaborator_count)[shard_num] + valid_idx = valid_splitter.split(y_valid, collaborator_count)[shard_num] + X_train_shard = X_train[train_idx] + X_valid_shard = X_valid[valid_idx] + +.. note:: + By default, we shuffle the data and perform equal split (see :class:`openfl.plugins.data_splitters.EqualNumPyDataSplitter`). diff --git a/openfl-tutorials/Federated_PyTorch_TinyImageNet.ipynb b/openfl-tutorials/Federated_PyTorch_TinyImageNet.ipynb index 763f24012b..9567b659e7 100644 --- a/openfl-tutorials/Federated_PyTorch_TinyImageNet.ipynb +++ b/openfl-tutorials/Federated_PyTorch_TinyImageNet.ipynb @@ -49,7 +49,7 @@ "from torchvision import transforms as T\n", "\n", "import openfl.native as fx\n", - "from openfl.federated import FederatedModel, FederatedDataSet\n" + "from openfl.federated import FederatedModel, FederatedDataSet" ] }, { @@ -119,22 +119,14 @@ " Valid option: [`train`, `val`]\n", " transform: torchvision.transforms\n", " A (series) of valid transformation(s).\n", - " collabs: int\n", - " How many dataset shards will be needed, minimum 1\n", - " shard_num: int\n", - " Current shard number, starting from 0\n", " \"\"\"\n", - " def __init__(self, root, split='train', collabs=1, shard_num=0, transform=None, target_transform=None):\n", - " assert collabs > shard_num, \"Incorrect shard number\"\n", + " def __init__(self, root, split='train', transform=None, target_transform=None):\n", " NUM_IMAGES_PER_CLASS = 500\n", " self.root = os.path.expanduser(root)\n", " self.transform = transform\n", " self.target_transform = target_transform\n", " self.split_dir = os.path.join(self.root, split)\n", " self.image_paths = sorted(glob.iglob(os.path.join(self.split_dir, '**', '*.JPEG'), recursive=True))\n", - " # DO the SHARDING\n", - " if split == 'train':\n", - " self.image_paths = self.image_paths[shard_num::collabs]\n", " \n", " self.labels = {} # fname - label number mapping\n", "\n", @@ -212,54 +204,51 @@ "metadata": {}, "outputs": [], "source": [ - "class TinyImagenetDataloader(FederatedDataSet):\n", - " def __init__(self,collaborator_count, rank, batch_size, **kwargs):\n", - " \"\"\"Instantiate the data object\n", - " Args:\n", - " data_path: The file path to the data\n", - " batch_size: The batch size of the data loader\n", - " **kwargs: Additional arguments, passed to super init and load_mnist_shard\n", - " \"\"\"\n", - " super().__init__([],[],[],[],batch_size, **kwargs)\n", - " \n", - " self.fed_size = int(collaborator_count)\n", - " self.rank = int(rank)\n", - " \n", - " self.batch_size = batch_size\n", - " self.shuffle = kwargs.setdefault('shuffle', True)\n", - "\n", - " self.training_set = TinyImageNet(TINY_IMAGENET_ROOT, 'train', transform=training_transform, \\\n", - " collabs=self.fed_size, shard_num=self.rank)\n", - " self.valid_set = TinyImageNet(TINY_IMAGENET_ROOT, 'val', transform=valid_transform, \\\n", - " target_transform=lambda target: one_hot(target, 200))\n", - "\n", - " self.train_loader = self.get_train_loader()\n", - " \n", - " self.val_loader = self.get_valid_loader()\n", - "\n", - " self.num_classes = 200\n", + "from openfl.plugins.data_splitters import EqualNumPyDataSplitter\n", + "from torch.utils.data import Subset\n", "\n", "\n", - " def get_valid_loader(self, num_batches=None):\n", - " return 
DataLoader(self.valid_set, batch_size=self.batch_size*2, num_workers=6)\n", - "\n", - " def get_train_loader(self, num_batches=None):\n", - " return DataLoader(self.training_set, batch_size=self.batch_size, shuffle=self.shuffle, num_workers=4)\n", + "train_set = TinyImageNet(TINY_IMAGENET_ROOT, 'train', transform=training_transform)\n", + "valid_set = TinyImageNet(TINY_IMAGENET_ROOT, 'val', transform=valid_transform, \\\n", + " target_transform=lambda target: one_hot(target, 200))\n", "\n", + "class TinyImageNetFederatedDataset(DataLoader):\n", + " def __init__(self, train_set, valid_set, batch_size):\n", + " self.data_splitter = EqualNumPyDataSplitter()\n", + " self.train_set = train_set\n", + " self.valid_set = valid_set\n", + " self.batch_size = batch_size\n", + " \n", + " def split(self, num_collaborators):\n", + " train_split = self.data_splitter.split([label for _, label in self.train_set], num_collaborators)\n", + " valid_split = self.data_splitter.split([label for _, label in self.valid_set], num_collaborators)\n", + " return [\n", + " TinyImageNetFederatedDataset(\n", + " Subset(self.train_set, train_split[i]),\n", + " Subset(self.valid_set, valid_split[i]),\n", + " self.batch_size\n", + " )\n", + " for i in range(num_collaborators)\n", + " ]\n", + " \n", + " def get_feature_shape(self):\n", + " return self.train_set[0][0].shape\n", + " \n", + " def get_train_loader(self, num_batches):\n", + " return DataLoader(self.train_set, batch_size=self.batch_size)\n", + " \n", + " def get_valid_loader(self):\n", + " return DataLoader(self.valid_set)\n", + " \n", " def get_train_data_size(self):\n", - " return len(self.training_set)\n", - "\n", + " return len(self.train_set)\n", + " \n", " def get_valid_data_size(self):\n", " return len(self.valid_set)\n", - "\n", - " def get_feature_shape(self):\n", - " return self.valid_set[0][0].shape\n", " \n", - " def split(self, num_collaborators, shuffle=True, equally=True):\n", - " return [TinyImagenetDataloader(num_collaborators, collab_rank, self.batch_size, shuffle=shuffle) \\\n", - " for collab_rank in range(num_collaborators)]\n", - " \n", - "fl_data = TinyImagenetDataloader(collaborator_count=1, rank=0, batch_size=32)" + "fl_data = TinyImageNetFederatedDataset(train_set, valid_set, batch_size=32)\n", + "\n", + "num_classes = 200" ] }, { @@ -281,7 +270,7 @@ " self.model = torchvision.models.mobilenet_v2(pretrained=True)\n", " self.model.requires_grad_(False)\n", " self.model.classifier[1] = torch.nn.Linear(in_features=1280, \\\n", - " out_features=fl_data.num_classes, bias=True)\n", + " out_features=num_classes, bias=True)\n", "\n", " def forward(self, x):\n", " x = self.model.forward(x)\n", @@ -320,7 +309,7 @@ "metadata": {}, "outputs": [], "source": [ - "collaborator_models = fl_model.setup(num_collaborators=2)\n", + "collaborator_models = fl_model.setup(num_collaborators=10)\n", "collaborators = {'one':collaborator_models[0],'two':collaborator_models[1]}#, 'three':collaborator_models[2]}" ] }, @@ -331,16 +320,13 @@ "outputs": [], "source": [ "#Original TinyImageNet dataset\n", - "print(f'Original training data size: {len(fl_data.training_set)}')\n", + "print(f'Original training data size: {len(fl_data.train_set)}')\n", "print(f'Original validation data size: {len(fl_data.valid_set)}\\n')\n", "\n", "#Collaborator one's data\n", - "print(f'Collaborator one\\'s training data size: {len(collaborator_models[0].data_loader.training_set)}')\n", - "print(f'Collaborator one\\'s validation data size: 
{len(collaborator_models[0].data_loader.valid_set)}\\n')\n", - "\n", - "#Collaborator two's data\n", - "print(f'Collaborator two\\'s training data size: {len(collaborator_models[1].data_loader.training_set)}')\n", - "print(f'Collaborator two\\'s validation data size: {len(collaborator_models[1].data_loader.valid_set)}\\n')\n", + "for i, model in enumerate(collaborator_models):\n", + " print(f'Collaborator {i}\\'s training data size: {len(model.data_loader.train_set)}')\n", + " print(f'Collaborator {i}\\'s validation data size: {len(model.data_loader.valid_set)}\\n')\n", "\n", "#Collaborator three's data\n", "#print(f'Collaborator three\\'s training data size: {len(collaborator_models[2].data_loader.X_train)}')\n", @@ -350,9 +336,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "scrolled": true - }, + "metadata": {}, "outputs": [], "source": [ "#Run experiment, return trained FederatedModel\n", @@ -369,6 +353,13 @@ "final_fl_model.save_native('final_model.pth')" ] }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, { "cell_type": "code", "execution_count": null, @@ -378,22 +369,8 @@ } ], "metadata": { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.8.8" + "name": "python" } }, "nbformat": 4, diff --git a/openfl-tutorials/interactive_api/Director_Pytorch_Kvasir_UNET/director_folder/director_config.yaml b/openfl-tutorials/interactive_api/Director_Pytorch_Kvasir_UNET/director_folder/director_config.yaml index bc52829c27..6d73f42176 100644 --- a/openfl-tutorials/interactive_api/Director_Pytorch_Kvasir_UNET/director_folder/director_config.yaml +++ b/openfl-tutorials/interactive_api/Director_Pytorch_Kvasir_UNET/director_folder/director_config.yaml @@ -1,5 +1,6 @@ settings: - listen_addr: localhost + listen_host: localhost listen_port: 50051 sample_shape: ['300', '400', '3'] - target_shape: ['300', '400'] \ No newline at end of file + target_shape: ['300', '400'] + envoy_health_check_period: 60 # in seconds \ No newline at end of file diff --git a/openfl-tutorials/interactive_api/Director_Pytorch_Kvasir_UNET/envoy_folder/kvasir_shard_descriptor_with_data_splitter.py b/openfl-tutorials/interactive_api/Director_Pytorch_Kvasir_UNET/envoy_folder/kvasir_shard_descriptor_with_data_splitter.py new file mode 100644 index 0000000000..aa5fe89165 --- /dev/null +++ b/openfl-tutorials/interactive_api/Director_Pytorch_Kvasir_UNET/envoy_folder/kvasir_shard_descriptor_with_data_splitter.py @@ -0,0 +1,136 @@ +# Copyright (C) 2020-2021 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 +"""Kvasir shard descriptor.""" + + +import os +from pathlib import Path + +import numpy as np +from PIL import Image + +from openfl.interface.interactive_api.shard_descriptor import ShardDescriptor +from openfl.plugins.data_splitters import RandomNumPyDataSplitter +from openfl.utilities import validate_file_hash + + +class KvasirShardDescriptor(ShardDescriptor): + """Shard descriptor class.""" + + def __init__(self, data_folder: str = 'kvasir_data', + rank_worldsize: str = '1,1', + enforce_image_hw: str = None) -> None: + """Initialize KvasirShardDescriptor.""" + super().__init__() + + self.data_folder = Path.cwd() / data_folder + 
self.download_data(self.data_folder) + + # Settings for resizing data + self.enforce_image_hw = None + if enforce_image_hw is not None: + self.enforce_image_hw = tuple(int(size) for size in enforce_image_hw.split(',')) + # Settings for sharding the dataset + self.rank, self.worldsize = tuple(int(num) for num in rank_worldsize.split(',')) + + self.images_path = self.data_folder / 'segmented-images' / 'images' + self.masks_path = self.data_folder / 'segmented-images' / 'masks' + + self.images_names = [ + img_name + for img_name in sorted(os.listdir(self.images_path)) + if len(img_name) > 3 and img_name[-3:] == 'jpg' + ] + # Sharding + data_splitter = RandomNumPyDataSplitter() + shard_idx = data_splitter.split(self.images_names, self.worldsize)[self.rank] + self.images_names = [self.images_names[i] for i in shard_idx] + + # Calculating data and target shapes + sample, target = self[0] + self._sample_shape = [str(dim) for dim in sample.shape] + self._target_shape = [str(dim) for dim in target.shape] + + @staticmethod + def download_data(data_folder): + """Download data.""" + zip_file_path = data_folder / 'kvasir.zip' + os.makedirs(data_folder, exist_ok=True) + os.system('wget -nc' + " 'https://datasets.simula.no/hyper-kvasir/hyper-kvasir-segmented-images.zip'" + f' -O {zip_file_path.relative_to(Path.cwd())}') + zip_sha384 = ('e30d18a772c6520476e55b610a4db457237f151e', + '19182849d54b49ae24699881c1e18e0961f77642be900450ef8b22e7') + validate_file_hash(zip_file_path, zip_sha384) + os.system(f'unzip -n {zip_file_path.relative_to(Path.cwd())}' + f' -d {data_folder.relative_to(Path.cwd())}') + + def __getitem__(self, index): + """Return a item by the index.""" + name = self.images_names[index] + # Reading data + img = Image.open(self.images_path / name) + mask = Image.open(self.masks_path / name) + if self.enforce_image_hw is not None: + # If we need to resize data + # PIL accepts (w,h) tuple, not (h,w) + img = img.resize(self.enforce_image_hw[::-1]) + mask = mask.resize(self.enforce_image_hw[::-1]) + img = np.asarray(img) + mask = np.asarray(mask) + assert img.shape[2] == 3 + + return img, mask[:, :, 0].astype(np.uint8) + + def __len__(self): + """Return the len of the dataset.""" + return len(self.images_names) + + @property + def sample_shape(self): + """Return the sample shape info.""" + return self._sample_shape + + @property + def target_shape(self): + """Return the target shape info.""" + return self._target_shape + + @property + def dataset_description(self) -> str: + """Return the dataset description.""" + return f'Kvasir dataset, shard number {self.rank} out of {self.worldsize}' + + +if __name__ == '__main__': + from openfl.interface.cli import setup_logging + setup_logging() + + data_folder = 'data' + rank_worldsize = '1,100' + enforce_image_hw = '529,622' + + kvasir_sd = KvasirShardDescriptor( + data_folder, + rank_worldsize=rank_worldsize, + enforce_image_hw=enforce_image_hw) + + print(kvasir_sd.dataset_description) + print(kvasir_sd.sample_shape, kvasir_sd.target_shape) + + from openfl.component.envoy.envoy import Envoy + + shard_name = 'one' + director_uri = 'localhost:50051' + + keeper = Envoy( + shard_name=shard_name, + director_uri=director_uri, + shard_descriptor=kvasir_sd, + disable_tls=False, + root_ca='./cert/root_ca.crt', + key='./cert/one.key', + cert='./cert/one.crt', + ) + + keeper.start() diff --git a/openfl-tutorials/interactive_api/Pytorch_Market_Re-ID/director_folder/director_config.yaml 
b/openfl-tutorials/interactive_api/Pytorch_Market_Re-ID/director_folder/director_config.yaml new file mode 100644 index 0000000000..c3aa1d5851 --- /dev/null +++ b/openfl-tutorials/interactive_api/Pytorch_Market_Re-ID/director_folder/director_config.yaml @@ -0,0 +1,4 @@ +settings: + listen_ip: localhost + sample_shape: ['64', '128', '3'] + target_shape: ['2'] diff --git a/openfl-tutorials/interactive_api/Pytorch_Market_Re-ID/director_folder/start_director.sh b/openfl-tutorials/interactive_api/Pytorch_Market_Re-ID/director_folder/start_director.sh new file mode 100644 index 0000000000..5806a6cc0a --- /dev/null +++ b/openfl-tutorials/interactive_api/Pytorch_Market_Re-ID/director_folder/start_director.sh @@ -0,0 +1,4 @@ +#!/bin/bash +set -e + +fx director start --disable-tls -c director_config.yaml \ No newline at end of file diff --git a/openfl-tutorials/interactive_api/Pytorch_Market_Re-ID/director_folder/start_director_with_tls.sh b/openfl-tutorials/interactive_api/Pytorch_Market_Re-ID/director_folder/start_director_with_tls.sh new file mode 100644 index 0000000000..5d6d46a792 --- /dev/null +++ b/openfl-tutorials/interactive_api/Pytorch_Market_Re-ID/director_folder/start_director_with_tls.sh @@ -0,0 +1,4 @@ +#!/bin/bash +set -e +FQDN=$1 +fx director start -c director_config.yaml -rc cert/root_ca.crt -pk cert/"${FQDN}".key -oc cert/"${FQDN}".crt \ No newline at end of file diff --git a/openfl-tutorials/interactive_api/Pytorch_Market_Re-ID/envoy_folder/market_shard_descriptor.py b/openfl-tutorials/interactive_api/Pytorch_Market_Re-ID/envoy_folder/market_shard_descriptor.py new file mode 100644 index 0000000000..61c456ac9e --- /dev/null +++ b/openfl-tutorials/interactive_api/Pytorch_Market_Re-ID/envoy_folder/market_shard_descriptor.py @@ -0,0 +1,121 @@ +# Copyright (C) 2020-2021 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +"""Market shard descriptor.""" + +import re +import zipfile +from pathlib import Path + +import gdown +from PIL import Image + +from openfl.interface.interactive_api.shard_descriptor import ShardDescriptor + + +class MarketShardDescriptor(ShardDescriptor): + """ + Market1501 Shard descriptor class. + + Reference: + Zheng et al. Scalable Person Re-identification: A Benchmark. ICCV 2015. 
+ URL: http://www.liangzheng.org/Project/project_reid.html + + Dataset statistics: + identities: 1501 (+1 for background) + images: 12936 (train) + 3368 (query) + 15913 (gallery) + """ + + def __init__(self, datafolder: str = 'Market-1501-v15.09.15', + rank_worldsize: str = '1,1') -> None: + """Initialize MarketShardDescriptor.""" + super().__init__() + + # Settings for sharding the dataset + self.rank, self.worldsize = tuple(int(num) for num in rank_worldsize.split(',')) + + self.download() + self.pattern = re.compile(r'([-\d]+)_c(\d)') + self.dataset_dir = Path.cwd() / datafolder + self.train_dir = self.dataset_dir / 'bounding_box_train' + self.query_dir = self.dataset_dir / 'query' + self.gal_dir = self.dataset_dir / 'bounding_box_test' + self._check_before_run() + + self.train_path = list(self.train_dir.glob('*.jpg'))[self.rank - 1::self.worldsize] + self.query_path = list(self.query_dir.glob('*.jpg'))[self.rank - 1::self.worldsize] + self.gal_path = list(self.gal_dir.glob('*.jpg'))[self.rank - 1::self.worldsize] + + self.mode = 'train' + self.imgs_path = self.train_path + + def set_mode(self, mode='train'): + """Set mode for getitem.""" + self.mode = mode + if self.mode == 'train': + self.imgs_path = self.train_path + elif self.mode == 'query': + self.imgs_path = self.query_path + elif self.mode == 'gallery': + self.imgs_path = self.gal_path + else: + raise Exception(f'Wrong mode: {mode}') + + def __len__(self): + """Length of shard.""" + return len(self.imgs_path) + + def __getitem__(self, index: int): + """Return an item by the index.""" + img_path = self.imgs_path[index] + pid, camid = map(int, self.pattern.search(img_path.name).groups()) + + img = Image.open(img_path) + return img, (pid, camid) + + @property + def sample_shape(self): + """Return the sample shape info.""" + return ['64', '128', '3'] + + @property + def target_shape(self): + """Return the target shape info.""" + return ['2'] + + @property + def dataset_description(self) -> str: + """Return the dataset description.""" + return (f'Market dataset, shard number {self.rank} ' + f'out of {self.worldsize}') + + def _check_before_run(self): + """Check if all files are available before going deeper.""" + if not self.dataset_dir.exists(): + raise RuntimeError(f'{self.dataset_dir} is not available') + if not self.train_dir.exists(): + raise RuntimeError(f'{self.train_dir} is not available') + if not self.query_dir.exists(): + raise RuntimeError(f'{self.query_dir} is not available') + if not self.gal_dir.exists(): + raise RuntimeError(f'{self.gal_dir} is not available') + + @staticmethod + def download(): + """Download Market1501 dataset.""" + if Path('Market-1501-v15.09.15').exists(): + return None + + output = 'Market-1501-v15.09.15.zip' + if not Path(output).exists(): + url = 'https://drive.google.com/uc?id=0B8-rUzbwVRk0c054eEozWG9COHM' + gdown.download(url, output, quiet=False) + + with zipfile.ZipFile(output, 'r') as zip_ref: + zip_ref.extractall(Path.cwd()) + + Path(output).unlink() # remove zip + + +if __name__ == '__main__': + MarketShardDescriptor.download() diff --git a/openfl-tutorials/interactive_api/Pytorch_Market_Re-ID/envoy_folder/requirements.txt b/openfl-tutorials/interactive_api/Pytorch_Market_Re-ID/envoy_folder/requirements.txt new file mode 100644 index 0000000000..2d61804504 --- /dev/null +++ b/openfl-tutorials/interactive_api/Pytorch_Market_Re-ID/envoy_folder/requirements.txt @@ -0,0 +1,2 @@ +gdown==3.13.0 +Pillow==8.3.1 \ No newline at end of file diff --git 
a/openfl-tutorials/interactive_api/Pytorch_Market_Re-ID/envoy_folder/shard_config_one.yaml b/openfl-tutorials/interactive_api/Pytorch_Market_Re-ID/envoy_folder/shard_config_one.yaml new file mode 100644 index 0000000000..68de28f6a9 --- /dev/null +++ b/openfl-tutorials/interactive_api/Pytorch_Market_Re-ID/envoy_folder/shard_config_one.yaml @@ -0,0 +1,4 @@ +template: market_shard_descriptor.MarketShardDescriptor +params: + datafolder: Market-1501-v15.09.15 + rank_worldsize: 1,2 \ No newline at end of file diff --git a/openfl-tutorials/interactive_api/Pytorch_Market_Re-ID/envoy_folder/shard_config_two.yaml b/openfl-tutorials/interactive_api/Pytorch_Market_Re-ID/envoy_folder/shard_config_two.yaml new file mode 100644 index 0000000000..64f149c17c --- /dev/null +++ b/openfl-tutorials/interactive_api/Pytorch_Market_Re-ID/envoy_folder/shard_config_two.yaml @@ -0,0 +1,4 @@ +template: market_shard_descriptor.MarketShardDescriptor +params: + datafolder: Market-1501-v15.09.15 + rank_worldsize: 2,2 \ No newline at end of file diff --git a/openfl-tutorials/interactive_api/Pytorch_Market_Re-ID/envoy_folder/start_envoy.sh b/openfl-tutorials/interactive_api/Pytorch_Market_Re-ID/envoy_folder/start_envoy.sh new file mode 100644 index 0000000000..ef23ba6261 --- /dev/null +++ b/openfl-tutorials/interactive_api/Pytorch_Market_Re-ID/envoy_folder/start_envoy.sh @@ -0,0 +1,4 @@ +#!/bin/bash +set -e + +fx envoy start -n env_one --disable-tls -dh localhost -dp 50051 -sc shard_config_one.yaml diff --git a/openfl-tutorials/interactive_api/Pytorch_Market_Re-ID/envoy_folder/start_envoy_with_tls.sh b/openfl-tutorials/interactive_api/Pytorch_Market_Re-ID/envoy_folder/start_envoy_with_tls.sh new file mode 100644 index 0000000000..295f61b101 --- /dev/null +++ b/openfl-tutorials/interactive_api/Pytorch_Market_Re-ID/envoy_folder/start_envoy_with_tls.sh @@ -0,0 +1,6 @@ +#!/bin/bash +set -e +ENVOY_NAME=$1 +DIRECTOR_FQDN=$2 + +fx envoy start -n "$ENVOY_NAME" --shard-config-path shard_config.yaml -d "$DIRECTOR_FQDN":50051 -rc cert/root_ca.crt -pk cert/"$ENVOY_NAME".key -oc cert/"$ENVOY_NAME".crt \ No newline at end of file diff --git a/openfl-tutorials/interactive_api/Pytorch_Market_Re-ID/workspace/Market_with_Director.ipynb b/openfl-tutorials/interactive_api/Pytorch_Market_Re-ID/workspace/Market_with_Director.ipynb new file mode 100644 index 0000000000..f6575a4704 --- /dev/null +++ b/openfl-tutorials/interactive_api/Pytorch_Market_Re-ID/workspace/Market_with_Director.ipynb @@ -0,0 +1,603 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "liquid-jacket", + "metadata": {}, + "source": [ + "# Federated Market with Director example\n", + "## Using low-level Python API" + ] + }, + { + "cell_type": "markdown", + "id": "af0579f8", + "metadata": {}, + "source": [ + "# Long-lived entities update\n", + "\n", + "* The Director may now run on another machine.\n", + "* We use the Federation API to communicate with the Director.\n", + "* The Federation object should hold a Director's client (for user service).\n", + "* Keep in mind that several API instances may be connected to one Director.\n", + "\n", + "\n", + "* For now, we do not cover how the Director is started.\n", + "* But it knows the data shape and target shape for the data science problem in the Federation.\n", + "* The Director holds the list of connected Envoys, so we do not need to specify it anymore.\n", + "* The Director and Envoys are responsible for encrypting connections, so we do not need to worry about certs.\n", + "\n", + "\n", + "* Yet we MUST have a cert to communicate to the 
Director.\n", + "* We MUST know the FQDN of the Director.\n", + "* The Director communicates the data and target shapes to the Federation interface object.\n", + "\n", + "\n", + "* The Experiment API may use this info to construct a dummy dataset and a `shard descriptor` stub." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "db949008", + "metadata": { + "pycharm": { + "name": "#%%\n" + } + }, + "outputs": [], + "source": [ + "# Install dependencies if not already installed\n", + "# !pip install -r requirements.txt" + ] + }, + { + "cell_type": "markdown", + "id": "16986f22", + "metadata": {}, + "source": [ + "# Connect to the Federation" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "4485ac79", + "metadata": {}, + "outputs": [], + "source": [ + "# Create a federation\n", + "from openfl.interface.interactive_api.federation import Federation\n", + "\n", + "# please use the same identifier that was used in the signed certificate\n", + "client_id = 'frontend'\n", + "\n", + "# 1) Run with API layer - Director mTLS \n", + "# If the user wants to enable mTLS, they must provide the CA root chain and a signed key pair to the federation interface\n", + "# cert_chain = 'cert/root_ca.crt'\n", + "# API_certificate = 'cert/frontend.crt'\n", + "# API_private_key = 'cert/frontend.key'\n", + "\n", + "# federation = Federation(client_id='frontend', director_node_fqdn='localhost', director_port='50051', disable_tls=False,\n", + "#                         cert_chain=cert_chain, api_cert=API_certificate, api_private_key=API_private_key)\n", + "\n", + "# --------------------------------------------------------------------------------------------------------------------\n", + "\n", + "# 2) Run with TLS disabled (trusted environment)\n", + "# Federation can also determine the local fqdn automatically\n", + "federation = Federation(client_id='frontend', director_node_fqdn='localhost', director_port='50051', tls=False)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e35802d5", + "metadata": { + "scrolled": true, + "pycharm": { + "is_executing": true + } + }, + "outputs": [], + "source": [ + "shard_registry = federation.get_shard_registry()\n", + "shard_registry" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "67ae50de", + "metadata": { + "pycharm": { + "is_executing": true + } + }, + "outputs": [], + "source": [ + "federation.target_shape" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "b42efc49", + "metadata": {}, + "outputs": [], + "source": [ + "# First, request a dummy_shard_desc that holds information about the federated dataset \n", + "dummy_shard_desc = federation.get_dummy_shard_descriptor(size=10)\n", + "sample, target = dummy_shard_desc[0]" + ] + }, + { + "cell_type": "markdown", + "id": "obvious-tyler", + "metadata": {}, + "source": [ + "## Creating an FL experiment using the Interactive API" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "rubber-address", + "metadata": {}, + "outputs": [], + "source": [ + "from openfl.interface.interactive_api.experiment import TaskInterface, DataInterface, ModelInterface, FLExperiment" + ] + }, + { + "cell_type": "markdown", + "id": "sustainable-public", + "metadata": {}, + "source": [ + "### Register dataset" + ] + }, + { + "cell_type": "markdown", + "id": "unlike-texas", + "metadata": {}, + "source": [ + "We extract the user dataset class implementation.\n", + "Is it convenient?\n", + "What if the dataset is not a class?"
+ ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "64f37dcf", + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "from copy import deepcopy\n", + "\n", + "from torch.utils.data import DataLoader, Dataset\n", + "from torchvision.transforms import Compose, Normalize, RandomHorizontalFlip, Resize, ToTensor\n", + "\n", + "from tools import RandomIdentitySampler\n", + "import transforms as T\n", + "\n", + "\n", + "# Now you can implement you data loaders using dummy_shard_desc\n", + "class ImageDataset(Dataset):\n", + " \"\"\"Image Person ReID Dataset.\"\"\"\n", + "\n", + " def __init__(self, dataset, transform=None):\n", + " \"\"\"Initialize Dataset.\"\"\"\n", + " self.dataset = dataset\n", + " self.transform = transform\n", + "\n", + " def __len__(self):\n", + " \"\"\"Length of dataset.\"\"\"\n", + " return len(self.dataset)\n", + "\n", + " def __getitem__(self, index):\n", + " \"\"\"Get item from dataset.\"\"\"\n", + " img, (pid, camid) = self.dataset[index]\n", + " if self.transform is not None:\n", + " img = self.transform(img)\n", + " return img, (pid, camid)\n", + "\n", + "\n", + "class MarketFLDataloader(DataInterface):\n", + " \"\"\"Market Dataset.\"\"\"\n", + "\n", + " def __init__(self, **kwargs):\n", + " super().__init__(**kwargs)\n", + "\n", + " # Prepare transforms\n", + " self.transform_train = Compose([\n", + " T.ResizeRandomCropping(256, 128, p=0.5),\n", + " RandomHorizontalFlip(),\n", + " ToTensor(),\n", + " Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),\n", + " T.RandomErasing(probability=0.5)\n", + " ])\n", + " self.transform_test = Compose([\n", + " Resize((265, 128)),\n", + " ToTensor(),\n", + " Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),\n", + " ])\n", + "\n", + " @property\n", + " def shard_descriptor(self):\n", + " return self._shard_descriptor\n", + "\n", + " @shard_descriptor.setter\n", + " def shard_descriptor(self, shard_descriptor):\n", + " \"\"\"\n", + " Describe per-collaborator procedures or sharding.\n", + "\n", + " This method will be called during a collaborator initialization.\n", + " Local shard_descriptor will be set by Envoy.\n", + " \"\"\"\n", + " self._shard_descriptor = shard_descriptor\n", + "\n", + " def get_train_loader(self, **kwargs):\n", + " \"\"\"\n", + " Output of this method will be provided to tasks with optimizer in contract.\n", + " \"\"\"\n", + " if self.kwargs['train_bs']:\n", + " batch_size = self.kwargs['train_bs']\n", + " else:\n", + " batch_size = 64\n", + "\n", + " self.shard_descriptor.set_mode('train')\n", + " return DataLoader(\n", + " # ImageDataset make transform\n", + " ImageDataset(self.shard_descriptor, transform=self.transform_train),\n", + " sampler=RandomIdentitySampler(self.shard_descriptor, num_instances=4),\n", + " batch_size=batch_size, num_workers=4, pin_memory=True, drop_last=True\n", + " )\n", + "\n", + " def get_valid_loader(self, **kwargs):\n", + " \"\"\"\n", + " Output of this method will be provided to tasks without optimizer in contract.\n", + " \"\"\"\n", + " if self.kwargs['valid_bs']:\n", + " batch_size = self.kwargs['valid_bs']\n", + " else:\n", + " batch_size = 512\n", + "\n", + " query_sd_copy = deepcopy(self.shard_descriptor)\n", + " query_sd_copy.set_mode('query')\n", + " query_loader = DataLoader(ImageDataset(query_sd_copy, transform=self.transform_test),\n", + " batch_size=batch_size, num_workers=4, pin_memory=True,\n", + " drop_last=False, shuffle=False)\n", + "\n", + " gallery_sd_copy = 
deepcopy(self.shard_descriptor)\n", + " gallery_sd_copy.set_mode('gallery')\n", + " gallery_loader = DataLoader(ImageDataset(gallery_sd_copy, transform=self.transform_test),\n", + " batch_size=batch_size, num_workers=4, pin_memory=True,\n", + " drop_last=False, shuffle=False)\n", + "\n", + " return query_loader, gallery_loader\n", + "\n", + " def get_train_data_size(self):\n", + " \"\"\"\n", + " Information for aggregation.\n", + " \"\"\"\n", + " return len(self.shard_descriptor.train_path)\n", + "\n", + " def get_valid_data_size(self):\n", + " \"\"\"\n", + " Information for aggregation.\n", + " \"\"\"\n", + " return len(self.shard_descriptor.gal_path)" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "8cb6c73c", + "metadata": {}, + "outputs": [], + "source": [ + "fed_dataset = MarketFLDataloader(train_bs=64, valid_bs=512)" + ] + }, + { + "cell_type": "markdown", + "id": "caring-distinction", + "metadata": {}, + "source": [ + "### Describe a model and optimizer" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "visible-victor", + "metadata": {}, + "outputs": [], + "source": [ + "import torch\n", + "import torch.nn as nn\n", + "import torch.optim as optim\n", + "import torchvision" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "foreign-gospel", + "metadata": {}, + "outputs": [], + "source": [ + "\"\"\"\n", + "ResNet and Classifier definition\n", + "\"\"\"\n", + "\n", + "class ResNet50(nn.Module):\n", + " \"Pretrained ResNet50.\"\n", + "\n", + " def __init__(self, **kwargs):\n", + " super().__init__()\n", + " \n", + " self.classifier = NormalizedClassifier()\n", + "\n", + " resnet50 = torchvision.models.resnet50(pretrained=True)\n", + " resnet50.layer4[0].conv2.stride = (1, 1)\n", + " resnet50.layer4[0].downsample[0].stride = (1, 1)\n", + " self.base = nn.Sequential(*list(resnet50.children())[:-2])\n", + "\n", + " self.bn = nn.BatchNorm1d(2048)\n", + " nn.init.normal_(self.bn.weight.data, 1.0, 0.02)\n", + " nn.init.constant_(self.bn.bias.data, 0.0)\n", + "\n", + " def forward(self, x):\n", + " x = self.base(x)\n", + " x = nn.functional.avg_pool2d(x, x.size()[2:])\n", + " x = x.view(x.size(0), -1)\n", + " f = self.bn(x)\n", + "\n", + " return f\n", + "\n", + "\n", + "class NormalizedClassifier(nn.Module):\n", + " \"\"\"Classifier.\"\"\"\n", + "\n", + " def __init__(self):\n", + " super().__init__()\n", + " self.weight = nn.Parameter(torch.Tensor(1501, 2048))\n", + " self.weight.data.uniform_(-1, 1).renorm_(2,0,1e-5).mul_(1e5)\n", + "\n", + " def forward(self, x):\n", + " w = self.weight\n", + "\n", + " x = nn.functional.normalize(x, p=2, dim=1)\n", + " w = nn.functional.normalize(w, p=2, dim=1)\n", + "\n", + " return nn.functional.linear(x, w)\n", + "\n", + "\n", + "resnet = ResNet50()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "greater-activation", + "metadata": { + "pycharm": { + "is_executing": true + } + }, + "outputs": [], + "source": [ + "parameters = list(resnet.parameters()) + list(resnet.classifier.parameters())\n", + "optimizer_adam = optim.Adam(parameters, lr=1e-4)" + ] + }, + { + "cell_type": "markdown", + "id": "caroline-passion", + "metadata": {}, + "source": [ + "#### Register model" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "id": "handled-teens", + "metadata": {}, + "outputs": [], + "source": [ + "framework_adapter = 'openfl.plugins.frameworks_adapters.pytorch_adapter.FrameworkAdapterPlugin'\n", + "MI = ModelInterface(model=resnet, optimizer=optimizer_adam, 
framework_plugin=framework_adapter)\n", + "# Save the initial model state\n", + "initial_model = deepcopy(resnet)" + ] + }, + { + "cell_type": "markdown", + "id": "portuguese-groove", + "metadata": {}, + "source": [ + "### Define and register FL tasks" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "id": "increasing-builder", + "metadata": {}, + "outputs": [], + "source": [ + "TI = TaskInterface()\n", + "\n", + "from logging import getLogger\n", + "\n", + "import torch\n", + "import tqdm\n", + "\n", + "from losses import ArcFaceLoss, TripletLoss\n", + "from tools import AverageMeter, evaluate, extract_feature\n", + "\n", + "logger = getLogger(__name__)\n", + "\n", + "# Task interface currently supports only standalone functions.\n", + "@TI.register_fl_task(model='model', data_loader='train_loader',\n", + " device='device', optimizer='optimizer')\n", + "def train(model, train_loader, optimizer, device):\n", + " device = torch.device('cuda')\n", + " \n", + " criterion_cla = ArcFaceLoss(scale=16., margin=0.1)\n", + " criterion_pair = TripletLoss(margin=0.3, distance='cosine')\n", + "\n", + " batch_cla_loss = AverageMeter()\n", + " batch_pair_loss = AverageMeter()\n", + " corrects = AverageMeter()\n", + " \n", + " model.train()\n", + " model.to(device)\n", + " model.classifier.train()\n", + " model.classifier.to(device)\n", + " \n", + " logger.info('==> Start training')\n", + " train_loader = tqdm.tqdm(train_loader, desc='train')\n", + "\n", + " for imgs, (pids, _) in train_loader:\n", + " imgs, pids = torch.tensor(imgs).to(device), torch.tensor(pids).to(device)\n", + " # Zero the parameter gradients\n", + " optimizer.zero_grad()\n", + " # Forward\n", + " features = model(imgs)\n", + " outputs = model.classifier(features)\n", + " _, preds = torch.max(outputs.data, 1)\n", + " # Compute loss\n", + " cla_loss = criterion_cla(outputs, pids)\n", + " pair_loss = criterion_pair(features, pids)\n", + " loss = cla_loss + pair_loss\n", + " # Backward + Optimize\n", + " loss.backward()\n", + " optimizer.step()\n", + " # statistics\n", + " corrects.update(torch.sum(preds == pids.data).float() / pids.size(0), pids.size(0))\n", + " batch_cla_loss.update(cla_loss.item(), pids.size(0))\n", + " batch_pair_loss.update(pair_loss.item(), pids.size(0))\n", + "\n", + " return {'ArcFaceLoss': batch_cla_loss.avg,\n", + " 'TripletLoss': batch_pair_loss.avg,\n", + " 'Accuracy': corrects.avg.cpu()}\n", + "\n", + "\n", + "@TI.register_fl_task(model='model', data_loader='val_loader', device='device')\n", + "def validate(model, val_loader, device):\n", + " queryloader, galleryloader = val_loader\n", + " device = torch.device('cuda')\n", + " \n", + " logger.info('==> Start validating')\n", + " model.eval()\n", + " model.to(device)\n", + " \n", + " # Extract features for query set\n", + " qf, q_pids, q_camids = extract_feature(model, queryloader)\n", + " logger.info(f'Extracted features for query set, obtained {qf.shape} matrix')\n", + " # Extract features for gallery set\n", + " gf, g_pids, g_camids = extract_feature(model, galleryloader)\n", + " logger.info(f'Extracted features for gallery set, obtained {gf.shape} matrix')\n", + " # Compute distance matrix between query and gallery\n", + " m, n = qf.size(0), gf.size(0)\n", + " distmat = torch.zeros((m,n))\n", + " # Cosine similarity\n", + " qf = nn.functional.normalize(qf, p=2, dim=1)\n", + " gf = nn.functional.normalize(gf, p=2, dim=1)\n", + " for i in range(m):\n", + " distmat[i] = - torch.mm(qf[i:i+1], gf.t())\n", + " distmat = distmat.numpy()\n", + 
"\n", + " cmc, mAP = evaluate(distmat, q_pids, g_pids, q_camids, g_camids)\n", + " return {'top1': cmc[0], 'top5': cmc[4], 'top10': cmc[9], 'mAP': mAP}" + ] + }, + { + "cell_type": "markdown", + "id": "derived-bride", + "metadata": {}, + "source": [ + "## Time to start a federated learning experiment" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "id": "mature-renewal", + "metadata": {}, + "outputs": [], + "source": [ + "# create an experimnet in federation\n", + "experiment_name = 'market_test_experiment'\n", + "fl_experiment = FLExperiment(federation=federation, experiment_name=experiment_name)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "lightweight-causing", + "metadata": { + "scrolled": false, + "pycharm": { + "is_executing": true + } + }, + "outputs": [], + "source": [ + "# If I use autoreload I got a pickling error\n", + "\n", + "# The following command zips the workspace and python requirements to be transfered to collaborator nodes\n", + "fl_experiment.start(model_provider=MI, \n", + " task_keeper=TI,\n", + " data_loader=fed_dataset,\n", + " rounds_to_train=3,\n", + " opt_treatment='RESET')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "bfc4f89c", + "metadata": { + "pycharm": { + "name": "#%%\n", + "is_executing": true + } + }, + "outputs": [], + "source": [ + "# If user want to stop IPython session, then reconnect and check how experiment is going \n", + "# fl_experiment.restore_experiment_state(MI)\n", + "\n", + "fl_experiment.stream_metrics(tensorboard_logs=False)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.6" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} \ No newline at end of file diff --git a/openfl-tutorials/interactive_api/Pytorch_Market_Re-ID/workspace/losses.py b/openfl-tutorials/interactive_api/Pytorch_Market_Re-ID/workspace/losses.py new file mode 100644 index 0000000000..be266a4e13 --- /dev/null +++ b/openfl-tutorials/interactive_api/Pytorch_Market_Re-ID/workspace/losses.py @@ -0,0 +1,98 @@ +# Copyright (C) 2020-2021 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +"""Compute ArcFace loss and Triplet loss.""" + +import math + +import torch +import torch.nn.functional as F +from torch import nn + + +class ArcFaceLoss(nn.Module): + """ArcFace loss.""" + + def __init__(self, margin=0.1, scale=16, easy_margin=False): + """Initialize ArcFace loss.""" + super(ArcFaceLoss, self).__init__() + self.m = margin + self.s = scale + self.easy_margin = easy_margin + + def forward(self, pred, target): + """Compute forward.""" + # make a one-hot index + index = pred.data * 0.0 # size = (B, Classnum) + index.scatter_(1, target.data.view(-1, 1), 1) + index = index.bool() + + cos_m = math.cos(self.m) + sin_m = math.sin(self.m) + cos_t = pred[index] + sin_t = torch.sqrt(1.0 - cos_t * cos_t) + cos_t_add_m = cos_t * cos_m - sin_t * sin_m + + cond_v = cos_t - math.cos(math.pi - self.m) + cond = F.relu(cond_v) + keep = cos_t - math.sin(math.pi - self.m) * self.m + + cos_t_add_m = torch.where(cond.bool(), cos_t_add_m, keep) + + output = pred * 1.0 # size = (B, Classnum) + output[index] = cos_t_add_m + output = self.s * output + + return F.cross_entropy(output, target) + + 
+class TripletLoss(nn.Module): + """ + Triplet loss with hard positive/negative mining. + + Reference: + Hermans et al. In Defense of the Triplet Loss for Person Re-Identification. arXiv:1703.07737. + + Code imported from https://github.com/Cysu/open-reid/blob/master/reid/loss/triplet.py. + + Args: + margin (float): margin for triplet. + distance (str): distance for triplet. + """ + + def __init__(self, margin=0.3, distance='cosine'): + """Initialize Triplet loss.""" + super(TripletLoss, self).__init__() + + self.distance = distance + self.margin = margin + self.ranking_loss = nn.MarginRankingLoss(margin=margin) + + def forward(self, inputs, targets): + """ + Compute forward. + + Args: + inputs: feature matrix with shape (batch_size, feat_dim) + targets: ground truth labels with shape (num_classes) + """ + n = inputs.size(0) + + # Compute pairwise distance, replace by the official when merged + inputs = F.normalize(inputs, p=2, dim=1) + dist = - torch.mm(inputs, inputs.t()) + + # For each anchor, find the hardest positive and negative + mask = targets.expand(n, n).eq(targets.expand(n, n).t()) + dist_ap, dist_an = [], [] + for i in range(n): + dist_ap.append(dist[i][mask[i]].max().unsqueeze(0)) + dist_an.append(dist[i][mask[i] == 0].min().unsqueeze(0)) + dist_ap = torch.cat(dist_ap) + dist_an = torch.cat(dist_an) + + # Compute ranking hinge loss + y = torch.ones_like(dist_an) + loss = self.ranking_loss(dist_an, dist_ap, y) + + return loss diff --git a/openfl-tutorials/interactive_api/Pytorch_Market_Re-ID/workspace/requirements.txt b/openfl-tutorials/interactive_api/Pytorch_Market_Re-ID/workspace/requirements.txt new file mode 100644 index 0000000000..5e3f540be5 --- /dev/null +++ b/openfl-tutorials/interactive_api/Pytorch_Market_Re-ID/workspace/requirements.txt @@ -0,0 +1,2 @@ +torch==1.9.0 +torchvision==0.10.0 diff --git a/openfl-tutorials/interactive_api/Pytorch_Market_Re-ID/workspace/tools.py b/openfl-tutorials/interactive_api/Pytorch_Market_Re-ID/workspace/tools.py new file mode 100644 index 0000000000..7894ca478a --- /dev/null +++ b/openfl-tutorials/interactive_api/Pytorch_Market_Re-ID/workspace/tools.py @@ -0,0 +1,190 @@ +# Copyright (C) 2020-2021 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +"""Tools for metric computation and Dataloader.""" + +import copy +import random +from collections import defaultdict +from logging import getLogger + +import numpy as np +import torch +from torch.utils.data.sampler import Sampler + +logger = getLogger(__name__) + + +class AverageMeter(object): + """ + Computes and stores the average and current value. 
+ + Code imported from https://github.com/pytorch/examples/blob/master/imagenet/main.py#L247-L262 + """ + + def __init__(self): + """Initialize Average Meter.""" + self.reset() + + def reset(self): + """Reset values.""" + self.val = 0 + self.avg = 0 + self.sum = 0 + self.count = 0 + + def update(self, val, n=1): + """Update values.""" + self.val = val + self.sum += val * n + self.count += n + self.avg = self.sum / self.count + + +def compute_ap_cmc(index, good_index, junk_index): + """Compute validation metrics.""" + ap = 0 + cmc = np.zeros(len(index)) + + # remove junk_index + mask = np.in1d(index, junk_index, invert=True) + index = index[mask] + + # find good_index index + ngood = len(good_index) + mask = np.in1d(index, good_index) + rows_good = np.argwhere(mask) + rows_good = rows_good.flatten() + + cmc[rows_good[0]:] = 1.0 + for i in range(ngood): + d_recall = 1.0 / ngood + precision = (i + 1) * 1.0 / (rows_good[i] + 1) + ap = ap + d_recall * precision + + return ap, cmc + + +def evaluate(distmat, q_pids, g_pids, q_camids, g_camids): + """Evaluate model.""" + num_q, num_g = distmat.shape + index = np.argsort(distmat, axis=1) # from small to large + + num_no_gt = 0 # num of query imgs without groundtruth + num_r1 = 0 + cmc = np.zeros(len(g_pids)) + ap = 0 + + for i in range(num_q): + # groundtruth index + query_index = np.argwhere(g_pids == q_pids[i]) + camera_index = np.argwhere(g_camids == q_camids[i]) + good_index = np.setdiff1d(query_index, camera_index, assume_unique=True) + if good_index.size == 0: + num_no_gt += 1 + continue + # remove gallery samples that have the same pid and camid with query + junk_index = np.intersect1d(query_index, camera_index) + + ap_tmp, cmc_tmp = compute_ap_cmc(index[i], good_index, junk_index) + if cmc_tmp[0] == 1: + num_r1 += 1 + cmc = cmc + cmc_tmp + ap += ap_tmp + + if num_no_gt > 0: + logger.error(f'{num_no_gt} query imgs do not have groundtruth.') + + cmc = cmc / (num_q - num_no_gt) + mean_ap = ap / (num_q - num_no_gt) + + return cmc, mean_ap + + +@torch.no_grad() +def extract_feature(model, dataloader): + """Extract features for validation.""" + features, pids, camids = [], [], [] + for imgs, (batch_pids, batch_camids) in dataloader: + flip_imgs = fliplr(imgs) + imgs, flip_imgs = imgs.cuda(), flip_imgs.cuda() + batch_features = model(imgs).data + batch_features_flip = model(flip_imgs).data + batch_features += batch_features_flip + + features.append(batch_features) + pids.append(batch_pids) + camids.append(batch_camids) + features = torch.cat(features, 0) + pids = torch.cat(pids, 0).numpy() + camids = torch.cat(camids, 0).numpy() + + return features, pids, camids + + +def fliplr(img): + """Flip horizontal.""" + inv_idx = torch.arange(img.size(3) - 1, -1, -1).long() # N x C x H x W + img_flip = img.index_select(3, inv_idx) + + return img_flip + + +class RandomIdentitySampler(Sampler): + """ + Random Sampler. + + Randomly sample N identities, then for each identity, + randomly sample K instances, therefore batch size is N*K. + + Args: + - data_source (Dataset): dataset to sample from. + - num_instances (int): number of instances per identity. 
+ """ + + def __init__(self, data_source, num_instances=4): + """Initialize Sampler.""" + self.data_source = data_source + self.num_instances = num_instances + self.index_dic = defaultdict(list) + for index, (_, (pid, _)) in enumerate(data_source): + self.index_dic[pid].append(index) + self.pids = list(self.index_dic.keys()) + self.num_identities = len(self.pids) + + # compute number of examples in an epoch + self.length = 0 + for pid in self.pids: + idxs = self.index_dic[pid] + num = len(idxs) + if num < self.num_instances: + num = self.num_instances + self.length += num - num % self.num_instances + + def __iter__(self): + """Iterate over Sampler.""" + list_container = [] + + for pid in self.pids: + idxs = copy.deepcopy(self.index_dic[pid]) + if len(idxs) < self.num_instances: + idxs = np.random.choice(idxs, size=self.num_instances, replace=True) + random.shuffle(idxs) + batch_idxs = [] + for idx in idxs: + batch_idxs.append(idx) + if len(batch_idxs) == self.num_instances: + list_container.append(batch_idxs) + batch_idxs = [] + + random.shuffle(list_container) + + ret = [] + for batch_idxs in list_container: + ret.extend(batch_idxs) + + return iter(ret) + + def __len__(self): + """Return number of examples in an epoch.""" + return self.length diff --git a/openfl-tutorials/interactive_api/Pytorch_Market_Re-ID/workspace/transforms.py b/openfl-tutorials/interactive_api/Pytorch_Market_Re-ID/workspace/transforms.py new file mode 100644 index 0000000000..e0bc5430fd --- /dev/null +++ b/openfl-tutorials/interactive_api/Pytorch_Market_Re-ID/workspace/transforms.py @@ -0,0 +1,103 @@ +# Copyright (C) 2020-2021 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +"""Image transform tools.""" + +import math +import random + +from PIL import Image + + +class ResizeRandomCropping(object): + """ + With a probability, first increase image size to (1 + 1/8), and then perform random crop. + + Args: + height (int): target height. + width (int): target width. + p (float): probability of performing this transformation. Default: 0.5. + """ + + def __init__(self, height, width, p=0.5, interpolation=Image.BILINEAR): + """Initialize cropping.""" + self.height = height + self.width = width + self.p = p + self.interpolation = interpolation + + def __call__(self, img): + """ + Call of cropping. + + Args: + img (PIL Image): Image to be cropped. + Returns: + PIL Image: Cropped image. + """ + if random.uniform(0, 1) >= self.p: + return img.resize((self.width, self.height), self.interpolation) + + new_width, new_height = int(round(self.width * 1.125)), int(round(self.height * 1.125)) + resized_img = img.resize((new_width, new_height), self.interpolation) + x_maxrange = new_width - self.width + y_maxrange = new_height - self.height + x1 = int(round(random.uniform(0, x_maxrange))) + y1 = int(round(random.uniform(0, y_maxrange))) + cropped_img = resized_img.crop((x1, y1, x1 + self.width, y1 + self.height)) + + return cropped_img + + +class RandomErasing(object): + """ + Randomly selects a rectangle region in an image and erases its pixels. + + 'Random Erasing Data Augmentation' by Zhong et al. + See https://arxiv.org/pdf/1708.04896.pdf + + Args: + probability: The probability that the Random Erasing operation will be performed. + sl: Minimum proportion of erased area against input image. + sh: Maximum proportion of erased area against input image. + r1: Minimum aspect ratio of erased area. + mean: Erasing value. 
+ """ + + def __init__(self, probability=0.5, sl=0.02, sh=0.4, r1=0.3, mean=None): + """Initialize Erasing.""" + if not mean: + mean = [0.4914, 0.4822, 0.4465] + + self.probability = probability + self.mean = mean + self.sl = sl + self.sh = sh + self.r1 = r1 + + def __call__(self, img): + """Call of Erasing.""" + if random.uniform(0, 1) >= self.probability: + return img + + for _attempt in range(100): + area = img.size()[1] * img.size()[2] + + target_area = random.uniform(self.sl, self.sh) * area + aspect_ratio = random.uniform(self.r1, 1 / self.r1) + + h = int(round(math.sqrt(target_area * aspect_ratio))) + w = int(round(math.sqrt(target_area / aspect_ratio))) + + if w < img.size()[2] and h < img.size()[1]: + x1 = random.randint(0, img.size()[1] - h) + y1 = random.randint(0, img.size()[2] - w) + if img.size()[0] == 3: + img[0, x1:x1 + h, y1:y1 + w] = self.mean[0] + img[1, x1:x1 + h, y1:y1 + w] = self.mean[1] + img[2, x1:x1 + h, y1:y1 + w] = self.mean[2] + else: + img[0, x1:x1 + h, y1:y1 + w] = self.mean[0] + return img + + return img diff --git a/openfl-workspace/default/director.yaml b/openfl-workspace/default/director.yaml index 32fd4caa30..fa6d0544e4 100644 --- a/openfl-workspace/default/director.yaml +++ b/openfl-workspace/default/director.yaml @@ -4,8 +4,11 @@ # Director's config. # Parameters: # 1. sample_shape - sample shape interface unified across the Federation -# 1. target_shape - target shape interface unified across the Federation +# 2. target_shape - target shape interface unified across the Federation settings: sample_shape: [] - target_shape: [] \ No newline at end of file + target_shape: [] + listen_host: localhost # listen FQDN or ip + listen_port: 50051 # listen port + envoy_health_check_period: 60 # in seconds \ No newline at end of file diff --git a/openfl-workspace/tf_cnn_histology/plan/plan.yaml b/openfl-workspace/tf_cnn_histology/plan/plan.yaml index 3fe6f8809a..52a660f949 100644 --- a/openfl-workspace/tf_cnn_histology/plan/plan.yaml +++ b/openfl-workspace/tf_cnn_histology/plan/plan.yaml @@ -61,5 +61,4 @@ tasks: batch_size: 32 epochs: 1 metrics: - - loss - num_batches: 1 + - loss \ No newline at end of file diff --git a/openfl-workspace/workspace/plan/defaults/tasks_tensorflow.yaml b/openfl-workspace/workspace/plan/defaults/tasks_tensorflow.yaml index 586a885b40..6d000cc618 100644 --- a/openfl-workspace/workspace/plan/defaults/tasks_tensorflow.yaml +++ b/openfl-workspace/workspace/plan/defaults/tasks_tensorflow.yaml @@ -18,6 +18,6 @@ train: function : train_batches kwargs : batch_size : 32 - num_batches : 1 metrics : - loss + epochs : 1 diff --git a/openfl-workspace/workspace/plan/defaults/tasks_torch.yaml b/openfl-workspace/workspace/plan/defaults/tasks_torch.yaml index a240c2003b..f41b0c3600 100644 --- a/openfl-workspace/workspace/plan/defaults/tasks_torch.yaml +++ b/openfl-workspace/workspace/plan/defaults/tasks_torch.yaml @@ -17,3 +17,4 @@ train: kwargs : metrics : - loss + epochs : 1 diff --git a/openfl/component/ca/ca.py b/openfl/component/ca/ca.py index 3b3aad9021..1446ad5007 100644 --- a/openfl/component/ca/ca.py +++ b/openfl/component/ca/ca.py @@ -21,6 +21,10 @@ logger = getLogger(__name__) TOKEN_DELIMITER = '.' 
+CA_STEP_CONFIG_DIR = Path('step_config') +CA_PKI_DIR = Path('cert') +CA_PASSWORD_FILE = Path('pass_file') +CA_CONFIG_JSON = Path('config/ca.json') def download_step_bin(url, grep_name, architecture, prefix='.', confirmation=True): @@ -66,14 +70,14 @@ def get_token(name, ca_url, ca_path='.'): ca_path: path to ca folder """ ca_path = Path(ca_path) - step_config_dir = ca_path / 'step_config' - pki_dir = ca_path / 'cert' + step_config_dir = ca_path / CA_STEP_CONFIG_DIR + pki_dir = ca_path / CA_PKI_DIR step_path, _ = get_ca_bin_paths(ca_path) if not step_path: raise Exception('Step-CA is not installed!\nRun `fx pki install` first') priv_json = step_config_dir / 'secrets' / 'priv.json' - pass_file = pki_dir / 'pass_file' + pass_file = pki_dir / CA_PASSWORD_FILE root_crt = step_config_dir / 'certs' / 'root_ca.crt' try: token = subprocess.check_output( @@ -154,7 +158,7 @@ def install(ca_path, ca_url, password): ca_path = Path(ca_path) ca_path.mkdir(parents=True, exist_ok=True) - step_config_dir = ca_path / 'step_config' + step_config_dir = ca_path / CA_STEP_CONFIG_DIR os.environ['STEPPATH'] = str(step_config_dir) step_path, step_ca_path = get_ca_bin_paths(ca_path) @@ -164,7 +168,7 @@ def install(ca_path, ca_url, password): download_step_bin(url, 'step-ca_linux', 'amd', prefix=ca_path, confirmation=False) url = 'http://api.github.com/repos/smallstep/cli/releases/latest' download_step_bin(url, 'step_linux', 'amd', prefix=ca_path, confirmation=False) - step_config_dir = ca_path / 'step_config' + step_config_dir = ca_path / CA_STEP_CONFIG_DIR if (not step_config_dir.exists() or confirm('CA exists, do you want to recreate it?', default=True)): _create_ca(ca_path, ca_url, password) @@ -200,8 +204,8 @@ def _check_kill_process(pstring, confirmation=False): def _create_ca(ca_path: Path, ca_url: str, password: str): """Create a ca workspace.""" - pki_dir = ca_path / 'cert' - step_config_dir = ca_path / 'step_config' + pki_dir = ca_path / CA_PKI_DIR + step_config_dir = ca_path / CA_STEP_CONFIG_DIR pki_dir.mkdir(parents=True, exist_ok=True) step_config_dir.mkdir(parents=True, exist_ok=True) @@ -229,7 +233,7 @@ def _create_ca(ca_path: Path, ca_url: str, password: str): def _configure(step_config_dir): - conf_file = step_config_dir / 'config' / 'ca.json' + conf_file = step_config_dir / CA_CONFIG_JSON with open(conf_file, 'r+') as f: data = json.load(f) data.setdefault('authority', {}).setdefault('claims', {}) diff --git a/openfl/component/director/director.py b/openfl/component/director/director.py index 0ac6d70147..6409729dd9 100644 --- a/openfl/component/director/director.py +++ b/openfl/component/director/director.py @@ -16,13 +16,23 @@ logger = logging.getLogger(__name__) +ENVOY_HEALTH_CHECK_PERIOD = 60 # in seconds + class Director: """Director class.""" - def __init__(self, *, tls: bool = True, - root_certificate: Path = None, private_key: Path = None, certificate: Path = None, - sample_shape: list = None, target_shape: list = None) -> None: + def __init__( + self, + *, + tls: bool = True, + root_certificate: Path = None, + private_key: Path = None, + certificate: Path = None, + sample_shape: list = None, + target_shape: list = None, + settings: dict = None + ) -> None: """Initialize a director object.""" # TODO: add working directory super().__init__() @@ -40,6 +50,7 @@ def __init__(self, *, tls: bool = True, self.root_certificate = root_certificate self.private_key = private_key self.certificate = certificate + self.settings = settings or {} def acknowledge_shard(self, shard_info: 
director_pb2.ShardInfo) -> bool: """Save shard info to shard registry if it's acceptable.""" @@ -160,23 +171,21 @@ def remove_experiment_data(self, experiment_name: str, caller: str): if experiment_name in self.experiment_stash.get(caller, {}): del self.experiment_stash[caller][experiment_name] - def collaborator_health_check(self, *, collaborator_name: str, - is_experiment_running: bool, - valid_duration: int) -> bool: + def collaborator_health_check( + self, *, collaborator_name: str, is_experiment_running: bool + ) -> int: """Accept health check from envoy.""" - is_accepted = False shard_info = self._shard_registry.get(collaborator_name) if not shard_info: - logger.error(f'Unknown shard {collaborator_name}') - return is_accepted - is_accepted = True + raise Exception(f'Unknown shard {collaborator_name}') + hc_period = self.settings.get('envoy_health_check_period', ENVOY_HEALTH_CHECK_PERIOD) shard_info['is_online']: True shard_info['is_experiment_running'] = is_experiment_running - shard_info['valid_duration'] = valid_duration + shard_info['valid_duration'] = 2 * hc_period shard_info['last_updated'] = time.time() - return is_accepted + return hc_period def get_envoys(self) -> list: """Get a status information about envoys.""" diff --git a/openfl/component/envoy/envoy.py b/openfl/component/envoy/envoy.py index bbd4f26e76..26a4332478 100644 --- a/openfl/component/envoy/envoy.py +++ b/openfl/component/envoy/envoy.py @@ -17,7 +17,6 @@ logger = logging.getLogger(__name__) -DEFAULT_TIMEOUT_IN_SECONDS = 60 # TODO: make configurable DEFAULT_RETRY_TIMEOUT_IN_SECONDS = 5 @@ -56,7 +55,7 @@ def run(self): experiment_name = self.director_client.wait_experiment() data_stream = self.director_client.get_experiment_data(experiment_name) except Exception as exc: - logger.error(f'Failed to get experiment: {exc}') + logger.exception(f'Failed to get experiment: {exc}') time.sleep(DEFAULT_RETRY_TIMEOUT_IN_SECONDS) continue data_file_path = self._save_data_stream_to_file(data_stream) @@ -65,9 +64,9 @@ def run(self): with ExperimentWorkspace( experiment_name, data_file_path, is_install_requirements=True ): - self._run_collaborator(experiment_name) + self._run_collaborator() except Exception as exc: - logger.error(f'Collaborator failed: {exc}') + logger.exception(f'Collaborator failed with error: {exc}:') finally: # Workspace cleaning should not be done by gRPC client! 
self.is_experiment_running = False @@ -87,14 +86,13 @@ def send_health_check(self): """Send health check to the director.""" logger.info('The health check sender is started.') while True: - self.director_client.send_health_check( - self.name, - self.is_experiment_running, - DEFAULT_TIMEOUT_IN_SECONDS + timeout = self.director_client.send_health_check( + collaborator_name=self.name, + is_experiment_running=self.is_experiment_running ) - time.sleep(DEFAULT_TIMEOUT_IN_SECONDS / 2) + time.sleep(timeout) - def _run_collaborator(self, experiment_name, plan='plan/plan.yaml', ): + def _run_collaborator(self, plan='plan/plan.yaml'): """Run the collaborator for the experiment running.""" plan = Plan.parse(plan_config_path=Path(plan)) @@ -111,8 +109,7 @@ def start(self): try: is_accepted = self.director_client.report_shard_info(self.shard_descriptor) except Exception as exc: - logger.exception(str(exc)) - logger.exception('Failed to report shard info') + logger.exception(f'Failed to report shard info: {exc}') else: if is_accepted: # Shard accepted for participation in the federation diff --git a/openfl/federated/data/federated_data.py b/openfl/federated/data/federated_data.py index 948b16d864..c66c5df891 100644 --- a/openfl/federated/data/federated_data.py +++ b/openfl/federated/data/federated_data.py @@ -5,6 +5,8 @@ import numpy as np +from openfl.plugins.data_splitters import EqualNumPyDataSplitter +from openfl.plugins.data_splitters import NumPyDataSplitter from .loader_pt import PyTorchDataLoader @@ -29,8 +31,11 @@ class FederatedDataSet(PyTorchDataLoader): """ + train_splitter: NumPyDataSplitter + valid_splitter: NumPyDataSplitter + def __init__(self, X_train, y_train, X_valid, y_valid, - batch_size=1, num_classes=None, **kwargs): + batch_size=1, num_classes=None, train_splitter=None, valid_splitter=None): """ Initialize. @@ -47,6 +52,10 @@ def __init__(self, X_train, y_train, X_valid, y_valid, The batch size for the data loader num_classes : int The number of classes the model will be trained on + train_splitter: NumPyDataSplitter + Data splitter for train dataset. + valid_splitter: NumPyDataSplitter + Data splitter for validation dataset. **kwargs: Additional arguments to pass to the function """ @@ -61,8 +70,19 @@ def __init__(self, X_train, y_train, X_valid, y_valid, num_classes = np.unique(self.y_train).shape[0] print(f'Inferred {num_classes} classes from the provided labels...') self.num_classes = num_classes + self.train_splitter = self._get_splitter_or_default(train_splitter) + self.valid_splitter = self._get_splitter_or_default(valid_splitter) + + @staticmethod + def _get_splitter_or_default(value): + if value is None: + return EqualNumPyDataSplitter() + if isinstance(value, NumPyDataSplitter): + return value + else: + raise NotImplementedError(f'Data splitter {value} is not supported') - def split(self, num_collaborators, shuffle=True, equally=False): + def split(self, num_collaborators): """Create a Federated Dataset for each of the collaborators. 
Args: @@ -77,42 +97,18 @@ def split(self, num_collaborators, shuffle=True, equally=False): list[FederatedDataSets] A dataset slice for each collaborator """ - if shuffle: - train_shuffle = np.random.choice( - len(self.X_train), len(self.X_train), replace=False - ) - self.X_train = self.X_train[train_shuffle] - self.y_train = self.y_train[train_shuffle] - val_shuffle = np.random.choice( - len(self.X_valid), len(self.X_valid), replace=False - ) - self.X_valid = self.X_valid[val_shuffle] - self.y_valid = self.y_valid[val_shuffle] - - if equally: - X_train = np.array_split(self.X_train, num_collaborators) - y_train = np.array_split(self.y_train, num_collaborators) - X_valid = np.array_split(self.X_valid, num_collaborators) - y_valid = np.array_split(self.y_valid, num_collaborators) - else: - train_split = np.sort(np.random.choice( - len(self.X_train), num_collaborators - 1, replace=False) - ) - val_split = np.sort(np.random.choice( - len(self.X_val), num_collaborators - 1, replace=False) - ) - X_train = np.split(self.X_train, train_split) - y_train = np.split(self.y_train, train_split) - X_valid = np.split(self.X_valid, val_split) - y_valid = np.split(self.y_valid, val_split) + train_idx = self.train_splitter.split(self.y_train, num_collaborators) + valid_idx = self.valid_splitter.split(self.y_valid, num_collaborators) return [ FederatedDataSet( - X_train[i], - y_train[i], - X_valid[i], - y_valid[i], + self.X_train[train_idx[i]], + self.y_train[train_idx[i]], + self.X_valid[valid_idx[i]], + self.y_valid[valid_idx[i]], batch_size=self.batch_size, - num_classes=self.num_classes + num_classes=self.num_classes, + train_splitter=self.train_splitter, + valid_splitter=self.valid_splitter ) for i in range(num_collaborators) ] diff --git a/openfl/federated/plan/plan.py b/openfl/federated/plan/plan.py index 8d737dbbe4..69c26db4eb 100644 --- a/openfl/federated/plan/plan.py +++ b/openfl/federated/plan/plan.py @@ -147,9 +147,9 @@ def parse(plan_config_path: Path, cols_config_path: Path = None, return plan except Exception: - Plan.logger.error(f'Parsing Federated Learning Plan : ' - f'[red]FAILURE[/] : [blue]{plan_config_path}[/].', - extra={'markup': True}) + Plan.logger.exception(f'Parsing Federated Learning Plan : ' + f'[red]FAILURE[/] : [blue]{plan_config_path}[/].', + extra={'markup': True}) raise @staticmethod diff --git a/openfl/federated/task/fl_model.py b/openfl/federated/task/fl_model.py index 2592de1862..c1b565bdbe 100644 --- a/openfl/federated/task/fl_model.py +++ b/openfl/federated/task/fl_model.py @@ -96,6 +96,4 @@ def setup(self, num_collaborators, **kwargs): data_loader=data_slice, **kwargs ) - for data_slice in self.data_loader.split( - num_collaborators, equally=True - )] + for data_slice in self.data_loader.split(num_collaborators)] diff --git a/openfl/federated/task/runner_keras.py b/openfl/federated/task/runner_keras.py index d03c04e3a4..93ac40b1d0 100644 --- a/openfl/federated/task/runner_keras.py +++ b/openfl/federated/task/runner_keras.py @@ -60,9 +60,10 @@ def rebuild_model(self, round_num, input_tensor_dict, validation=False): else: self.set_tensor_dict(input_tensor_dict, with_opt_vars=False) - def train(self, col_name, round_num, input_tensor_dict, metrics, num_batches=None, **kwargs): + def train(self, col_name, round_num, input_tensor_dict, + metrics, epochs=1, batch_size=1, **kwargs): """ - Perform the training for a specified number of batches. + Perform the training. Is expected to perform draws randomly, without replacement until data is exausted. 
Then data is replaced and shuffled and draws continue. @@ -77,10 +78,11 @@ def train(self, col_name, round_num, input_tensor_dict, metrics, num_batches=Non # rebuild model with updated weights self.rebuild_model(round_num, input_tensor_dict) - - results = self.train_iteration(self.data_loader.get_train_loader(num_batches), - metrics=metrics, - **kwargs) + for epoch in range(epochs): + self.logger.info(f'Run {epoch} epoch of {round_num} round') + results = self.train_iteration(self.data_loader.get_train_loader(batch_size), + metrics=metrics, + **kwargs) # output metric tensors (scalar) origin = col_name diff --git a/openfl/federated/task/runner_pt.py b/openfl/federated/task/runner_pt.py index 8d6d235e25..2905f09798 100644 --- a/openfl/federated/task/runner_pt.py +++ b/openfl/federated/task/runner_pt.py @@ -129,7 +129,7 @@ def validate(self, col_name, round_num, input_tensor_dict, return output_tensor_dict, {} def train_batches(self, col_name, round_num, input_tensor_dict, - num_batches=None, use_tqdm=False, **kwargs): + use_tqdm=False, epochs=1, **kwargs): """Train batches. Train the model on the requested number of batches. @@ -138,9 +138,8 @@ def train_batches(self, col_name, round_num, input_tensor_dict, col_name: Name of the collaborator round_num: What round is it input_tensor_dict: Required input tensors (for model) - num_batches: The number of batches to train on before - returning use_tqdm (bool): Use tqdm to print a progress bar (Default=True) + epochs: The number of epochs to train Returns: global_output_dict: Tensors to send back to the aggregator @@ -150,10 +149,12 @@ def train_batches(self, col_name, round_num, input_tensor_dict, # set to "training" mode self.train() self.to(self.device) - loader = self.data_loader.get_train_loader(num_batches) - if use_tqdm: - loader = tqdm.tqdm(loader, desc='train epoch') - metric = self.train_epoch(loader) + for epoch in range(epochs): + self.logger.info(f'Run {epoch} epoch of {round_num} round') + loader = self.data_loader.get_train_loader() + if use_tqdm: + loader = tqdm.tqdm(loader, desc='train epoch') + metric = self.train_epoch(loader) # Output metric tensors (scalar) origin = col_name tags = ('trained',) diff --git a/openfl/federated/task/runner_tf.py b/openfl/federated/task/runner_tf.py index a3f09cf99d..620a0ca0dd 100644 --- a/openfl/federated/task/runner_tf.py +++ b/openfl/federated/task/runner_tf.py @@ -84,17 +84,17 @@ def rebuild_model(self, round_num, input_tensor_dict, validation=False): self.set_tensor_dict(input_tensor_dict, with_opt_vars=False) def train_batches(self, col_name, round_num, input_tensor_dict, - num_batches, use_tqdm=False, **kwargs): + epochs=1, use_tqdm=False, **kwargs): """ - Perform the training for a specified number of batches. + Perform the training. Is expected to perform draws randomly, without replacement until data is exausted. Then data is replaced and shuffled and draws continue. 
Args: - num_batches: Number of batches to train on use_tqdm (bool): True = use tqdm to print a progress bar (Default=False) + epochs (int): Number of epochs to train Returns: float: loss metric """ @@ -107,22 +107,17 @@ def train_batches(self, col_name, round_num, input_tensor_dict, self.rebuild_model(round_num, input_tensor_dict) tf.keras.backend.set_learning_phase(True) - losses = [] - batch_num = 0 - while batch_num < num_batches: + for epoch in range(epochs): + self.logger.info(f'Run {epoch} epoch of {round_num} round') # get iterator for batch draws (shuffling happens here) gen = self.data_loader.get_train_loader(batch_size) if use_tqdm: gen = tqdm.tqdm(gen, desc='training epoch') for (X, y) in gen: - if batch_num >= num_batches: - break - else: - losses.append(self.train_batch(X, y)) - batch_num += 1 + losses.append(self.train_batch(X, y)) # Output metric tensors (scalar) origin = col_name diff --git a/openfl/interface/aggregator.py b/openfl/interface/aggregator.py index 2abd4f0233..df37f85feb 100644 --- a/openfl/interface/aggregator.py +++ b/openfl/interface/aggregator.py @@ -62,7 +62,7 @@ def generate_cert_request(fqdn): from openfl.cryptography.participant import generate_csr from openfl.cryptography.io import write_crt from openfl.cryptography.io import write_key - from openfl.interface.cli_helper import PKI_DIR + from openfl.interface.cli_helper import CERT_DIR if fqdn is None: fqdn = getfqdn_env() @@ -77,14 +77,14 @@ def generate_cert_request(fqdn): server_private_key, server_csr = generate_csr(common_name, server=True) - (PKI_DIR / 'server').mkdir(parents=True, exist_ok=True) + (CERT_DIR / 'server').mkdir(parents=True, exist_ok=True) echo(' Writing AGGREGATOR certificate key pair to: ' + style( - f'{PKI_DIR}/server', fg='green')) + f'{CERT_DIR}/server', fg='green')) # Write aggregator csr and key to disk - write_crt(server_csr, PKI_DIR / 'server' / f'{file_name}.csr') - write_key(server_private_key, PKI_DIR / 'server' / f'{file_name}.key') + write_crt(server_csr, CERT_DIR / 'server' / f'{file_name}.csr') + write_key(server_private_key, CERT_DIR / 'server' / f'{file_name}.key') # TODO: function not used @@ -119,7 +119,7 @@ def certify(fqdn, silent): from openfl.cryptography.io import read_csr from openfl.cryptography.io import read_key from openfl.cryptography.io import write_crt - from openfl.interface.cli_helper import PKI_DIR + from openfl.interface.cli_helper import CERT_DIR if fqdn is None: fqdn = getfqdn_env() @@ -131,28 +131,28 @@ def certify(fqdn, silent): signing_crt_path = 'ca/signing-ca.crt' # Load CSR - if not Path(PKI_DIR / f'{cert_name}.csr').exists(): + if not Path(CERT_DIR / f'{cert_name}.csr').exists(): echo(style('Aggregator certificate signing request not found.', fg='red') + ' Please run `fx aggregator generate-cert-request`' - ' to generate the certificate request.') + ' to generate the certificate request.') - csr, csr_hash = read_csr(PKI_DIR / f'{cert_name}.csr') + csr, csr_hash = read_csr(CERT_DIR / f'{cert_name}.csr') # Load private signing key - if not Path(PKI_DIR / signing_key_path).exists(): + if not Path(CERT_DIR / signing_key_path).exists(): echo(style('Signing key not found.', fg='red') + ' Please run `fx workspace certify`' - ' to initialize the local certificate authority.') + ' to initialize the local certificate authority.') - signing_key = read_key(PKI_DIR / signing_key_path) + signing_key = read_key(CERT_DIR / signing_key_path) # Load signing cert - if not Path(PKI_DIR / signing_crt_path).exists(): + if not Path(CERT_DIR / 
signing_crt_path).exists(): echo(style('Signing certificate not found.', fg='red') + ' Please run `fx workspace certify`' - ' to initialize the local certificate authority.') + ' to initialize the local certificate authority.') - signing_crt = read_crt(PKI_DIR / signing_crt_path) + signing_crt = read_crt(CERT_DIR / signing_crt_path) echo('The CSR Hash for file ' + style(f'{cert_name}.csr', fg='green') @@ -163,7 +163,7 @@ def certify(fqdn, silent): echo(' Signing AGGREGATOR certificate') signed_agg_cert = sign_certificate(csr, signing_key, signing_crt.subject) - write_crt(signed_agg_cert, PKI_DIR / f'{cert_name}.crt') + write_crt(signed_agg_cert, CERT_DIR / f'{cert_name}.crt') else: @@ -171,7 +171,7 @@ def certify(fqdn, silent): echo(' Signing AGGREGATOR certificate') signed_agg_cert = sign_certificate(csr, signing_key, signing_crt.subject) - write_crt(signed_agg_cert, PKI_DIR / f'{cert_name}.crt') + write_crt(signed_agg_cert, CERT_DIR / f'{cert_name}.crt') else: echo(style('Not signing certificate.', fg='red') diff --git a/openfl/interface/cli_helper.py b/openfl/interface/cli_helper.py index 18ab62294d..e4f02b1fdb 100644 --- a/openfl/interface/cli_helper.py +++ b/openfl/interface/cli_helper.py @@ -18,8 +18,8 @@ SITEPACKS = Path(__file__).parent.parent.parent WORKSPACE = SITEPACKS / 'openfl-workspace' TUTORIALS = SITEPACKS / 'openfl-tutorials' -PKI_DIR = Path('cert') OPENFL_USERDIR = Path.home() / '.openfl' +CERT_DIR = Path('cert') def pretty(o): diff --git a/openfl/interface/collaborator.py b/openfl/interface/collaborator.py index fdb49f4e20..4777540301 100644 --- a/openfl/interface/collaborator.py +++ b/openfl/interface/collaborator.py @@ -119,7 +119,7 @@ def generate_cert_request(collaborator_name, data_path, silent, skip_package): from openfl.cryptography.participant import generate_csr from openfl.cryptography.io import write_crt from openfl.cryptography.io import write_key - from openfl.interface.cli_helper import PKI_DIR + from openfl.interface.cli_helper import CERT_DIR common_name = f'{collaborator_name}'.lower() subject_alternative_name = f'DNS:{common_name}' @@ -131,14 +131,14 @@ def generate_cert_request(collaborator_name, data_path, silent, skip_package): client_private_key, client_csr = generate_csr(common_name, server=False) - (PKI_DIR / 'client').mkdir(parents=True, exist_ok=True) + (CERT_DIR / 'client').mkdir(parents=True, exist_ok=True) echo(' Moving COLLABORATOR certificate to: ' + style( - f'{PKI_DIR}/{file_name}', fg='green')) + f'{CERT_DIR}/{file_name}', fg='green')) # Write collaborator csr and key to disk - write_crt(client_csr, PKI_DIR / 'client' / f'{file_name}.csr') - write_key(client_private_key, PKI_DIR / 'client' / f'{file_name}.key') + write_crt(client_csr, CERT_DIR / 'client' / f'{file_name}.csr') + write_key(client_private_key, CERT_DIR / 'client' / f'{file_name}.key') if not skip_package: from shutil import copytree @@ -159,7 +159,7 @@ def generate_cert_request(collaborator_name, data_path, silent, skip_package): ignore = ignore_patterns('__pycache__', '*.key', '*.srl', '*.pem') # Copy the current directory into the temporary directory - copytree(f'{PKI_DIR}/client', tmp_dir, ignore=ignore) + copytree(f'{CERT_DIR}/client', tmp_dir, ignore=ignore) for f in glob(f'{tmp_dir}/*'): if common_name not in basename(f): @@ -266,15 +266,15 @@ def certify(collaborator_name, silent, request_pkg=False, import_=False): from openfl.cryptography.io import read_csr from openfl.cryptography.io import read_key from openfl.cryptography.io import write_crt - from 
openfl.interface.cli_helper import PKI_DIR + from openfl.interface.cli_helper import CERT_DIR common_name = f'{collaborator_name}'.lower() if not import_: if request_pkg: - Path(f'{PKI_DIR}/client').mkdir(parents=True, exist_ok=True) - unpack_archive(request_pkg, extract_dir=f'{PKI_DIR}/client') - csr = glob(f'{PKI_DIR}/client/*.csr')[0] + Path(f'{CERT_DIR}/client').mkdir(parents=True, exist_ok=True) + unpack_archive(request_pkg, extract_dir=f'{CERT_DIR}/client') + csr = glob(f'{CERT_DIR}/client/*.csr')[0] else: if collaborator_name is None: echo('collaborator_name can only be omitted if signing\n' @@ -283,8 +283,8 @@ def certify(collaborator_name, silent, request_pkg=False, import_=False): 'Example: fx collaborator certify --request-pkg ' 'col_one_to_agg_cert_request.zip') return - csr = glob(f'{PKI_DIR}/client/col_{common_name}.csr')[0] - copy(csr, PKI_DIR) + csr = glob(f'{CERT_DIR}/client/col_{common_name}.csr')[0] + copy(csr, CERT_DIR) cert_name = splitext(csr)[0] file_name = basename(cert_name) signing_key_path = 'ca/signing-ca/private/signing-ca.key' @@ -299,20 +299,20 @@ def certify(collaborator_name, silent, request_pkg=False, import_=False): csr, csr_hash = read_csr(f'{cert_name}.csr') # Load private signing key - if not Path(PKI_DIR / signing_key_path).exists(): + if not Path(CERT_DIR / signing_key_path).exists(): echo(style('Signing key not found.', fg='red') + ' Please run `fx workspace certify`' ' to initialize the local certificate authority.') - signing_key = read_key(PKI_DIR / signing_key_path) + signing_key = read_key(CERT_DIR / signing_key_path) # Load signing cert - if not Path(PKI_DIR / signing_crt_path).exists(): + if not Path(CERT_DIR / signing_crt_path).exists(): echo(style('Signing certificate not found.', fg='red') + ' Please run `fx workspace certify`' ' to initialize the local certificate authority.') - signing_crt = read_crt(PKI_DIR / signing_crt_path) + signing_crt = read_crt(CERT_DIR / signing_crt_path) echo('The CSR Hash for file ' + style(f'{file_name}.csr', fg='green') @@ -324,7 +324,7 @@ def certify(collaborator_name, silent, request_pkg=False, import_=False): echo(' Signing COLLABORATOR certificate') signed_col_cert = sign_certificate(csr, signing_key, signing_crt.subject) write_crt(signed_col_cert, f'{cert_name}.crt') - register_collaborator(PKI_DIR / 'client' / f'{file_name}.crt') + register_collaborator(CERT_DIR / 'client' / f'{file_name}.crt') else: @@ -333,7 +333,7 @@ def certify(collaborator_name, silent, request_pkg=False, import_=False): echo(' Signing COLLABORATOR certificate') signed_col_cert = sign_certificate(csr, signing_key, signing_crt.subject) write_crt(signed_col_cert, f'{cert_name}.crt') - register_collaborator(PKI_DIR / 'client' / f'{file_name}.crt') + register_collaborator(CERT_DIR / 'client' / f'{file_name}.crt') else: echo(style('Not signing certificate.', fg='red') @@ -357,18 +357,18 @@ def certify(collaborator_name, silent, request_pkg=False, import_=False): Path(f'{tmp_dir}/client').mkdir(parents=True, exist_ok=True) # Copy the signed cert to the temporary directory - copy(f'{PKI_DIR}/client/{file_name}.crt', f'{tmp_dir}/client/') + copy(f'{CERT_DIR}/client/{file_name}.crt', f'{tmp_dir}/client/') # Copy the CA certificate chain to the temporary directory - copy(f'{PKI_DIR}/cert_chain.crt', tmp_dir) + copy(f'{CERT_DIR}/cert_chain.crt', tmp_dir) # Create Zip archive of directory make_archive(archive_name, archive_type, tmp_dir) else: # Copy the signed certificate and cert chain into PKI_DIR - previous_crts = 
glob(f'{PKI_DIR}/client/*.crt') - unpack_archive(import_, extract_dir=PKI_DIR) - updated_crts = glob(f'{PKI_DIR}/client/*.crt') + previous_crts = glob(f'{CERT_DIR}/client/*.crt') + unpack_archive(import_, extract_dir=CERT_DIR) + updated_crts = glob(f'{CERT_DIR}/client/*.crt') cert_difference = list(set(updated_crts) - set(previous_crts)) if len(cert_difference) != 0: crt = basename(cert_difference[0]) diff --git a/openfl/interface/director.py b/openfl/interface/director.py index bd06dbad6d..822519455e 100644 --- a/openfl/interface/director.py +++ b/openfl/interface/director.py @@ -48,14 +48,14 @@ def start(director_config_path, tls, root_certificate, private_key, certificate) sample_shape = settings.get('sample_shape', '') target_shape = settings.get('target_shape', '') logger.info(f'Sample shape: {sample_shape}, target shape: {target_shape}') - listen_addr = settings.get('listen_addr') + listen_host = settings.get('listen_host') listen_port = settings.get('listen_port') root_certificate = root_certificate or settings.get('root_certificate') private_key = private_key or settings.get('private_key') certificate = certificate or settings.get('certificate') kwargs = {} - if listen_addr: - kwargs['listen_addr'] = listen_addr + if listen_host: + kwargs['listen_host'] = listen_host if listen_port: kwargs['listen_port'] = listen_port director_server = DirectorGRPCServer( @@ -66,6 +66,7 @@ def start(director_config_path, tls, root_certificate, private_key, certificate) root_certificate=root_certificate, private_key=private_key, certificate=certificate, + settings=settings, **kwargs ) director_server.start() diff --git a/openfl/interface/pki.py b/openfl/interface/pki.py index cd66237d20..2372ee4fc4 100644 --- a/openfl/interface/pki.py +++ b/openfl/interface/pki.py @@ -12,6 +12,10 @@ from click import password_option from click import Path as ClickPath +from openfl.component.ca.ca import CA_CONFIG_JSON +from openfl.component.ca.ca import CA_PASSWORD_FILE +from openfl.component.ca.ca import CA_PKI_DIR +from openfl.component.ca.ca import CA_STEP_CONFIG_DIR from openfl.component.ca.ca import certify from openfl.component.ca.ca import get_ca_bin_paths from openfl.component.ca.ca import get_token @@ -19,7 +23,6 @@ from openfl.component.ca.ca import remove_ca from openfl.component.ca.ca import run_ca - logger = logging.getLogger(__name__) CA_URL = 'localhost:9123' @@ -38,17 +41,17 @@ def pki(context): def run(ca_path): """Run CA server.""" ca_path = Path(ca_path) - step_config_dir = ca_path / 'step_config' - pki_dir = ca_path / 'cert' - pass_file = pki_dir / 'pass_file' - ca_json = step_config_dir / 'config' / 'ca.json' + step_config_dir = ca_path / CA_STEP_CONFIG_DIR + pki_dir = ca_path / CA_PKI_DIR + password_file = pki_dir / CA_PASSWORD_FILE + ca_json = step_config_dir / CA_CONFIG_JSON _, step_ca_path = get_ca_bin_paths(ca_path) if (not os.path.exists(step_config_dir) or not os.path.exists(pki_dir) - or not os.path.exists(pass_file) or not os.path.exists(ca_json) + or not os.path.exists(password_file) or not os.path.exists(ca_json) or not os.path.exists(step_ca_path)): logger.warning('CA is not installed or corrupted, please install it first') return - run_ca(step_ca_path, pass_file, ca_json) + run_ca(step_ca_path, password_file, ca_json) @pki.command(name='install') diff --git a/openfl/interface/workspace.py b/openfl/interface/workspace.py index 33313a6893..0e7cff50a1 100644 --- a/openfl/interface/workspace.py +++ b/openfl/interface/workspace.py @@ -216,27 +216,27 @@ def certify(): from 
openfl.cryptography.ca import generate_root_cert from openfl.cryptography.ca import generate_signing_csr from openfl.cryptography.ca import sign_certificate - from openfl.interface.cli_helper import PKI_DIR + from openfl.interface.cli_helper import CERT_DIR echo('Setting Up Certificate Authority...\n') echo('1. Create Root CA') echo('1.1 Create Directories') - (PKI_DIR / 'ca/root-ca/private').mkdir( + (CERT_DIR / 'ca/root-ca/private').mkdir( parents=True, exist_ok=True, mode=0o700) - (PKI_DIR / 'ca/root-ca/db').mkdir(parents=True, exist_ok=True) + (CERT_DIR / 'ca/root-ca/db').mkdir(parents=True, exist_ok=True) echo('1.2 Create Database') - with open(PKI_DIR / 'ca/root-ca/db/root-ca.db', 'w') as f: + with open(CERT_DIR / 'ca/root-ca/db/root-ca.db', 'w') as f: pass # write empty file - with open(PKI_DIR / 'ca/root-ca/db/root-ca.db.attr', 'w') as f: + with open(CERT_DIR / 'ca/root-ca/db/root-ca.db.attr', 'w') as f: pass # write empty file - with open(PKI_DIR / 'ca/root-ca/db/root-ca.crt.srl', 'w') as f: + with open(CERT_DIR / 'ca/root-ca/db/root-ca.crt.srl', 'w') as f: f.write('01') # write file with '01' - with open(PKI_DIR / 'ca/root-ca/db/root-ca.crl.srl', 'w') as f: + with open(CERT_DIR / 'ca/root-ca/db/root-ca.crl.srl', 'w') as f: f.write('01') # write file with '01' echo('1.3 Create CA Request and Certificate') @@ -247,12 +247,12 @@ def certify(): root_private_key, root_cert = generate_root_cert() # Write root CA certificate to disk - with open(PKI_DIR / root_crt_path, 'wb') as f: + with open(CERT_DIR / root_crt_path, 'wb') as f: f.write(root_cert.public_bytes( encoding=serialization.Encoding.PEM, )) - with open(PKI_DIR / root_key_path, 'wb') as f: + with open(CERT_DIR / root_key_path, 'wb') as f: f.write(root_private_key.private_bytes( encoding=serialization.Encoding.PEM, format=serialization.PrivateFormat.TraditionalOpenSSL, @@ -262,20 +262,20 @@ def certify(): echo('2. 
Create Signing Certificate') echo('2.1 Create Directories') - (PKI_DIR / 'ca/signing-ca/private').mkdir( + (CERT_DIR / 'ca/signing-ca/private').mkdir( parents=True, exist_ok=True, mode=0o700) - (PKI_DIR / 'ca/signing-ca/db').mkdir(parents=True, exist_ok=True) + (CERT_DIR / 'ca/signing-ca/db').mkdir(parents=True, exist_ok=True) echo('2.2 Create Database') - with open(PKI_DIR / 'ca/signing-ca/db/signing-ca.db', 'w') as f: + with open(CERT_DIR / 'ca/signing-ca/db/signing-ca.db', 'w') as f: pass # write empty file - with open(PKI_DIR / 'ca/signing-ca/db/signing-ca.db.attr', 'w') as f: + with open(CERT_DIR / 'ca/signing-ca/db/signing-ca.db.attr', 'w') as f: pass # write empty file - with open(PKI_DIR / 'ca/signing-ca/db/signing-ca.crt.srl', 'w') as f: + with open(CERT_DIR / 'ca/signing-ca/db/signing-ca.crt.srl', 'w') as f: f.write('01') # write file with '01' - with open(PKI_DIR / 'ca/signing-ca/db/signing-ca.crl.srl', 'w') as f: + with open(CERT_DIR / 'ca/signing-ca/db/signing-ca.crl.srl', 'w') as f: f.write('01') # write file with '01' echo('2.3 Create Signing Certificate CSR') @@ -287,12 +287,12 @@ def certify(): signing_private_key, signing_csr = generate_signing_csr() # Write Signing CA CSR to disk - with open(PKI_DIR / signing_csr_path, 'wb') as f: + with open(CERT_DIR / signing_csr_path, 'wb') as f: f.write(signing_csr.public_bytes( encoding=serialization.Encoding.PEM, )) - with open(PKI_DIR / signing_key_path, 'wb') as f: + with open(CERT_DIR / signing_key_path, 'wb') as f: f.write(signing_private_key.private_bytes( encoding=serialization.Encoding.PEM, format=serialization.PrivateFormat.TraditionalOpenSSL, @@ -303,7 +303,7 @@ def certify(): signing_cert = sign_certificate(signing_csr, root_private_key, root_cert.subject, ca=True) - with open(PKI_DIR / signing_crt_path, 'wb') as f: + with open(CERT_DIR / signing_crt_path, 'wb') as f: f.write(signing_cert.public_bytes( encoding=serialization.Encoding.PEM, )) @@ -311,10 +311,10 @@ def certify(): echo('3 Create Certificate Chain') # create certificate chain file by combining root-ca and signing-ca - with open(PKI_DIR / 'cert_chain.crt', 'w') as d: - with open(PKI_DIR / 'ca/root-ca.crt') as s: + with open(CERT_DIR / 'cert_chain.crt', 'w') as d: + with open(CERT_DIR / 'ca/root-ca.crt') as s: d.write(s.read()) - with open(PKI_DIR / 'ca/signing-ca.crt') as s: + with open(CERT_DIR / 'ca/signing-ca.crt') as s: d.write(s.read()) echo('\nDone.') @@ -381,8 +381,10 @@ def dockerize_(context, base_image, save): context.invoke(export_) workspace_archive = workspace_name + '.zip' - build_args = {'WORKSPACE_NAME': workspace_name, - 'BASE_IMAGE': base_image} + build_args = { + 'WORKSPACE_NAME': workspace_name, + 'BASE_IMAGE': base_image + } client = docker.from_env(timeout=3600) try: @@ -394,7 +396,7 @@ def dockerize_(context, base_image, save): dockerfile=dockerfile_workspace) except Exception as e: - echo('Faild to build the image\n' + str(e) + '\n') + echo('Failed to build the image\n' + str(e) + '\n') sys.exit(1) else: echo('The workspace image has been built successfully!') diff --git a/openfl/plugins/data_splitters/__init__.py b/openfl/plugins/data_splitters/__init__.py new file mode 100644 index 0000000000..b52ae9ded6 --- /dev/null +++ b/openfl/plugins/data_splitters/__init__.py @@ -0,0 +1,16 @@ +"""openfl.plugins.data package.""" +from openfl.plugins.data_splitters.data_splitter import DataSplitter +from openfl.plugins.data_splitters.numpy import DirichletNumPyDataSplitter +from openfl.plugins.data_splitters.numpy import EqualNumPyDataSplitter 
+from openfl.plugins.data_splitters.numpy import LogNormalNumPyDataSplitter +from openfl.plugins.data_splitters.numpy import NumPyDataSplitter +from openfl.plugins.data_splitters.numpy import RandomNumPyDataSplitter + +__all__ = [ + 'DataSplitter', + 'DirichletNumPyDataSplitter', + 'EqualNumPyDataSplitter', + 'LogNormalNumPyDataSplitter', + 'NumPyDataSplitter', + 'RandomNumPyDataSplitter', +] diff --git a/openfl/plugins/data_splitters/data_splitter.py b/openfl/plugins/data_splitters/data_splitter.py new file mode 100644 index 0000000000..12a93cea8c --- /dev/null +++ b/openfl/plugins/data_splitters/data_splitter.py @@ -0,0 +1,17 @@ +"""openfl.plugins.data_splitters.data_splitter module.""" +from abc import ABC +from abc import abstractmethod +from typing import Iterable +from typing import List +from typing import TypeVar + +T = TypeVar('T') + + +class DataSplitter(ABC): + """Base class for data splitting.""" + + @abstractmethod + def split(self, data: Iterable[T], num_collaborators: int) -> List[Iterable[T]]: + """Split the data.""" + raise NotImplementedError diff --git a/openfl/plugins/data_splitters/numpy.py b/openfl/plugins/data_splitters/numpy.py new file mode 100644 index 0000000000..d416cc0fd9 --- /dev/null +++ b/openfl/plugins/data_splitters/numpy.py @@ -0,0 +1,161 @@ +"""UnbalancedFederatedDataset module.""" + +from abc import abstractmethod +from typing import List + +import numpy as np +from tqdm import trange + +from openfl.plugins.data_splitters.data_splitter import DataSplitter + + +def get_label_count(labels, label): + """Count samples with label `label` in `labels` array.""" + return len(np.nonzero(labels == label)[0]) + + +def one_hot(labels, classes): + """Apply One-Hot encoding to labels.""" + return np.eye(classes)[labels] + + +class NumPyDataSplitter(DataSplitter): + """Base class for splitting numpy arrays of data.""" + + @abstractmethod + def split(self, data: np.ndarray, num_collaborators: int) -> List[List[int]]: + """Split the data.""" + raise NotImplementedError + + +class EqualNumPyDataSplitter(NumPyDataSplitter): + """Splits the data evenly.""" + + def __init__(self, shuffle=True): + """Initialize. + + Args: + shuffle(bool): Flag determining whether to shuffle the dataset before splitting. + """ + self.shuffle = shuffle + + def split(self, data, num_collaborators): + """Split the data.""" + idx = range(len(data)) + if self.shuffle: + idx = np.random.permutation(idx) + slices = np.array_split(idx, num_collaborators) + return slices + + +class RandomNumPyDataSplitter(NumPyDataSplitter): + """Splits the data randomly.""" + + def __init__(self, shuffle=True): + """Initialize. + + Args: + shuffle(bool): Flag determining whether to shuffle the dataset before splitting. + """ + self.shuffle = shuffle + + def split(self, data, num_collaborators): + """Split the data.""" + idx = range(len(data)) + if self.shuffle: + idx = np.random.permutation(idx) + random_idx = np.sort(np.random.choice(len(data), num_collaborators - 1, replace=False)) + + return np.split(idx, random_idx) + + +class LogNormalNumPyDataSplitter(NumPyDataSplitter): + """Unbalanced (LogNormal) dataset split.""" + + def __init__(self, mu, + sigma, + num_classes, + classes_per_col, + min_samples_per_class): + """Initialize. + + Args: + mu(float): Distribution hyperparameter. + sigma(float): Distribution hyperparameter. + classes_per_col(int): Number of classes assigned to each collaborator. + min_samples_per_class(int): Minimum number of collaborator samples of each class. 
+ """ + self.mu = mu + self.sigma = sigma + self.num_classes = num_classes + self.classes_per_col = classes_per_col + self.min_samples_per_class = min_samples_per_class + + def split(self, data, num_collaborators): + """Split the data.""" + idx = [[] for _ in range(num_collaborators)] + samples_per_col = self.classes_per_col * self.min_samples_per_class + for col in range(num_collaborators): + for c in range(self.classes_per_col): + label = (col + c) % self.num_classes + label_idx = np.nonzero(data == label)[0] + slice_start = col // self.num_classes * samples_per_col + slice_start += self.min_samples_per_class * c + slice_end = slice_start + self.min_samples_per_class + print(f'Assigning {slice_start}:{slice_end} of {label} class to {col} col...') + idx[col] += list(label_idx[slice_start:slice_end]) + assert all([len(i) == samples_per_col for i in idx]), f''' +All collaborators should have {samples_per_col} elements +but distribution is {[len(i) for i in idx]}''' + + props_shape = (self.num_classes, num_collaborators // 10, self.classes_per_col) + props = np.random.lognormal(self.mu, self.sigma, props_shape) + num_samples_per_class = [[[get_label_count(data, label) - self.min_samples_per_class]] + for label in range(self.num_classes)] + num_samples_per_class = np.array(num_samples_per_class) + props = num_samples_per_class * props / np.sum(props, (1, 2), keepdims=True) + for col in trange(num_collaborators): + for j in range(self.classes_per_col): + label = (col + j) % self.num_classes + num_samples = int(props[label, col // 10, j]) + + print(f'Trying to append {num_samples} of {label} class to {col} col...') + slice_start = np.count_nonzero(data[np.hstack(idx)] == label) + slice_end = slice_start + num_samples + if slice_end < get_label_count(data, label): + label_subset = np.nonzero(data == (col + j) % self.num_classes)[0] + idx_to_append = label_subset[slice_start:slice_end] + print(f'Appending {idx_to_append} of {label} class to {col} col...') + idx[col] = np.append(idx[col], idx_to_append) + return idx + + +class DirichletNumPyDataSplitter(NumPyDataSplitter): + """Numpy splitter according to dirichlet distribution.""" + + def __init__(self, alpha=0.5, min_samples_per_col=10): + """Initialize.""" + self.alpha = alpha + self.min_samples_per_col = min_samples_per_col + + def split(self, data, num_collaborators): + """Split the data.""" + classes = len(np.unique(data)) + min_size = 0 + + n = len(data) + while min_size < self.min_samples_per_col: + idx_batch = [[] for _ in range(num_collaborators)] + for k in range(classes): + idx_k = np.where(data == k)[0] + np.random.shuffle(idx_k) + proportions = np.random.dirichlet(np.repeat(self.alpha, num_collaborators)) + proportions = [p * (len(idx_j) < n / num_collaborators) + for p, idx_j in zip(proportions, idx_batch)] + proportions = np.array(proportions) + proportions = proportions / proportions.sum() + proportions = (np.cumsum(proportions) * len(idx_k)).astype(int)[:-1] + idx_splitted = np.split(idx_k, proportions) + idx_batch = [idx_j + idx.tolist() for idx_j, idx in zip(idx_batch, idx_splitted)] + min_size = min([len(idx_j) for idx_j in idx_batch]) + return idx_batch diff --git a/openfl/protocols/director.proto b/openfl/protocols/director.proto index 9d9156b873..0c4832cb7f 100644 --- a/openfl/protocols/director.proto +++ b/openfl/protocols/director.proto @@ -109,11 +109,10 @@ message RemoveExperimentResponse { message CollaboratorStatus { string name = 1; bool is_experiment_running = 2; - google.protobuf.Duration valid_duration = 3; } 
message CollaboratorHealthCheckResponse { - bool accepted = 1; + google.protobuf.Duration health_check_period = 1; } message EnvoyInfo { diff --git a/openfl/protocols/director_pb2.py b/openfl/protocols/director_pb2.py index adbaaee0f0..ede1f6d9e4 100644 --- a/openfl/protocols/director_pb2.py +++ b/openfl/protocols/director_pb2.py @@ -22,7 +22,7 @@ syntax='proto3', serialized_options=None, create_key=_descriptor._internal_create_key, - serialized_pb=b'\n\x0e\x64irector.proto\x1a\x1fgoogle/protobuf/timestamp.proto\x1a\x1egoogle/protobuf/duration.proto\x1a\x10\x66\x65\x64\x65ration.proto\"\x1f\n\rRequestHeader\x12\x0e\n\x06sender\x18\x01 \x01(\t\"U\n\x08NodeInfo\x12\x0c\n\x04name\x18\x01 \x01(\t\x12\x0e\n\x06\x61\x64ress\x18\x02 \x01(\t\x12\x16\n\x0e\x63uda_available\x18\x03 \x01(\x08\x12\x13\n\x0bmemory_size\x18\x04 \x01(\r\"\x83\x01\n\tShardInfo\x12\x1c\n\tnode_info\x18\x01 \x01(\x0b\x32\t.NodeInfo\x12\x19\n\x11shard_description\x18\x02 \x01(\t\x12\x11\n\tn_samples\x18\x03 \x01(\x04\x12\x14\n\x0csample_shape\x18\x04 \x03(\t\x12\x14\n\x0ctarget_shape\x18\x05 \x03(\t\"(\n\x14ShardAcknowledgement\x12\x10\n\x08\x61\x63\x63\x65pted\x18\x01 \x01(\x08\"2\n\x15WaitExperimentRequest\x12\x19\n\x11\x63ollaborator_name\x18\x01 \x01(\t\"1\n\x16WaitExperimentResponse\x12\x17\n\x0f\x65xperiment_name\x18\x01 \x01(\t\"N\n\x18GetExperimentDataRequest\x12\x17\n\x0f\x65xperiment_name\x18\x01 \x01(\t\x12\x19\n\x11\x63ollaborator_name\x18\x02 \x01(\t\"/\n\x0e\x45xperimentData\x12\x0c\n\x04size\x18\x01 \x01(\r\x12\x0f\n\x07npbytes\x18\x02 \x01(\x0c\"\xa6\x01\n\x0e\x45xperimentInfo\x12\x1e\n\x06header\x18\x01 \x01(\x0b\x32\x0e.RequestHeader\x12\x0c\n\x04name\x18\x02 \x01(\t\x12\x1a\n\x12\x63ollaborator_names\x18\x03 \x03(\t\x12(\n\x0f\x65xperiment_data\x18\x04 \x01(\x0b\x32\x0f.ExperimentData\x12 \n\x0bmodel_proto\x18\x05 \x01(\x0b\x32\x0b.ModelProto\"I\n\x18SetNewExperimentResponse\x12\x10\n\x08\x61\x63\x63\x65pted\x18\x01 \x01(\x08\x12\x1b\n\x13tensorboard_address\x18\x02 \x01(\t\"\xb5\x01\n\x16GetTrainedModelRequest\x12\x1e\n\x06header\x18\x01 \x01(\x0b\x32\x0e.RequestHeader\x12\x17\n\x0f\x65xperiment_name\x18\x02 \x01(\t\x12\x35\n\nmodel_type\x18\x03 \x01(\x0e\x32!.GetTrainedModelRequest.ModelType\"+\n\tModelType\x12\x0e\n\nBEST_MODEL\x10\x00\x12\x0e\n\nLAST_MODEL\x10\x01\"8\n\x14TrainedModelResponse\x12 \n\x0bmodel_proto\x18\x01 \x01(\x0b\x32\x0b.ModelProto\"7\n\x15GetDatasetInfoRequest\x12\x1e\n\x06header\x18\x01 \x01(\x0b\x32\x0e.RequestHeader\"O\n\x14StreamMetricsRequest\x12\x1e\n\x06header\x18\x01 \x01(\x0b\x32\x0e.RequestHeader\x12\x17\n\x0f\x65xperiment_name\x18\x02 \x01(\t\"{\n\x15StreamMetricsResponse\x12\x15\n\rmetric_origin\x18\x01 \x01(\t\x12\x11\n\ttask_name\x18\x02 \x01(\t\x12\x13\n\x0bmetric_name\x18\x03 \x01(\t\x12\x14\n\x0cmetric_value\x18\x04 \x01(\x02\x12\r\n\x05round\x18\x05 \x01(\r\"R\n\x17RemoveExperimentRequest\x12\x1e\n\x06header\x18\x01 \x01(\x0b\x32\x0e.RequestHeader\x12\x17\n\x0f\x65xperiment_name\x18\x02 \x01(\t\"3\n\x18RemoveExperimentResponse\x12\x17\n\x0f\x61\x63knowledgement\x18\x01 \x01(\x08\"t\n\x12\x43ollaboratorStatus\x12\x0c\n\x04name\x18\x01 \x01(\t\x12\x1d\n\x15is_experiment_running\x18\x02 \x01(\x08\x12\x31\n\x0evalid_duration\x18\x03 \x01(\x0b\x32\x19.google.protobuf.Duration\"3\n\x1f\x43ollaboratorHealthCheckResponse\x12\x10\n\x08\x61\x63\x63\x65pted\x18\x01 \x01(\x08\"\xc2\x01\n\tEnvoyInfo\x12\x1e\n\nshard_info\x18\x01 \x01(\x0b\x32\n.ShardInfo\x12\x11\n\tis_online\x18\x02 \x01(\x08\x12\x1d\n\x15is_experiment_running\x18\x03 \x01(\x08\x12\x30\n\x0clast_updated\x18\x04 
\x01(\x0b\x32\x1a.google.protobuf.Timestamp\x12\x31\n\x0evalid_duration\x18\x05 \x01(\x0b\x32\x19.google.protobuf.Duration\"\x12\n\x10GetEnvoysRequest\"4\n\x11GetEnvoysResponse\x12\x1f\n\x0b\x65nvoy_infos\x18\x01 \x03(\x0b\x32\n.EnvoyInfo2\xb9\x05\n\x12\x46\x65\x64\x65rationDirector\x12\x37\n\x10\x41\x63knowledgeShard\x12\n.ShardInfo\x1a\x15.ShardAcknowledgement\"\x00\x12G\n\x0eWaitExperiment\x12\x16.WaitExperimentRequest\x1a\x17.WaitExperimentResponse\"\x00(\x01\x30\x01\x12\x43\n\x11GetExperimentData\x12\x19.GetExperimentDataRequest\x1a\x0f.ExperimentData\"\x00\x30\x01\x12\x42\n\x10SetNewExperiment\x12\x0f.ExperimentInfo\x1a\x19.SetNewExperimentResponse\"\x00(\x01\x12\x36\n\x0eGetDatasetInfo\x12\x16.GetDatasetInfoRequest\x1a\n.ShardInfo\"\x00\x12\x43\n\x0fGetTrainedModel\x12\x17.GetTrainedModelRequest\x1a\x15.TrainedModelResponse\"\x00\x12\x42\n\rStreamMetrics\x12\x15.StreamMetricsRequest\x1a\x16.StreamMetricsResponse\"\x00\x30\x01\x12M\n\x14RemoveExperimentData\x12\x18.RemoveExperimentRequest\x1a\x19.RemoveExperimentResponse\"\x00\x12R\n\x17\x43ollaboratorHealthCheck\x12\x13.CollaboratorStatus\x1a .CollaboratorHealthCheckResponse\"\x00\x12\x34\n\tGetEnvoys\x12\x11.GetEnvoysRequest\x1a\x12.GetEnvoysResponse\"\x00\x62\x06proto3' + serialized_pb=b'\n\x0e\x64irector.proto\x1a\x1fgoogle/protobuf/timestamp.proto\x1a\x1egoogle/protobuf/duration.proto\x1a\x10\x66\x65\x64\x65ration.proto\"\x1f\n\rRequestHeader\x12\x0e\n\x06sender\x18\x01 \x01(\t\"U\n\x08NodeInfo\x12\x0c\n\x04name\x18\x01 \x01(\t\x12\x0e\n\x06\x61\x64ress\x18\x02 \x01(\t\x12\x16\n\x0e\x63uda_available\x18\x03 \x01(\x08\x12\x13\n\x0bmemory_size\x18\x04 \x01(\r\"\x83\x01\n\tShardInfo\x12\x1c\n\tnode_info\x18\x01 \x01(\x0b\x32\t.NodeInfo\x12\x19\n\x11shard_description\x18\x02 \x01(\t\x12\x11\n\tn_samples\x18\x03 \x01(\x04\x12\x14\n\x0csample_shape\x18\x04 \x03(\t\x12\x14\n\x0ctarget_shape\x18\x05 \x03(\t\"(\n\x14ShardAcknowledgement\x12\x10\n\x08\x61\x63\x63\x65pted\x18\x01 \x01(\x08\"2\n\x15WaitExperimentRequest\x12\x19\n\x11\x63ollaborator_name\x18\x01 \x01(\t\"1\n\x16WaitExperimentResponse\x12\x17\n\x0f\x65xperiment_name\x18\x01 \x01(\t\"N\n\x18GetExperimentDataRequest\x12\x17\n\x0f\x65xperiment_name\x18\x01 \x01(\t\x12\x19\n\x11\x63ollaborator_name\x18\x02 \x01(\t\"/\n\x0e\x45xperimentData\x12\x0c\n\x04size\x18\x01 \x01(\r\x12\x0f\n\x07npbytes\x18\x02 \x01(\x0c\"\xa6\x01\n\x0e\x45xperimentInfo\x12\x1e\n\x06header\x18\x01 \x01(\x0b\x32\x0e.RequestHeader\x12\x0c\n\x04name\x18\x02 \x01(\t\x12\x1a\n\x12\x63ollaborator_names\x18\x03 \x03(\t\x12(\n\x0f\x65xperiment_data\x18\x04 \x01(\x0b\x32\x0f.ExperimentData\x12 \n\x0bmodel_proto\x18\x05 \x01(\x0b\x32\x0b.ModelProto\"I\n\x18SetNewExperimentResponse\x12\x10\n\x08\x61\x63\x63\x65pted\x18\x01 \x01(\x08\x12\x1b\n\x13tensorboard_address\x18\x02 \x01(\t\"\xb5\x01\n\x16GetTrainedModelRequest\x12\x1e\n\x06header\x18\x01 \x01(\x0b\x32\x0e.RequestHeader\x12\x17\n\x0f\x65xperiment_name\x18\x02 \x01(\t\x12\x35\n\nmodel_type\x18\x03 \x01(\x0e\x32!.GetTrainedModelRequest.ModelType\"+\n\tModelType\x12\x0e\n\nBEST_MODEL\x10\x00\x12\x0e\n\nLAST_MODEL\x10\x01\"8\n\x14TrainedModelResponse\x12 \n\x0bmodel_proto\x18\x01 \x01(\x0b\x32\x0b.ModelProto\"7\n\x15GetDatasetInfoRequest\x12\x1e\n\x06header\x18\x01 \x01(\x0b\x32\x0e.RequestHeader\"O\n\x14StreamMetricsRequest\x12\x1e\n\x06header\x18\x01 \x01(\x0b\x32\x0e.RequestHeader\x12\x17\n\x0f\x65xperiment_name\x18\x02 \x01(\t\"{\n\x15StreamMetricsResponse\x12\x15\n\rmetric_origin\x18\x01 \x01(\t\x12\x11\n\ttask_name\x18\x02 
\x01(\t\x12\x13\n\x0bmetric_name\x18\x03 \x01(\t\x12\x14\n\x0cmetric_value\x18\x04 \x01(\x02\x12\r\n\x05round\x18\x05 \x01(\r\"R\n\x17RemoveExperimentRequest\x12\x1e\n\x06header\x18\x01 \x01(\x0b\x32\x0e.RequestHeader\x12\x17\n\x0f\x65xperiment_name\x18\x02 \x01(\t\"3\n\x18RemoveExperimentResponse\x12\x17\n\x0f\x61\x63knowledgement\x18\x01 \x01(\x08\"A\n\x12\x43ollaboratorStatus\x12\x0c\n\x04name\x18\x01 \x01(\t\x12\x1d\n\x15is_experiment_running\x18\x02 \x01(\x08\"Y\n\x1f\x43ollaboratorHealthCheckResponse\x12\x36\n\x13health_check_period\x18\x01 \x01(\x0b\x32\x19.google.protobuf.Duration\"\xc2\x01\n\tEnvoyInfo\x12\x1e\n\nshard_info\x18\x01 \x01(\x0b\x32\n.ShardInfo\x12\x11\n\tis_online\x18\x02 \x01(\x08\x12\x1d\n\x15is_experiment_running\x18\x03 \x01(\x08\x12\x30\n\x0clast_updated\x18\x04 \x01(\x0b\x32\x1a.google.protobuf.Timestamp\x12\x31\n\x0evalid_duration\x18\x05 \x01(\x0b\x32\x19.google.protobuf.Duration\"\x12\n\x10GetEnvoysRequest\"4\n\x11GetEnvoysResponse\x12\x1f\n\x0b\x65nvoy_infos\x18\x01 \x03(\x0b\x32\n.EnvoyInfo2\xb9\x05\n\x12\x46\x65\x64\x65rationDirector\x12\x37\n\x10\x41\x63knowledgeShard\x12\n.ShardInfo\x1a\x15.ShardAcknowledgement\"\x00\x12G\n\x0eWaitExperiment\x12\x16.WaitExperimentRequest\x1a\x17.WaitExperimentResponse\"\x00(\x01\x30\x01\x12\x43\n\x11GetExperimentData\x12\x19.GetExperimentDataRequest\x1a\x0f.ExperimentData\"\x00\x30\x01\x12\x42\n\x10SetNewExperiment\x12\x0f.ExperimentInfo\x1a\x19.SetNewExperimentResponse\"\x00(\x01\x12\x36\n\x0eGetDatasetInfo\x12\x16.GetDatasetInfoRequest\x1a\n.ShardInfo\"\x00\x12\x43\n\x0fGetTrainedModel\x12\x17.GetTrainedModelRequest\x1a\x15.TrainedModelResponse\"\x00\x12\x42\n\rStreamMetrics\x12\x15.StreamMetricsRequest\x1a\x16.StreamMetricsResponse\"\x00\x30\x01\x12M\n\x14RemoveExperimentData\x12\x18.RemoveExperimentRequest\x1a\x19.RemoveExperimentResponse\"\x00\x12R\n\x17\x43ollaboratorHealthCheck\x12\x13.CollaboratorStatus\x1a .CollaboratorHealthCheckResponse\"\x00\x12\x34\n\tGetEnvoys\x12\x11.GetEnvoysRequest\x1a\x12.GetEnvoysResponse\"\x00\x62\x06proto3' , dependencies=[google_dot_protobuf_dot_timestamp__pb2.DESCRIPTOR,google_dot_protobuf_dot_duration__pb2.DESCRIPTOR,federation__pb2.DESCRIPTOR,]) @@ -775,13 +775,6 @@ message_type=None, enum_type=None, containing_type=None, is_extension=False, extension_scope=None, serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key), - _descriptor.FieldDescriptor( - name='valid_duration', full_name='CollaboratorStatus.valid_duration', index=2, - number=3, type=11, cpp_type=10, label=1, - has_default_value=False, default_value=None, - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key), ], extensions=[ ], @@ -795,7 +788,7 @@ oneofs=[ ], serialized_start=1515, - serialized_end=1631, + serialized_end=1580, ) @@ -808,9 +801,9 @@ create_key=_descriptor._internal_create_key, fields=[ _descriptor.FieldDescriptor( - name='accepted', full_name='CollaboratorHealthCheckResponse.accepted', index=0, - number=1, type=8, cpp_type=7, label=1, - has_default_value=False, default_value=False, + name='health_check_period', full_name='CollaboratorHealthCheckResponse.health_check_period', index=0, + number=1, type=11, cpp_type=10, label=1, + has_default_value=False, default_value=None, message_type=None, enum_type=None, containing_type=None, is_extension=False, extension_scope=None, serialized_options=None, file=DESCRIPTOR, 
create_key=_descriptor._internal_create_key), @@ -826,8 +819,8 @@ extension_ranges=[], oneofs=[ ], - serialized_start=1633, - serialized_end=1684, + serialized_start=1582, + serialized_end=1671, ) @@ -886,8 +879,8 @@ extension_ranges=[], oneofs=[ ], - serialized_start=1687, - serialized_end=1881, + serialized_start=1674, + serialized_end=1868, ) @@ -911,8 +904,8 @@ extension_ranges=[], oneofs=[ ], - serialized_start=1883, - serialized_end=1901, + serialized_start=1870, + serialized_end=1888, ) @@ -943,8 +936,8 @@ extension_ranges=[], oneofs=[ ], - serialized_start=1903, - serialized_end=1955, + serialized_start=1890, + serialized_end=1942, ) _SHARDINFO.fields_by_name['node_info'].message_type = _NODEINFO @@ -958,7 +951,7 @@ _GETDATASETINFOREQUEST.fields_by_name['header'].message_type = _REQUESTHEADER _STREAMMETRICSREQUEST.fields_by_name['header'].message_type = _REQUESTHEADER _REMOVEEXPERIMENTREQUEST.fields_by_name['header'].message_type = _REQUESTHEADER -_COLLABORATORSTATUS.fields_by_name['valid_duration'].message_type = google_dot_protobuf_dot_duration__pb2._DURATION +_COLLABORATORHEALTHCHECKRESPONSE.fields_by_name['health_check_period'].message_type = google_dot_protobuf_dot_duration__pb2._DURATION _ENVOYINFO.fields_by_name['shard_info'].message_type = _SHARDINFO _ENVOYINFO.fields_by_name['last_updated'].message_type = google_dot_protobuf_dot_timestamp__pb2._TIMESTAMP _ENVOYINFO.fields_by_name['valid_duration'].message_type = google_dot_protobuf_dot_duration__pb2._DURATION @@ -1150,8 +1143,8 @@ index=0, serialized_options=None, create_key=_descriptor._internal_create_key, - serialized_start=1958, - serialized_end=2655, + serialized_start=1945, + serialized_end=2642, methods=[ _descriptor.MethodDescriptor( name='AcknowledgeShard', diff --git a/openfl/transport/grpc/director_client.py b/openfl/transport/grpc/director_client.py index fa1b28bf0c..f1032fee49 100644 --- a/openfl/transport/grpc/director_client.py +++ b/openfl/transport/grpc/director_client.py @@ -96,16 +96,18 @@ def _get_node_info(self): """Generate a node info message.""" return director_pb2.NodeInfo(name=self.shard_name) - def send_health_check(self, collaborator_name, is_experiment_running, valid_duration): + def send_health_check(self, *, collaborator_name: str, is_experiment_running: bool) -> int: """Send envoy health check.""" status = director_pb2.CollaboratorStatus( name=collaborator_name, is_experiment_running=is_experiment_running, ) - status.valid_duration.seconds = valid_duration logger.debug(f'Sending health check status: {status}') - return self.stub.CollaboratorHealthCheck(status) + response = self.stub.CollaboratorHealthCheck(status) + health_check_period = response.health_check_period.seconds + + return health_check_period class DirectorClient: diff --git a/openfl/transport/grpc/director_server.py b/openfl/transport/grpc/director_server.py index 3fc955d312..f6ffd14e97 100644 --- a/openfl/transport/grpc/director_server.py +++ b/openfl/transport/grpc/director_server.py @@ -25,12 +25,12 @@ class DirectorGRPCServer(director_pb2_grpc.FederationDirectorServicer): def __init__(self, *, director_cls, tls: bool = True, root_certificate: str = None, private_key: str = None, certificate: str = None, - listen_addr='[::]', listen_port=50051, **kwargs) -> None: + listen_host='[::]', listen_port=50051, **kwargs) -> None: """Initialize a director object.""" # TODO: add working directory super().__init__() - self.listen_uri = f'{listen_addr}:{listen_port}' + self.listen_uri = f'{listen_host}:{listen_port}' self.tls = tls 
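For reference, the renamed `listen_host` / `listen_port` settings travel from `director.yaml` into this constructor roughly as in the sketch below, mirroring the CLI wiring earlier in this patch; the concrete shapes and values are illustrative, not taken from any shipped config:

```python
from openfl.component.director.director import Director
from openfl.transport.grpc.director_server import DirectorGRPCServer

# Illustrative director.yaml contents after this change.
settings = {
    'sample_shape': ['64', '64', '3'],
    'target_shape': ['64', '64'],
    'listen_host': 'localhost',
    'listen_port': 50051,
    'envoy_health_check_period': 60,
}

director_server = DirectorGRPCServer(
    director_cls=Director,
    tls=False,
    sample_shape=settings['sample_shape'],
    target_shape=settings['target_shape'],
    settings=settings,
    listen_host=settings['listen_host'],
    listen_port=settings['listen_port'],
)
director_server.start()
```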
self.root_certificate = None self.private_key = None @@ -237,13 +237,14 @@ async def RemoveExperimentData(self, request, context): # NOQA:N802 async def CollaboratorHealthCheck(self, request, context): # NOQA:N802 """Accept health check from envoy.""" logger.debug(f'Request CollaboratorHealthCheck has got: {request}') - is_accepted = self.director.collaborator_health_check( + health_check_period = self.director.collaborator_health_check( collaborator_name=request.name, is_experiment_running=request.is_experiment_running, - valid_duration=request.valid_duration.seconds, ) + resp = director_pb2.CollaboratorHealthCheckResponse() + resp.health_check_period.seconds = health_check_period - return director_pb2.CollaboratorHealthCheckResponse(accepted=is_accepted) + return resp async def GetEnvoys(self, request, context): # NOQA:N802 """Get a status information about envoys.""" diff --git a/setup.py b/setup.py index 20531b565c..86af32a50b 100644 --- a/setup.py +++ b/setup.py @@ -44,6 +44,7 @@ 'openfl.plugins', 'openfl.plugins.interface_serializer', 'openfl.plugins.frameworks_adapters', + 'openfl.plugins.data_splitters', 'openfl-workspace', 'openfl-docker', 'openfl-tutorials', diff --git a/tests/github/interactive_api/experiment_runner.py b/tests/github/interactive_api/experiment_runner.py deleted file mode 100644 index ef1ef56a43..0000000000 --- a/tests/github/interactive_api/experiment_runner.py +++ /dev/null @@ -1,46 +0,0 @@ -import logging -import os -import shutil -import subprocess - -from openfl.utilities.logs import setup_loggers - -setup_loggers(logging.INFO) -logger = logging.getLogger(__name__) - - -def prepare_collaborator_workspace(col_dir, arch_path): - logger.info(f'Prepare collaborator directory: {col_dir}') - if os.path.exists(col_dir): - shutil.rmtree(col_dir) - os.makedirs(col_dir) - arch_col_path = shutil.copy(arch_path, col_dir) - shutil.unpack_archive(arch_col_path, col_dir) - logger.info('Collaborator directory prepared') - - -def run_aggregator(model_interface, fl_experiment): - logger.info('run_aggregator') - fl_experiment.start_experiment(model_interface) - logger.info('Aggregator stopped') - - -def run_experiment(col_data_paths, model_interface, arch_path, fl_experiment): - logger.info('Starting the experiment!') - for col_dir in col_data_paths: - prepare_collaborator_workspace(col_dir, arch_path) - - processes = [] - for col_name in col_data_paths: - logger.info(f'Starting collaborator: {col_name}') - p = subprocess.Popen( - f'fx collaborator start -n {col_name} -p plan/plan.yaml -d data.yaml'.split(' '), - cwd=os.path.join(os.getcwd(), col_name) - ) - processes.append(p) - - run_aggregator(model_interface, fl_experiment) - for p in processes: - p.terminate() - - logger.info('The experiment completed!') diff --git a/tests/github/interactive_api/experiments/pytorch_kvasir_unet/__init__.py b/tests/github/interactive_api/experiments/pytorch_kvasir_unet/__init__.py deleted file mode 100644 index e69de29bb2..0000000000 diff --git a/tests/github/interactive_api/experiments/pytorch_kvasir_unet/data_loader.py b/tests/github/interactive_api/experiments/pytorch_kvasir_unet/data_loader.py deleted file mode 100644 index e6a55f7d63..0000000000 --- a/tests/github/interactive_api/experiments/pytorch_kvasir_unet/data_loader.py +++ /dev/null @@ -1,25 +0,0 @@ -import os -from skimage import io -import numpy as np - -from openfl.utilities import validate_file_hash - - -def load_data(): - os.makedirs('data', exist_ok=True) - os.system("wget -nc 
'https://datasets.simula.no/hyper-kvasir/hyper-kvasir-segmented-images.zip'" - " -O ./data/kvasir.zip") - zip_sha384 = 'e30d18a772c6520476e55b610a4db457237f151e' \ - '19182849d54b49ae24699881c1e18e0961f77642be900450ef8b22e7' - validate_file_hash('./data/kvasir.zip', zip_sha384) - os.system('unzip -n ./data/kvasir.zip -d ./data') - - -def read_data(image_path, mask_path): - """ - Read image and mask from disk. - """ - img = io.imread(image_path) - assert (img.shape[2] == 3) - mask = io.imread(mask_path) - return img, mask[:, :, 0].astype(np.uint8) diff --git a/tests/github/interactive_api/experiments/pytorch_kvasir_unet/dataset.py b/tests/github/interactive_api/experiments/pytorch_kvasir_unet/dataset.py deleted file mode 100644 index 19b5871070..0000000000 --- a/tests/github/interactive_api/experiments/pytorch_kvasir_unet/dataset.py +++ /dev/null @@ -1,110 +0,0 @@ -import os -import PIL -from torch.utils.data import Dataset, DataLoader -from torchvision import transforms as tsf - -from tests.github.interactive_api.experiments.pytorch_kvasir_unet.data_loader import read_data -from openfl.interface.interactive_api.experiment import DataInterface - - -class KvasirDataset(Dataset): - """ - Kvasir dataset contains 1000 images for all collaborators. - Args: - data_path: path to dataset on disk - collaborator_count: total number of collaborators - collaborator_num: number of current collaborator - is_validation: validation option - """ - - def __init__(self, images_path='./data/segmented-images/images/', - masks_path='./data/segmented-images/masks/', - validation_fraction=1 / 8, is_validation=False): - - self.images_path = images_path - self.masks_path = masks_path - self.images_names = [ - img_name - for img_name in sorted(os.listdir(self.images_path)) - if len(img_name) > 3 and img_name[-3:] == 'jpg' - ] - - assert (len(self.images_names) > 2), "Too few images" - - validation_size = max(1, int(len(self.images_names) * validation_fraction)) - - if is_validation: - self.images_names = self.images_names[-validation_size:] - else: - self.images_names = self.images_names[: -validation_size] - - # Prepare transforms - self.img_trans = tsf.Compose([ - tsf.ToPILImage(), - tsf.Resize((332, 332)), - tsf.ToTensor(), - tsf.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5])]) - self.mask_trans = tsf.Compose([ - tsf.ToPILImage(), - tsf.Resize((332, 332), interpolation=PIL.Image.NEAREST), - tsf.ToTensor()]) - - def __getitem__(self, index): - name = self.images_names[index] - img, mask = read_data(self.images_path + name, self.masks_path + name) - img = self.img_trans(img).numpy() - mask = self.mask_trans(mask).numpy() - return img, mask - - def __len__(self): - return len(self.images_names) - - -class FedDataset(DataInterface): - - def _delayed_init(self, data_path='1,1'): - # With the next command the local dataset will be loaded on the collaborator node - # For this example we have the same dataset on the same path, and we will shard it - # So we use `data_path` information for this purpose. - self.rank, self.world_size = [int(part) for part in data_path.split(',')] - - validation_fraction = 1 / 8 - self.train_set = self.UserDatasetClass(validation_fraction=validation_fraction, - is_validation=False) - self.valid_set = self.UserDatasetClass(validation_fraction=validation_fraction, - is_validation=True) - - # Do the actual sharding - self._do_sharding(self.rank, self.world_size) - - def _do_sharding(self, rank, world_size): - # This method relies on the dataset's implementation - # i.e. 
coupled in a bad way - self.train_set.images_names = self.train_set.images_names[rank - 1:: world_size] - - def get_train_loader(self, **kwargs): - """ - Output of this method will be provided to tasks with optimizer in contract - """ - return DataLoader( - self.train_set, num_workers=8, batch_size=self.kwargs['train_bs'], shuffle=True - ) - - def get_valid_loader(self, **kwargs): - """ - Output of this method will be provided to tasks without optimizer in contract - """ - return DataLoader(self.valid_set, num_workers=8, batch_size=self.kwargs['valid_bs']) - - def get_train_data_size(self): - """ - Information for aggregation - """ - return len(self.train_set) - - def get_valid_data_size(self): - """ - Information for aggregation - """ - return len(self.valid_set) - diff --git a/tests/github/interactive_api/experiments/pytorch_kvasir_unet/experiment.py b/tests/github/interactive_api/experiments/pytorch_kvasir_unet/experiment.py deleted file mode 100644 index 975838e743..0000000000 --- a/tests/github/interactive_api/experiments/pytorch_kvasir_unet/experiment.py +++ /dev/null @@ -1,59 +0,0 @@ -import logging -from socket import getfqdn - -import torch.optim as optim - - -from tests.github.interactive_api.experiments.pytorch_kvasir_unet.model import UNet -from tests.github.interactive_api.experiments.pytorch_kvasir_unet.data_loader import load_data -from tests.github.interactive_api.experiments.pytorch_kvasir_unet.dataset import KvasirDataset, FedDataset -from openfl.interface.interactive_api.experiment import ModelInterface, FLExperiment -from openfl.interface.interactive_api.federation import Federation -from tests.github.interactive_api.experiments.pytorch_kvasir_unet.tasks import validate, task_interface -from tests.github.interactive_api.experiment_runner import run_experiment - -from copy import deepcopy - - -logger = logging.getLogger(__name__) - -model_unet = UNet() -optimizer_adam = optim.Adam(model_unet.parameters(), lr=1e-4) - -load_data() - -framework_adapter = 'openfl.plugins.frameworks_adapters.pytorch_adapter.FrameworkAdapterPlugin' -model_interface = ModelInterface(model=model_unet, optimizer=optimizer_adam, - framework_plugin=framework_adapter) - -# Save the initial model state -initial_model = deepcopy(model_unet) - -fed_dataset = FedDataset(KvasirDataset, train_bs=8, valid_bs=8) -federation = Federation(central_node_fqdn=getfqdn(), tls=False) - -# First number which is a collaborators rank is also passed as a cuda device identifier -col_data_paths = {'one': '1,2', - 'two': '2,2'} -federation.register_collaborators(col_data_paths=col_data_paths) -fl_experiment = FLExperiment(federation=federation) - -# If I use autoreload I got a pickling error -arch_path = fl_experiment.prepare_workspace_distribution( - model_provider=model_interface, - task_keeper=task_interface, - data_loader=fed_dataset, - rounds_to_train=7, - opt_treatment='CONTINUE_GLOBAL' -) - -run_experiment(col_data_paths, model_interface, arch_path, fl_experiment) - -best_model = fl_experiment.get_best_model() -fed_dataset._delayed_init() - -logger.info('Validating initial model') -validate(initial_model, fed_dataset.get_valid_loader(), 'cpu') - -logger.info('Validating trained model') -validate(best_model, fed_dataset.get_valid_loader(), 'cpu') diff --git a/tests/github/interactive_api/experiments/pytorch_kvasir_unet/model.py b/tests/github/interactive_api/experiments/pytorch_kvasir_unet/model.py deleted file mode 100644 index 53aa120bed..0000000000 --- 
a/tests/github/interactive_api/experiments/pytorch_kvasir_unet/model.py +++ /dev/null @@ -1,37 +0,0 @@ -import torch -import torch.nn as nn - -from tests.github.interactive_api.layers import double_conv, down, up - -""" -UNet model definition -""" - - -class UNet(nn.Module): - def __init__(self, n_channels=3, n_classes=1): - super().__init__() - self.inc = double_conv(n_channels, 64) - self.down1 = down(64, 128) - self.down2 = down(128, 256) - self.down3 = down(256, 512) - self.down4 = down(512, 1024) - self.up1 = up(1024, 512) - self.up2 = up(512, 256) - self.up3 = up(256, 128) - self.up4 = up(128, 64) - self.outc = nn.Conv2d(64, n_classes, 1) - - def forward(self, x): - x1 = self.inc(x) - x2 = self.down1(x1) - x3 = self.down2(x2) - x4 = self.down3(x3) - x5 = self.down4(x4) - x = self.up1(x5, x4) - x = self.up2(x, x3) - x = self.up3(x, x2) - x = self.up4(x, x1) - x = self.outc(x) - x = torch.sigmoid(x) - return x \ No newline at end of file diff --git a/tests/github/interactive_api/experiments/pytorch_kvasir_unet/settings.py b/tests/github/interactive_api/experiments/pytorch_kvasir_unet/settings.py deleted file mode 100644 index e69de29bb2..0000000000 diff --git a/tests/github/interactive_api/experiments/pytorch_kvasir_unet/tasks.py b/tests/github/interactive_api/experiments/pytorch_kvasir_unet/tasks.py deleted file mode 100644 index f895a23e7b..0000000000 --- a/tests/github/interactive_api/experiments/pytorch_kvasir_unet/tasks.py +++ /dev/null @@ -1,67 +0,0 @@ -import tqdm -import torch -import numpy as np - -from openfl.interface.interactive_api.experiment import TaskInterface -from tests.github.interactive_api.layers import soft_dice_loss, soft_dice_coef - - -task_interface = TaskInterface() - - -def function_defined_in_notebook(some_parameter): - print('I will cause problems') - print(f'Also I accept a parameter and it is {some_parameter}') - - -# We do not actually need to register additional kwargs, Just serialize them -@task_interface.add_kwargs(**{'some_parameter': 42}) -@task_interface.register_fl_task(model='unet_model', data_loader='train_loader', - device='device', optimizer='optimizer') -def train(unet_model, train_loader, optimizer, device, loss_fn=soft_dice_loss, some_parameter=None): - if not torch.cuda.is_available(): - device = 'cpu' - - function_defined_in_notebook(some_parameter) - - train_loader = tqdm.tqdm(train_loader, desc="train") - - unet_model.train() - unet_model.to(device) - - losses = [] - - for data, target in train_loader: - data, target = torch.tensor(data).to(device), torch.tensor( - target).to(device, dtype=torch.float32) - optimizer.zero_grad() - output = unet_model(data) - loss = loss_fn(output=output, target=target) - loss.backward() - optimizer.step() - losses.append(loss.detach().cpu().numpy()) - - return {'train_loss': np.mean(losses), } - - -@task_interface.register_fl_task(model='unet_model', data_loader='val_loader', device='device') -def validate(unet_model, val_loader, device): - unet_model.eval() - unet_model.to(device) - - val_loader = tqdm.tqdm(val_loader, desc="validate") - - val_score = 0 - total_samples = 0 - - with torch.no_grad(): - for data, target in val_loader: - samples = target.shape[0] - total_samples += samples - data, target = torch.tensor(data).to(device), \ - torch.tensor(target).to(device, dtype=torch.int64) - output = unet_model(data) - val = soft_dice_coef(output, target) - val_score += val.sum().cpu().numpy() - - return {'dice_coef': val_score / total_samples, } \ No newline at end of file diff --git 
a/tests/github/interactive_api/experiments/tensorflow_mnist/__init__.py b/tests/github/interactive_api/experiments/tensorflow_mnist/__init__.py deleted file mode 100644 index e69de29bb2..0000000000 diff --git a/tests/github/interactive_api/experiments/tensorflow_mnist/dataset.py b/tests/github/interactive_api/experiments/tensorflow_mnist/dataset.py deleted file mode 100644 index f100a0ebcb..0000000000 --- a/tests/github/interactive_api/experiments/tensorflow_mnist/dataset.py +++ /dev/null @@ -1,60 +0,0 @@ -import tensorflow as tf - -from openfl.interface.interactive_api.experiment import DataInterface - - -class FedDataset(DataInterface): - def __init__(self, X_train, y_train, X_valid, y_valid, **kwargs): - self.X_train = X_train - self.y_train = y_train - self.X_valid = X_valid - self.y_valid = y_valid - self.batch_size = kwargs['batch_size'] - self.kwargs = kwargs - self._setup_datasets() - - def _setup_datasets(self): - self.train_dataset = tf.data.Dataset.from_tensor_slices((self.X_train, self.y_train)) - self.train_dataset = self.train_dataset.shuffle(buffer_size=1024).batch(self.batch_size) - self.valid_dataset = tf.data.Dataset.from_tensor_slices((self.X_valid, self.y_valid)) - self.valid_dataset = self.valid_dataset.shuffle(buffer_size=1024).batch(self.batch_size) - - def _delayed_init(self, data_path='1,1'): - # With the next command the local dataset will be loaded on the collaborator node - # For this example we have the same dataset on the same path, and we will shard it - # So we use `data_path` information for this purpose. - self.rank, self.world_size = [int(part) for part in data_path.split(',')] - - # Do the actual sharding - self._do_sharding(self.rank, self.world_size) - - def _do_sharding(self, rank, world_size): - self.X_train = self.X_train[rank - 1:: world_size] - self.y_train = self.y_train[rank - 1:: world_size] - self.X_valid = self.X_valid[rank - 1:: world_size] - self.y_valid = self.y_valid[rank - 1:: world_size] - self._setup_datasets() - - def get_train_loader(self, **kwargs): - """ - Output of this method will be provided to tasks with optimizer in contract - """ - return self.train_dataset - - def get_valid_loader(self, **kwargs): - """ - Output of this method will be provided to tasks without optimizer in contract - """ - return self.valid_dataset - - def get_train_data_size(self): - """ - Information for aggregation - """ - return len(self.X_train) - - def get_valid_data_size(self): - """ - Information for aggregation - """ - return len(self.X_valid) diff --git a/tests/github/interactive_api/experiments/tensorflow_mnist/experiment.py b/tests/github/interactive_api/experiments/tensorflow_mnist/experiment.py deleted file mode 100644 index 6d75b5fd45..0000000000 --- a/tests/github/interactive_api/experiments/tensorflow_mnist/experiment.py +++ /dev/null @@ -1,72 +0,0 @@ -import logging - -import tensorflow as tf - -from openfl.interface.interactive_api.experiment import ModelInterface, FLExperiment -from openfl.interface.interactive_api.federation import Federation -from tests.github.interactive_api.experiment_runner import run_experiment -from tests.github.interactive_api.experiments.tensorflow_mnist.settings import model, optimizer, X_train, y_train, X_valid, y_valid, batch_size -from tests.github.interactive_api.experiments.tensorflow_mnist.dataset import FedDataset -from tests.github.interactive_api.experiments.tensorflow_mnist.tasks import train, validate, task_interface - -logger = logging.getLogger(__name__) - - -# Describing FL experiment 
-framework_adapter = 'openfl.plugins.frameworks_adapters.keras_adapter.FrameworkAdapterPlugin' -model_interface = ModelInterface( - model=model, optimizer=optimizer, framework_plugin=framework_adapter) - -# Register dataset -fed_dataset = FedDataset(X_train, y_train, X_valid, y_valid, batch_size=batch_size) - -# Perform model warm up -# The model warmup is necessary to initialize weights when using Tensorflow Gradient Tape - -train(model, fed_dataset.get_train_loader(), optimizer, 'cpu', warmup=True) - -#Make a copy of the model for later comparison -initial_model = tf.keras.models.clone_model(model) - - -# Prepare Federated Dataset for Serialization -# tf.data.DataSet does not serialize well with pickle. -# It will be recreated on the collaborators with the delayed init function -fed_dataset.train_dataset = None -fed_dataset.valid_dataset = None - - -# Start a federated learning experiment - -# Create a federation -# will determine fqdn by itself -federation = Federation(central_node_fqdn='localhost', tls=False) -# Datapath corresonds to 'RANK,WORLD_SIZE' -col_data_paths = { - 'one': '1,2', - 'two': '2,2' -} -federation.register_collaborators(col_data_paths=col_data_paths) - -# create an experimnet in federation -fl_experiment = FLExperiment(federation=federation) - -# If I use autoreload I got a pickling error -arch_path = fl_experiment.prepare_workspace_distribution( - model_provider=model_interface, - task_keeper=task_interface, - data_loader=fed_dataset, - rounds_to_train=7, - opt_treatment='CONTINUE_GLOBAL' -) - -run_experiment(col_data_paths, model_interface, arch_path, fl_experiment) - -best_model = fl_experiment.get_best_model() -fed_dataset._delayed_init() - -logger.info('Validating initial model') -validate(initial_model, fed_dataset.get_valid_loader(), 'cpu') - -logger.info('Validating trained model') -validate(best_model, fed_dataset.get_valid_loader(), 'cpu') diff --git a/tests/github/interactive_api/experiments/tensorflow_mnist/settings.py b/tests/github/interactive_api/experiments/tensorflow_mnist/settings.py deleted file mode 100644 index 9e530788d6..0000000000 --- a/tests/github/interactive_api/experiments/tensorflow_mnist/settings.py +++ /dev/null @@ -1,36 +0,0 @@ -import tensorflow as tf -from tensorflow import keras -from tensorflow.keras import layers -import numpy as np - - -# Describe the model and optimizer - -inputs = keras.Input(shape=(784,), name="digits") -x1 = layers.Dense(64, activation="relu")(inputs) -x2 = layers.Dense(64, activation="relu")(x1) -outputs = layers.Dense(10, name="predictions")(x2) -model = keras.Model(inputs=inputs, outputs=outputs) - -# Instantiate an optimizer. -optimizer = keras.optimizers.SGD(learning_rate=1e-3) -# Instantiate a loss function. -loss_fn = keras.losses.SparseCategoricalCrossentropy(from_logits=True) - -# Prepare the metrics. -train_acc_metric = keras.metrics.SparseCategoricalAccuracy() -val_acc_metric = keras.metrics.SparseCategoricalAccuracy() - - -# Prepare data - -# Prepare the training dataset. 
-batch_size = 64 -(x_train, y_train), (x_test, y_test) = keras.datasets.mnist.load_data() -x_train = np.reshape(x_train, (-1, 784)) -x_test = np.reshape(x_test, (-1, 784)) - -X_valid = x_train[-10000:] -y_valid = y_train[-10000:] -X_train = x_train[:-10000] -y_train = y_train[:-10000] diff --git a/tests/github/interactive_api/experiments/tensorflow_mnist/tasks.py b/tests/github/interactive_api/experiments/tensorflow_mnist/tasks.py deleted file mode 100644 index b71c146a27..0000000000 --- a/tests/github/interactive_api/experiments/tensorflow_mnist/tasks.py +++ /dev/null @@ -1,55 +0,0 @@ -from openfl.interface.interactive_api.experiment import TaskInterface -from tests.github.interactive_api.experiments.tensorflow_mnist.settings import loss_fn, \ - train_acc_metric, val_acc_metric - -task_interface = TaskInterface() - - -@task_interface.register_fl_task(model='model', data_loader='train_dataset', - device='device', optimizer='optimizer') -def train(model, train_dataset, optimizer, device, loss_fn=loss_fn, warmup=False): - import tensorflow as tf - - # Iterate over the batches of the dataset. - for step, (x_batch_train, y_batch_train) in enumerate(train_dataset): - with tf.GradientTape() as tape: - logits = model(x_batch_train, training=True) - loss_value = loss_fn(y_batch_train, logits) - grads = tape.gradient(loss_value, model.trainable_weights) - optimizer.apply_gradients(zip(grads, model.trainable_weights)) - - # Update training metric. - train_acc_metric.update_state(y_batch_train, logits) - - # Log every 200 batches. - if step % 200 == 0: - print( - "Training loss (for one batch) at step %d: %.4f" - % (step, float(loss_value)) - ) - print("Seen so far: %d samples" % ((step + 1) * 64)) - if warmup: - break - - # Display metrics at the end of each epoch. - train_acc = train_acc_metric.result() - print("Training acc over epoch: %.4f" % (float(train_acc),)) - - # Reset training metrics at the end of each epoch - train_acc_metric.reset_states() - - return {'train_acc': train_acc} - - -@task_interface.register_fl_task(model='model', data_loader='val_dataset', device='device') -def validate(model, val_dataset, device): - # Run a validation loop at the end of each epoch. 
- for x_batch_val, y_batch_val in val_dataset: - val_logits = model(x_batch_val, training=False) - # Update val metrics - val_acc_metric.update_state(y_batch_val, val_logits) - val_acc = val_acc_metric.result() - val_acc_metric.reset_states() - print("Validation acc: %.4f" % (float(val_acc),)) - - return {'validation_accuracy': val_acc} diff --git a/tests/github/interactive_api/single_file_experiments/pytorch_kvasir_unet.py b/tests/github/interactive_api/single_file_experiments/pytorch_kvasir_unet.py deleted file mode 100644 index 4bb3fb9765..0000000000 --- a/tests/github/interactive_api/single_file_experiments/pytorch_kvasir_unet.py +++ /dev/null @@ -1,389 +0,0 @@ -import logging - -import torch -import torch.nn as nn -import torch.optim as optim - -from tests.github.interactive_api.layers import soft_dice_loss, soft_dice_coef, double_conv, down, \ - up - -logger = logging.getLogger(__name__) - -""" -UNet model definition -""" - - -class UNet(nn.Module): - def __init__(self, n_channels=3, n_classes=1): - super().__init__() - self.inc = double_conv(n_channels, 64) - self.down1 = down(64, 128) - self.down2 = down(128, 256) - self.down3 = down(256, 512) - self.down4 = down(512, 1024) - self.up1 = up(1024, 512) - self.up2 = up(512, 256) - self.up3 = up(256, 128) - self.up4 = up(128, 64) - self.outc = nn.Conv2d(64, n_classes, 1) - - def forward(self, x): - x1 = self.inc(x) - x2 = self.down1(x1) - x3 = self.down2(x2) - x4 = self.down3(x3) - x5 = self.down4(x4) - x = self.up1(x5, x4) - x = self.up2(x, x3) - x = self.up3(x, x2) - x = self.up4(x, x1) - x = self.outc(x) - x = torch.sigmoid(x) - return x - - -model_unet = UNet() - -optimizer_adam = optim.Adam(model_unet.parameters(), lr=1e-4) - -import os -import PIL -from torch.utils.data import Dataset, DataLoader -from torchvision import transforms as tsf -from skimage import io -from openfl.utilities import validate_file_hash - -os.makedirs('data', exist_ok=True) -os.system( - "wget -nc 'https://datasets.simula.no/hyper-kvasir/hyper-kvasir-segmented-images.zip' -O ./data/kvasir.zip") -ZIP_SHA384 = 'e30d18a772c6520476e55b610a4db457237f151e' \ - '19182849d54b49ae24699881c1e18e0961f77642be900450ef8b22e7' -validate_file_hash('./data/kvasir.zip', ZIP_SHA384) -os.system('unzip -n ./data/kvasir.zip -d ./data') - -DATA_PATH = './data/segmented-images/' -import numpy as np - - -def read_data(image_path, mask_path): - """ - Read image and mask from disk. - """ - img = io.imread(image_path) - assert (img.shape[2] == 3) - mask = io.imread(mask_path) - return (img, mask[:, :, 0].astype(np.uint8)) - - -class KvasirDataset(Dataset): - """ - Kvasir dataset contains 1000 images for all collaborators. 
- Args: - data_path: path to dataset on disk - collaborator_count: total number of collaborators - collaborator_num: number of current collaborator - is_validation: validation option - """ - - def __init__(self, images_path='./data/segmented-images/images/', - masks_path='./data/segmented-images/masks/', - validation_fraction=1 / 8, is_validation=False): - - self.images_path = images_path - self.masks_path = masks_path - self.images_names = [ - img_name - for img_name in sorted(os.listdir(self.images_path)) - if len(img_name) > 3 and img_name[-3:] == 'jpg' - ] - - assert (len(self.images_names) > 2), "Too few images" - - validation_size = max(1, int(len(self.images_names) * validation_fraction)) - - if is_validation: - self.images_names = self.images_names[-validation_size:] - else: - self.images_names = self.images_names[: -validation_size] - - # Prepare transforms - self.img_trans = tsf.Compose([ - tsf.ToPILImage(), - tsf.Resize((332, 332)), - tsf.ToTensor(), - tsf.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5])]) - self.mask_trans = tsf.Compose([ - tsf.ToPILImage(), - tsf.Resize((332, 332), interpolation=PIL.Image.NEAREST), - tsf.ToTensor()]) - - def __getitem__(self, index): - name = self.images_names[index] - img, mask = read_data(self.images_path + name, self.masks_path + name) - img = self.img_trans(img).numpy() - mask = self.mask_trans(mask).numpy() - return img, mask - - def __len__(self): - return len(self.images_names) - - -def function_defined_in_notebook(): - print('I will cause problems') - - -def train(unet_model, train_loader, optimizer, device, loss_fn=soft_dice_loss): - function_defined_in_notebook() - - unet_model.train() - unet_model.to(device) - - losses = [] - - for data, target in train_loader: - data, target = torch.tensor(data).to(device), torch.tensor( - target).to(device, dtype=torch.float32) - optimizer.zero_grad() - output = unet_model(data) - loss = loss_fn(output=output, target=target) - loss.backward() - optimizer.step() - losses.append(loss.detach().cpu().numpy()) - - return {'train_loss': np.mean(losses), } - - -def validate(unet_model, val_loader, device): - unet_model.eval() - unet_model.to(device) - - val_score = 0 - total_samples = 0 - - with torch.no_grad(): - for data, target in val_loader: - samples = target.shape[0] - total_samples += samples - data, target = torch.tensor(data).to(device), \ - torch.tensor(target).to(device, dtype=torch.int64) - output = unet_model(data) - val = soft_dice_coef(output, target) - val_score += val.sum().cpu().numpy() - - return {'dice_coef': val_score / total_samples, } - - -from openfl.interface.interactive_api.experiment import TaskInterface, DataInterface, \ - ModelInterface, FLExperiment - -from copy import deepcopy - -framework_adapter = 'openfl.plugins.frameworks_adapters.pytorch_adapter.FrameworkAdapterPlugin' -model_interface = ModelInterface(model=model_unet, optimizer=optimizer_adam, - framework_plugin=framework_adapter) - -# Save the initial model state -initial_model = deepcopy(model_unet) - - -class UserDataset: - def __init__(self, path_to_local_data): - print(f'User Dataset initialized with {path_to_local_data}') - - -class OpenflMixin: - def _delayed_init(self): - raise NotImplementedError - - -class FedDataset(OpenflMixin): - def __init__(self, UserDataset): - self.user_dataset_class = UserDataset - print('We implement all abstract methods from mixin in this class') - - def _delayed_init(self, data_path): - print('This method is called on the collaborator node') - dataset_obj = 
self.user_dataset_class(data_path) - - -fed_dataset = FedDataset(UserDataset) -fed_dataset._delayed_init('data path on the collaborator node') - - -class FedDataset(DataInterface): - def __init__(self, UserDatasetClass, **kwargs): - self.UserDatasetClass = UserDatasetClass - self.kwargs = kwargs - - def _delayed_init(self, data_path='1,1'): - # With the next command the local dataset will be loaded on the collaborator node - # For this example we have the same dataset on the same path, and we will shard it - # So we use `data_path` information for this purpose. - self.rank, self.world_size = [int(part) for part in data_path.split(',')] - - validation_fraction = 1 / 8 - self.train_set = self.UserDatasetClass(validation_fraction=validation_fraction, - is_validation=False) - self.valid_set = self.UserDatasetClass(validation_fraction=validation_fraction, - is_validation=True) - - # Do the actual sharding - self._do_sharding(self.rank, self.world_size) - - def _do_sharding(self, rank, world_size): - # This method relies on the dataset's implementation - # i.e. coupled in a bad way - self.train_set.images_names = self.train_set.images_names[rank - 1:: world_size] - - def get_train_loader(self, **kwargs): - """ - Output of this method will be provided to tasks with optimizer in contract - """ - return DataLoader( - self.train_set, num_workers=8, batch_size=self.kwargs['train_bs'], shuffle=True - ) - - def get_valid_loader(self, **kwargs): - """ - Output of this method will be provided to tasks without optimizer in contract - """ - return DataLoader(self.valid_set, num_workers=8, batch_size=self.kwargs['valid_bs']) - - def get_train_data_size(self): - """ - Information for aggregation - """ - return len(self.train_set) - - def get_valid_data_size(self): - """ - Information for aggregation - """ - return len(self.valid_set) - - -fed_dataset = FedDataset(KvasirDataset, train_bs=8, valid_bs=8) - -TI = TaskInterface() - -import tqdm - - -def function_defined_in_notebook(some_parameter): - print('I will cause problems') - print(f'Also I accept a parameter and it is {some_parameter}') - - -# We do not actually need to register additional kwargs, Just serialize them -@TI.add_kwargs(**{'some_parameter': 42}) -@TI.register_fl_task(model='unet_model', data_loader='train_loader', - device='device', optimizer='optimizer') -def train(unet_model, train_loader, optimizer, device, loss_fn=soft_dice_loss, some_parameter=None): - if not torch.cuda.is_available(): - device = 'cpu' - - function_defined_in_notebook(some_parameter) - - train_loader = tqdm.tqdm(train_loader, desc="train") - - unet_model.train() - unet_model.to(device) - - losses = [] - - for data, target in train_loader: - data, target = torch.tensor(data).to(device), torch.tensor( - target).to(device, dtype=torch.float32) - optimizer.zero_grad() - output = unet_model(data) - loss = loss_fn(output=output, target=target) - loss.backward() - optimizer.step() - losses.append(loss.detach().cpu().numpy()) - - return {'train_loss': np.mean(losses), } - - -@TI.register_fl_task(model='unet_model', data_loader='val_loader', device='device') -def validate(unet_model, val_loader, device): - unet_model.eval() - unet_model.to(device) - - val_loader = tqdm.tqdm(val_loader, desc="validate") - - val_score = 0 - total_samples = 0 - - with torch.no_grad(): - for data, target in val_loader: - samples = target.shape[0] - total_samples += samples - data, target = torch.tensor(data).to(device), \ - torch.tensor(target).to(device, dtype=torch.int64) - output = 
unet_model(data) - val = soft_dice_coef(output, target) - val_score += val.sum().cpu().numpy() - - return {'dice_coef': val_score / total_samples, } - - # Create a federation - - -from openfl.interface.interactive_api.federation import Federation - -# 1) Run with TLS disabled (trusted environment) -# will determine fqdn by itself -from socket import getfqdn - -federation = Federation(central_node_fqdn=getfqdn(), tls=False) -# First number which is a collaborators rank is also passed as a cuda device identifier -col_data_paths = {'one': '1,2', - 'two': '2,2'} -federation.register_collaborators(col_data_paths=col_data_paths) - -# -------------------------------------------------------------------------------------------------------------------- -# 2) Run with aggregator-collaborator mTLS -# If the user wants to enable mTLS their must provide CA root chain, and signed key pair to the federation interface -# cert_chain = 'cert/cert_chain.crt' -# agg_certificate = 'cert/agg_certificate.crt' -# agg_private_key = 'cert/agg_private.key' - -# federation = Federation(central_node_fqdn=getfqdn(), tls=False, -# cert_chain=cert_chain, agg_certificate=agg_certificate, agg_private_key=agg_private_key) -# col_data_paths = {'one': '1,1',} -# federation.register_collaborators(col_data_paths=col_data_paths) - -# create an experimnet in federation -fl_experiment = FLExperiment(federation=federation, ) - -# If I use autoreload I got a pickling error - -# # The following command zips the workspace and python requirements to be transfered to collaborator nodes -# fl_experiment.prepare_workspace_distribution(model_provider=MI, task_keeper=TI, data_loader=fed_dataset, rounds_to_train=7, \ -# opt_treatment='CONTINUE_GLOBAL') -# # # This command starts the aggregator server -# # fl_experiment.start_experiment(model_provider=MI) - - -# If I use autoreload I got a pickling error -arch_path = fl_experiment.prepare_workspace_distribution( - model_provider=model_interface, - task_keeper=TI, - data_loader=fed_dataset, - rounds_to_train=7, - opt_treatment='CONTINUE_GLOBAL' -) - -from tests.github.interactive_api.experiment_runner import run_experiment - -run_experiment(col_data_paths, model_interface, arch_path, fl_experiment) - -best_model = fl_experiment.get_best_model() -fed_dataset._delayed_init() - -logger.info('Validating initial model') -validate(initial_model, fed_dataset.get_valid_loader(), 'cpu') - -logger.info('Validating trained model') -validate(best_model, fed_dataset.get_valid_loader(), 'cpu') diff --git a/tests/github/interactive_api/single_file_experiments/requirements.txt b/tests/github/interactive_api/single_file_experiments/requirements.txt deleted file mode 100644 index 9f5632775a..0000000000 --- a/tests/github/interactive_api/single_file_experiments/requirements.txt +++ /dev/null @@ -1,4 +0,0 @@ -tensorflow==2.3.1 -torch==1.7.1 -torchvision==0.8.2 -scikit-image==0.17.2 \ No newline at end of file diff --git a/tests/github/interactive_api/single_file_experiments/tensorflow_mnist.py b/tests/github/interactive_api/single_file_experiments/tensorflow_mnist.py deleted file mode 100644 index 6a7901aa08..0000000000 --- a/tests/github/interactive_api/single_file_experiments/tensorflow_mnist.py +++ /dev/null @@ -1,218 +0,0 @@ -# Describe the model and optimizer -import logging - -import tensorflow as tf -from tensorflow import keras -from tensorflow.keras import layers -import numpy as np - -logger = logging.getLogger(__name__) - -inputs = keras.Input(shape=(784,), name="digits") -x1 = layers.Dense(64, 
activation="relu")(inputs) -x2 = layers.Dense(64, activation="relu")(x1) -outputs = layers.Dense(10, name="predictions")(x2) -model = keras.Model(inputs=inputs, outputs=outputs) - -# Instantiate an optimizer. -optimizer = keras.optimizers.SGD(learning_rate=1e-3) -# Instantiate a loss function. -loss_fn = keras.losses.SparseCategoricalCrossentropy(from_logits=True) - -# Prepare the metrics. -train_acc_metric = keras.metrics.SparseCategoricalAccuracy() -val_acc_metric = keras.metrics.SparseCategoricalAccuracy() - -# Prepare data - -# Prepare the training dataset. -batch_size = 64 -(x_train, y_train), (x_test, y_test) = keras.datasets.mnist.load_data() -x_train = np.reshape(x_train, (-1, 784)) -x_test = np.reshape(x_test, (-1, 784)) - -X_valid = x_train[-10000:] -y_valid = y_train[-10000:] -X_train = x_train[:-10000] -y_train = y_train[:-10000] - -# Describing FL experiment - -from openfl.interface.interactive_api.experiment import TaskInterface, DataInterface, \ - ModelInterface, FLExperiment - -framework_adapter = 'openfl.plugins.frameworks_adapters.keras_adapter.FrameworkAdapterPlugin' -model_interface = ModelInterface(model=model, optimizer=optimizer, - framework_plugin=framework_adapter) - - -# Register dataset - -class FedDataset(DataInterface): - def __init__(self, x_train, y_train, x_valid, y_valid, **kwargs): - self.X_train = X_train - self.y_train = y_train - self.X_valid = X_valid - self.y_valid = y_valid - self.batch_size = kwargs['batch_size'] - self.kwargs = kwargs - self._setup_datasets() - - def _setup_datasets(self): - self.train_dataset = tf.data.Dataset.from_tensor_slices((self.X_train, self.y_train)) - self.train_dataset = self.train_dataset.shuffle(buffer_size=1024).batch(self.batch_size) - self.valid_dataset = tf.data.Dataset.from_tensor_slices((self.X_valid, self.y_valid)) - self.valid_dataset = self.valid_dataset.shuffle(buffer_size=1024).batch(self.batch_size) - - def _delayed_init(self, data_path='1,1'): - # With the next command the local dataset will be loaded on the collaborator node - # For this example we have the same dataset on the same path, and we will shard it - # So we use `data_path` information for this purpose. 
- self.rank, self.world_size = [int(part) for part in data_path.split(',')] - - # Do the actual sharding - self._do_sharding(self.rank, self.world_size) - - def _do_sharding(self, rank, world_size): - self.X_train = self.X_train[rank - 1:: world_size] - self.y_train = self.y_train[rank - 1:: world_size] - self.X_valid = self.X_valid[rank - 1:: world_size] - self.y_valid = self.y_valid[rank - 1:: world_size] - self._setup_datasets() - - def get_train_loader(self, **kwargs): - """ - Output of this method will be provided to tasks with optimizer in contract - """ - return self.train_dataset - - def get_valid_loader(self, **kwargs): - """ - Output of this method will be provided to tasks without optimizer in contract - """ - return self.valid_dataset - - def get_train_data_size(self): - """ - Information for aggregation - """ - return len(self.X_train) - - def get_valid_data_size(self): - """ - Information for aggregation - """ - return len(self.X_valid) - - -fed_dataset = FedDataset(X_train, y_train, X_valid, y_valid, batch_size=batch_size) - -# Register tasks - -TI = TaskInterface() - -import time - - -@TI.register_fl_task(model='model', data_loader='train_dataset', - device='device', optimizer='optimizer') -def train(model, train_dataset, optimizer, device, loss_fn=loss_fn, warmup=False): - start_time = time.time() - - # Iterate over the batches of the dataset. - for step, (x_batch_train, y_batch_train) in enumerate(train_dataset): - with tf.GradientTape() as tape: - logits = model(x_batch_train, training=True) - loss_value = loss_fn(y_batch_train, logits) - grads = tape.gradient(loss_value, model.trainable_weights) - optimizer.apply_gradients(zip(grads, model.trainable_weights)) - - # Update training metric. - train_acc_metric.update_state(y_batch_train, logits) - - # Log every 200 batches. - if step % 200 == 0: - print( - "Training loss (for one batch) at step %d: %.4f" - % (step, float(loss_value)) - ) - print("Seen so far: %d samples" % ((step + 1) * 64)) - if warmup: - break - - # Display metrics at the end of each epoch. - train_acc = train_acc_metric.result() - print("Training acc over epoch: %.4f" % (float(train_acc),)) - - # Reset training metrics at the end of each epoch - train_acc_metric.reset_states() - - return {'train_acc': train_acc, } - - -@TI.register_fl_task(model='model', data_loader='val_dataset', device='device') -def validate(model, val_dataset, device): - # Run a validation loop at the end of each epoch. - for x_batch_val, y_batch_val in val_dataset: - val_logits = model(x_batch_val, training=False) - # Update val metrics - val_acc_metric.update_state(y_batch_val, val_logits) - val_acc = val_acc_metric.result() - val_acc_metric.reset_states() - print("Validation acc: %.4f" % (float(val_acc),)) - - return {'validation_accuracy': val_acc, } - - -# Perform model warm up -# The model warmup is necessary to initialize weights when using Tensorflow Gradient Tape - -train(model, fed_dataset.get_train_loader(), optimizer, 'cpu', warmup=True) - -# Make a copy of the model for later comparison -initial_model = tf.keras.models.clone_model(model) - -# Prepare Federated Dataset for Serialization -# tf.data.DataSet does not serialize well with pickle. 
It will be recreated on the -# collaborators with the delayed init function -fed_dataset.train_dataset = None -fed_dataset.valid_dataset = None - -# Start a federated learning experiment - -# Create a federation -from openfl.interface.interactive_api.federation import Federation - -# will determine fqdn by itself -federation = Federation(central_node_fqdn='localhost', tls=False) -# Datapath corresonds to 'RANK,WORLD_SIZE' -col_data_paths = {'one': '1,2', - 'two': '2,2'} -federation.register_collaborators(col_data_paths=col_data_paths) - -# create an experimnet in federation -fl_experiment = FLExperiment(federation=federation) - -# If I use autoreload I got a pickling error -arch_path = fl_experiment.prepare_workspace_distribution( - model_provider=model_interface, - task_keeper=TI, - data_loader=fed_dataset, - rounds_to_train=7, - opt_treatment='CONTINUE_GLOBAL' -) - - -from tests.github.interactive_api.experiment_runner import run_experiment - - -run_experiment(col_data_paths, model_interface, arch_path, fl_experiment) - -best_model = fl_experiment.get_best_model() -fed_dataset._delayed_init() - -logger.info('Validating initial model') -validate(initial_model, fed_dataset.get_valid_loader(), 'cpu') - -logger.info('Validating trained model') -validate(best_model, fed_dataset.get_valid_loader(), 'cpu') diff --git a/tests/github/interactive_api_director/experiment_runner.py b/tests/github/interactive_api_director/experiment_runner.py index a789146ec7..f3685e018e 100644 --- a/tests/github/interactive_api_director/experiment_runner.py +++ b/tests/github/interactive_api_director/experiment_runner.py @@ -59,31 +59,45 @@ def run_experiment(col_data_paths, model_interface, arch_path, fl_experiment): logger.info('The experiment completed!') -def create_director(director_path, recreate): - logger.info('Creating the director!') +def create_director(director_path, recreate, config): + logger.info(f'Creating the director in {director_path}!') if os.path.exists(director_path): if not recreate: return shutil.rmtree(director_path) - os.makedirs(director_path) - # TODO: copy data to director directory (certificates) + subprocess.Popen( + f'fx director create-workspace -p {director_path}', + shell=True + ).wait() + shutil.copy(config, director_path) -def create_envoy(col_path, recreate): +def create_envoy(col_path, recreate, shard_config, shard_descriptor): logger.info(f'Creating the envoy in {col_path}!') if os.path.exists(col_path): if not recreate: return shutil.rmtree(col_path) - os.makedirs(col_path) - # TODO: copy data to envoy directory (certificates) - - -def create_federation(director_path: str, collaborator_paths: typing.Iterable[str], recreate=False): + subprocess.Popen( + f'fx envoy create-workspace -p {col_path}', + shell=True + ).wait() + shutil.copy(shard_config, col_path) + shutil.copy(shard_descriptor, col_path) + + +def create_federation( + director_path: str, + collaborator_paths: typing.Iterable[str], + director_config, + shard_config, + shard_descriptor, + recreate=False +): logger.info('Creating the federation!') - create_director(director_path, recreate) + create_director(director_path, recreate, director_config) for col_path in collaborator_paths: - create_envoy(col_path, recreate) + create_envoy(col_path, recreate, shard_config, shard_descriptor) # TODO: create mTLS logger.info('Federation was created') @@ -100,7 +114,8 @@ def run_federation(shards: typing.Dict[str, Shard], director_path: str): logger.info('Starting the experiment!') running_processes = [] p = subprocess.Popen( - 
f"fx director start", shell=True, + f"fx director start --disable-tls", + shell=True, cwd=os.path.join(director_path) ) sleep(2) diff --git a/tests/github/interactive_api_director/experiments/pytorch_kvasir_unet/dataset.py b/tests/github/interactive_api_director/experiments/pytorch_kvasir_unet/dataset.py index 19b5871070..61cb184b6f 100644 --- a/tests/github/interactive_api_director/experiments/pytorch_kvasir_unet/dataset.py +++ b/tests/github/interactive_api_director/experiments/pytorch_kvasir_unet/dataset.py @@ -3,7 +3,7 @@ from torch.utils.data import Dataset, DataLoader from torchvision import transforms as tsf -from tests.github.interactive_api.experiments.pytorch_kvasir_unet.data_loader import read_data +from tests.github.interactive_api_director.experiments.pytorch_kvasir_unet.data_loader import read_data from openfl.interface.interactive_api.experiment import DataInterface diff --git a/tests/github/interactive_api_director/experiments/pytorch_kvasir_unet/director/director_config.yaml b/tests/github/interactive_api_director/experiments/pytorch_kvasir_unet/director/director_config.yaml new file mode 100644 index 0000000000..9b45b60869 --- /dev/null +++ b/tests/github/interactive_api_director/experiments/pytorch_kvasir_unet/director/director_config.yaml @@ -0,0 +1,4 @@ +settings: + listen_ip: localhost + sample_shape: ['300', '400', '3'] + target_shape: ['300', '400'] \ No newline at end of file diff --git a/tests/github/interactive_api_director/experiments/pytorch_kvasir_unet/director/start_director.sh b/tests/github/interactive_api_director/experiments/pytorch_kvasir_unet/director/start_director.sh new file mode 100644 index 0000000000..5806a6cc0a --- /dev/null +++ b/tests/github/interactive_api_director/experiments/pytorch_kvasir_unet/director/start_director.sh @@ -0,0 +1,4 @@ +#!/bin/bash +set -e + +fx director start --disable-tls -c director_config.yaml \ No newline at end of file diff --git a/tests/github/interactive_api_director/experiments/pytorch_kvasir_unet/envoy/kvasir_shard_descriptor.py b/tests/github/interactive_api_director/experiments/pytorch_kvasir_unet/envoy/kvasir_shard_descriptor.py new file mode 100644 index 0000000000..8cf2debada --- /dev/null +++ b/tests/github/interactive_api_director/experiments/pytorch_kvasir_unet/envoy/kvasir_shard_descriptor.py @@ -0,0 +1,139 @@ +# Copyright (C) 2020-2021 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 +"""Kvasir shard descriptor.""" + + +import os +from pathlib import Path +from typing import List + +import numpy as np +from PIL import Image + +from openfl.interface.interactive_api.shard_descriptor import ShardDescriptor +from openfl.utilities import validate_file_hash + + +class KvasirShardDescriptor(ShardDescriptor): + """Shard descriptor class.""" + + def __init__(self, data_folder: str = 'kvasir_data', + rank=1, + worldsize=1, + enforce_image_hw: List[int] = None) -> None: + """Initialize KvasirShardDescriptor.""" + super().__init__() + + self.data_folder = Path.cwd() / data_folder + self.download_data(self.data_folder) + + # Settings for resizing data + self.enforce_image_hw = None + if enforce_image_hw is not None: + self.enforce_image_hw = tuple(enforce_image_hw) + # Settings for sharding the dataset + self.rank = rank + self.worldsize = worldsize + + self.images_path = self.data_folder / 'segmented-images' / 'images' + self.masks_path = self.data_folder / 'segmented-images' / 'masks' + + self.images_names = [ + img_name + for img_name in sorted(os.listdir(self.images_path)) + if Path(img_name).suffix == 
'.jpg' + ] + # Sharding + self.images_names = self.images_names[self.rank - 1::self.worldsize] + + # Calculating data and target shapes + sample, target = self[0] + self._sample_shape = [str(dim) for dim in sample.shape] + self._target_shape = [str(dim) for dim in target.shape] + + @staticmethod + def download_data(data_folder): + """Download data.""" + zip_file_path = data_folder / 'kvasir.zip' + os.makedirs(data_folder, exist_ok=True) + os.system('wget -nc' + " 'https://datasets.simula.no/hyper-kvasir/hyper-kvasir-segmented-images.zip'" + f' -O {zip_file_path.relative_to(Path.cwd())}') + zip_sha384 = ('e30d18a772c6520476e55b610a4db457237f151e' + '19182849d54b49ae24699881c1e18e0961f77642be900450ef8b22e7') + validate_file_hash(zip_file_path, zip_sha384) + os.system(f'unzip -n {zip_file_path.relative_to(Path.cwd())}' + f' -d {data_folder.relative_to(Path.cwd())}') + + def __getitem__(self, index): + """Return a item by the index.""" + name = self.images_names[index] + # Reading data + img = Image.open(self.images_path / name) + mask = Image.open(self.masks_path / name) + if self.enforce_image_hw is not None: + # If we need to resize data + # PIL accepts (w,h) tuple, not (h,w) + img = img.resize(self.enforce_image_hw[::-1]) + mask = mask.resize(self.enforce_image_hw[::-1]) + img = np.asarray(img) + mask = np.asarray(mask) + assert img.shape[2] == 3 + + return img, mask[:, :, 0].astype(np.uint8) + + def __len__(self): + """Return the len of the dataset.""" + return len(self.images_names) + + @property + def sample_shape(self): + """Return the sample shape info.""" + return self._sample_shape + + @property + def target_shape(self): + """Return the target shape info.""" + return self._target_shape + + @property + def dataset_description(self) -> str: + """Return the dataset description.""" + return (f'Kvasir dataset, shard number {self.rank}' + f' out of {self.worldsize}') + + +if __name__ == '__main__': + from openfl.interface.cli import setup_logging + setup_logging() + + data_folder = 'data' + rank = 1 + worldsize = 100 + enforce_image_hw = [300, 400] + + kvasir_sd = KvasirShardDescriptor( + data_folder, + rank=rank, + worldsize=worldsize, + enforce_image_hw=enforce_image_hw) + + print(kvasir_sd.dataset_description) + print(kvasir_sd.sample_shape, kvasir_sd.target_shape) + + from openfl.component.envoy.envoy import Envoy + + shard_name = 'one' + director_host = 'localhost' + director_port = 50051 + + + keeper = Envoy( + shard_name=shard_name, + director_host=director_host, + director_port=director_port, + shard_descriptor=kvasir_sd, + tls=False + ) + + keeper.start() diff --git a/tests/github/interactive_api_director/experiments/pytorch_kvasir_unet/envoy/sd_requirements.txt b/tests/github/interactive_api_director/experiments/pytorch_kvasir_unet/envoy/sd_requirements.txt new file mode 100644 index 0000000000..50acd55c94 --- /dev/null +++ b/tests/github/interactive_api_director/experiments/pytorch_kvasir_unet/envoy/sd_requirements.txt @@ -0,0 +1,3 @@ +numpy +pillow +scikit-image \ No newline at end of file diff --git a/tests/github/interactive_api_director/experiments/pytorch_kvasir_unet/envoy/shard_config.yaml b/tests/github/interactive_api_director/experiments/pytorch_kvasir_unet/envoy/shard_config.yaml new file mode 100644 index 0000000000..80a864f944 --- /dev/null +++ b/tests/github/interactive_api_director/experiments/pytorch_kvasir_unet/envoy/shard_config.yaml @@ -0,0 +1,6 @@ +template: kvasir_shard_descriptor.KvasirShardDescriptor +params: + data_folder: kvasir_data + rank: 1 + 
worldsize: 90 + enforce_image_hw: [300, 400] \ No newline at end of file diff --git a/tests/github/interactive_api_director/experiments/pytorch_kvasir_unet/envoy/start_envoy.sh b/tests/github/interactive_api_director/experiments/pytorch_kvasir_unet/envoy/start_envoy.sh new file mode 100644 index 0000000000..222d3988e0 --- /dev/null +++ b/tests/github/interactive_api_director/experiments/pytorch_kvasir_unet/envoy/start_envoy.sh @@ -0,0 +1,4 @@ +#!/bin/bash +set -e + +fx envoy start -n env_one --disable-tls --shard-config-path shard_config.yaml -dh localhost -dp 50051 \ No newline at end of file diff --git a/tests/github/interactive_api_director/experiments/pytorch_kvasir_unet/experiment.py b/tests/github/interactive_api_director/experiments/pytorch_kvasir_unet/experiment.py index cc20d7bfb1..8483a8c741 100644 --- a/tests/github/interactive_api_director/experiments/pytorch_kvasir_unet/experiment.py +++ b/tests/github/interactive_api_director/experiments/pytorch_kvasir_unet/experiment.py @@ -1,59 +1,135 @@ -import logging -from socket import getfqdn +from copy import deepcopy +import numpy as np +import PIL import torch.optim as optim +from tests.github.interactive_api_director.experiments.pytorch_kvasir_unet.model import UNet +from tests.github.interactive_api_director.experiments.pytorch_kvasir_unet.tasks import task_interface +from tests.github.interactive_api_director.experiments.pytorch_kvasir_unet.tasks import validate +from torch.utils.data import DataLoader +from torch.utils.data import Dataset +from torch.utils.data import SubsetRandomSampler +from torchvision import transforms as tsf - -from tests.github.interactive_api.experiments.pytorch_kvasir_unet.model import UNet -from tests.github.interactive_api.experiments.pytorch_kvasir_unet.data_loader import load_data -from tests.github.interactive_api.experiments.pytorch_kvasir_unet.dataset import KvasirDataset, FedDataset -from openfl.interface.interactive_api.experiment import ModelInterface, FLExperiment +from openfl.interface.interactive_api.experiment import DataInterface +from openfl.interface.interactive_api.experiment import FLExperiment +from openfl.interface.interactive_api.experiment import ModelInterface from openfl.interface.interactive_api.federation import Federation -from tests.github.interactive_api.experiments.pytorch_kvasir_unet.tasks import validate, task_interface -# from tests.github.interactive_api.experiment_runner import run_experiment -from copy import deepcopy +federation = Federation(client_id='frontend', director_node_fqdn='localhost', director_port=50051, tls=False) + +shard_registry = federation.get_shard_registry() +shard_registry + +dummy_shard_desc = federation.get_dummy_shard_descriptor(size=10) +sample, target = dummy_shard_desc[0] + + +# Now you can implement you data loaders using dummy_shard_desc +class KvasirSD(DataInterface, Dataset): + + def __init__(self, train_bs, valid_bs, validation_fraction=1/8, **kwargs): + super().__init__(**kwargs) + + self.validation_fraction = validation_fraction + + # Prepare transforms + self.img_trans = tsf.Compose([ + tsf.ToPILImage(), + tsf.Resize((332, 332)), + tsf.ToTensor(), + tsf.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5])]) + self.mask_trans = tsf.Compose([ + tsf.ToPILImage(), + tsf.Resize((332, 332), interpolation=PIL.Image.NEAREST), + tsf.ToTensor()]) + self.train_bs = train_bs + self.valid_bs = valid_bs + + @property + def shard_descriptor(self): + return self._shard_descriptor + + @shard_descriptor.setter + def shard_descriptor(self, 
shard_descriptor): + """ + Describe per-collaborator procedures or sharding. -logger = logging.getLogger(__name__) + This method will be called during a collaborator initialization. + Local shard_descriptor will be set by Envoy. + """ + self._shard_descriptor = shard_descriptor + + validation_size = max(1, int(len(self.shard_descriptor) * self.validation_fraction)) + + self.train_indeces = np.arange(len(self.shard_descriptor) - validation_size) + self.val_indeces = np.arange(len(self.shard_descriptor) - validation_size, len(self.shard_descriptor)) + + def __getitem__(self, index): + img, mask = self.shard_descriptor[index] + img = self.img_trans(img).numpy() + mask = self.mask_trans(mask).numpy() + return img, mask + + def __len__(self): + return len(self.shard_descriptor) + + + def get_train_loader(self): + """ + Output of this method will be provided to tasks with optimizer in contract + """ + train_sampler = SubsetRandomSampler(self.train_indeces) + return DataLoader( + self, num_workers=8, batch_size=self.train_bs, sampler=train_sampler + ) + + def get_valid_loader(self): + """ + Output of this method will be provided to tasks without optimizer in contract + """ + val_sampler = SubsetRandomSampler(self.val_indeces) + return DataLoader(self, num_workers=8, batch_size=self.valid_bs, sampler=val_sampler) + + def get_train_data_size(self): + """ + Information for aggregation + """ + return len(self.train_indeces) + + def get_valid_data_size(self): + """ + Information for aggregation + """ + return len(self.val_indeces) + +fed_dataset = KvasirSD(train_bs=4, valid_bs=8) +fed_dataset.shard_descriptor = dummy_shard_desc +for i, (sample, target) in enumerate(fed_dataset.get_train_loader()): + print(sample.shape) model_unet = UNet() optimizer_adam = optim.Adam(model_unet.parameters(), lr=1e-4) -load_data() - framework_adapter = 'openfl.plugins.frameworks_adapters.pytorch_adapter.FrameworkAdapterPlugin' -model_interface = ModelInterface(model=model_unet, optimizer=optimizer_adam, - framework_plugin=framework_adapter) +MI = ModelInterface(model=model_unet, optimizer=optimizer_adam, framework_plugin=framework_adapter) # Save the initial model state initial_model = deepcopy(model_unet) -fed_dataset = FedDataset(KvasirDataset, train_bs=8, valid_bs=8) -federation = Federation(central_node_fqdn=getfqdn(), tls=False) - -# First number which is a collaborators rank is also passed as a cuda device identifier -col_data_paths = {'one': '1,2', - 'two': '2,2'} -federation.register_collaborators(col_data_paths=col_data_paths) -fl_experiment = FLExperiment(federation=federation) - -# If I use autoreload I got a pickling error -arch_path = fl_experiment.prepare_workspace_distribution( - model_provider=model_interface, - task_keeper=task_interface, - data_loader=fed_dataset, - rounds_to_train=7, - opt_treatment='CONTINUE_GLOBAL' -) - -# run_experiment(col_data_paths, model_interface, arch_path, fl_experiment) -# -# best_model = fl_experiment.get_best_model() -# fed_dataset._delayed_init() -# -# logger.info('Validating initial model') -# validate(initial_model, fed_dataset.get_valid_loader(), 'cpu') -# -# logger.info('Validating trained model') -# validate(best_model, fed_dataset.get_valid_loader(), 'cpu') +# create an experimnet in federation +experiment_name = 'kvasir_test_experiment' +fl_experiment = FLExperiment(federation=federation, experiment_name=experiment_name) + +fl_experiment.start(model_provider=MI, + task_keeper=task_interface, + data_loader=fed_dataset, + rounds_to_train=2, + 
opt_treatment='CONTINUE_GLOBAL') +fl_experiment.stream_metrics() +best_model = fl_experiment.get_best_model() +fl_experiment.remove_experiment_data() +best_model.inc.conv[0].weight +validate(initial_model, fed_dataset.get_valid_loader(), 'cpu') +validate(best_model, fed_dataset.get_valid_loader(), 'cpu') \ No newline at end of file diff --git a/tests/github/interactive_api/layers.py b/tests/github/interactive_api_director/experiments/pytorch_kvasir_unet/layers.py similarity index 88% rename from tests/github/interactive_api/layers.py rename to tests/github/interactive_api_director/experiments/pytorch_kvasir_unet/layers.py index 73b2eee2fe..5165dcc97e 100644 --- a/tests/github/interactive_api/layers.py +++ b/tests/github/interactive_api_director/experiments/pytorch_kvasir_unet/layers.py @@ -26,12 +26,12 @@ def soft_dice_coef(output, target): return score.sum() -class double_conv(nn.Module): +class DoubleConv(nn.Module): """Pytorch double conv class.""" def __init__(self, in_ch, out_ch): """Initialize layer.""" - super(double_conv, self).__init__() + super(DoubleConv, self).__init__() self.in_ch = in_ch self.out_ch = out_ch self.conv = nn.Sequential( @@ -49,15 +49,15 @@ def forward(self, x): return x -class down(nn.Module): +class Down(nn.Module): """Pytorch nn module subclass.""" def __init__(self, in_ch, out_ch): """Initialize layer.""" - super(down, self).__init__() + super(Down, self).__init__() self.mpconv = nn.Sequential( nn.MaxPool2d(2), - double_conv(in_ch, out_ch) + DoubleConv(in_ch, out_ch) ) def forward(self, x): @@ -66,23 +66,23 @@ def forward(self, x): return x -class up(nn.Module): +class Up(nn.Module): """Pytorch nn module subclass.""" def __init__(self, in_ch, out_ch, bilinear=False): """Initialize layer.""" - super(up, self).__init__() + super(Up, self).__init__() self.in_ch = in_ch self.out_ch = out_ch if bilinear: self.up = nn.Upsample( scale_factor=2, - mode="bilinear", + mode='bilinear', align_corners=True ) else: self.up = nn.ConvTranspose2d(in_ch, in_ch // 2, 2, stride=2) - self.conv = double_conv(in_ch, out_ch) + self.conv = DoubleConv(in_ch, out_ch) def forward(self, x1, x2): """Do forward pass.""" diff --git a/tests/github/interactive_api_director/experiments/pytorch_kvasir_unet/model.py b/tests/github/interactive_api_director/experiments/pytorch_kvasir_unet/model.py index a43e29148e..f8102c1ad4 100644 --- a/tests/github/interactive_api_director/experiments/pytorch_kvasir_unet/model.py +++ b/tests/github/interactive_api_director/experiments/pytorch_kvasir_unet/model.py @@ -1,7 +1,7 @@ import torch import torch.nn as nn -from tests.github.interactive_api.layers import double_conv, down, up +from tests.github.interactive_api_director.experiments.pytorch_kvasir_unet.layers import DoubleConv, Down, Up """ UNet model definition @@ -11,15 +11,15 @@ class UNet(nn.Module): def __init__(self, n_channels=3, n_classes=1): super().__init__() - self.inc = double_conv(n_channels, 64) - self.down1 = down(64, 128) - self.down2 = down(128, 256) - self.down3 = down(256, 512) - self.down4 = down(512, 1024) - self.up1 = up(1024, 512) - self.up2 = up(512, 256) - self.up3 = up(256, 128) - self.up4 = up(128, 64) + self.inc = DoubleConv(n_channels, 64) + self.down1 = Down(64, 128) + self.down2 = Down(128, 256) + self.down3 = Down(256, 512) + self.down4 = Down(512, 1024) + self.up1 = Up(1024, 512) + self.up2 = Up(512, 256) + self.up3 = Up(256, 128) + self.up4 = Up(128, 64) self.outc = nn.Conv2d(64, n_classes, 1) def forward(self, x): diff --git 
a/tests/github/interactive_api_director/experiments/pytorch_kvasir_unet/run.sh b/tests/github/interactive_api_director/experiments/pytorch_kvasir_unet/run.sh new file mode 100644 index 0000000000..744513db0e --- /dev/null +++ b/tests/github/interactive_api_director/experiments/pytorch_kvasir_unet/run.sh @@ -0,0 +1,11 @@ +cd director +bash start_director.sh & + +sleep 3 +cd ../envoy +pip install -r sd_requirements.txt +python kvasir_shard_descriptor.py & + +sleep 2 +cd ../../../../../.. +python -m tests.github.interactive_api_director.experiments.pytorch_kvasir_unet.experiment diff --git a/tests/github/interactive_api_director/experiments/pytorch_kvasir_unet/tasks.py b/tests/github/interactive_api_director/experiments/pytorch_kvasir_unet/tasks.py index f895a23e7b..d788c531ff 100644 --- a/tests/github/interactive_api_director/experiments/pytorch_kvasir_unet/tasks.py +++ b/tests/github/interactive_api_director/experiments/pytorch_kvasir_unet/tasks.py @@ -3,7 +3,7 @@ import numpy as np from openfl.interface.interactive_api.experiment import TaskInterface -from tests.github.interactive_api.layers import soft_dice_loss, soft_dice_coef +from tests.github.interactive_api_director.experiments.pytorch_kvasir_unet.layers import soft_dice_loss, soft_dice_coef task_interface = TaskInterface() diff --git a/tests/github/interactive_api_director/experiments/tensorflow_mnist/dataset.py b/tests/github/interactive_api_director/experiments/tensorflow_mnist/dataset.py index f100a0ebcb..ae4dd71bed 100644 --- a/tests/github/interactive_api_director/experiments/tensorflow_mnist/dataset.py +++ b/tests/github/interactive_api_director/experiments/tensorflow_mnist/dataset.py @@ -1,60 +1,62 @@ import tensorflow as tf +import numpy as np from openfl.interface.interactive_api.experiment import DataInterface class FedDataset(DataInterface): - def __init__(self, X_train, y_train, X_valid, y_valid, **kwargs): - self.X_train = X_train - self.y_train = y_train - self.X_valid = X_valid - self.y_valid = y_valid - self.batch_size = kwargs['batch_size'] - self.kwargs = kwargs - self._setup_datasets() - - def _setup_datasets(self): - self.train_dataset = tf.data.Dataset.from_tensor_slices((self.X_train, self.y_train)) - self.train_dataset = self.train_dataset.shuffle(buffer_size=1024).batch(self.batch_size) - self.valid_dataset = tf.data.Dataset.from_tensor_slices((self.X_valid, self.y_valid)) - self.valid_dataset = self.valid_dataset.shuffle(buffer_size=1024).batch(self.batch_size) - - def _delayed_init(self, data_path='1,1'): - # With the next command the local dataset will be loaded on the collaborator node - # For this example we have the same dataset on the same path, and we will shard it - # So we use `data_path` information for this purpose. - self.rank, self.world_size = [int(part) for part in data_path.split(',')] - - # Do the actual sharding - self._do_sharding(self.rank, self.world_size) - - def _do_sharding(self, rank, world_size): - self.X_train = self.X_train[rank - 1:: world_size] - self.y_train = self.y_train[rank - 1:: world_size] - self.X_valid = self.X_valid[rank - 1:: world_size] - self.y_valid = self.y_valid[rank - 1:: world_size] - self._setup_datasets() + def __init__(self, train_bs, valid_bs, **kwargs): + super().__init__(**kwargs) + self.train_bs = train_bs + self.valid_bs = valid_bs + + @property + def shard_descriptor(self): + return self._shard_descriptor + + @shard_descriptor.setter + def shard_descriptor(self, shard_descriptor): + """ + Describe per-collaborator procedures or sharding. 
+ + This method will be called during collaborator initialization. + Local shard_descriptor will be set by Envoy. + """ + self._shard_descriptor = shard_descriptor + validation_size = len(self.shard_descriptor) // 10 + self.train_indices = np.arange(len(self.shard_descriptor) - validation_size) + self.val_indices = np.arange(len(self.shard_descriptor) - validation_size, len(self.shard_descriptor)) def get_train_loader(self, **kwargs): """ Output of this method will be provided to tasks with optimizer in contract """ - return self.train_dataset + samples, targets = [], [] + for i in self.train_indices: + sample, target = self.shard_descriptor[i] + samples.append(sample) + targets.append(target) + samples = np.array(samples) + targets = np.array(targets) + return tf.data.Dataset.from_tensor_slices((samples, targets)).batch(self.train_bs) def get_valid_loader(self, **kwargs): """ Output of this method will be provided to tasks without optimizer in contract """ - return self.valid_dataset + samples, targets = zip(*[self.shard_descriptor[i] for i in self.val_indices]) + samples = np.array(samples) + targets = np.array(targets) + return tf.data.Dataset.from_tensor_slices((samples, targets)).batch(self.valid_bs) def get_train_data_size(self): """ Information for aggregation """ - return len(self.X_train) + return len(self.train_indices) def get_valid_data_size(self): """ Information for aggregation """ - return len(self.X_valid) + return len(self.val_indices) diff --git a/tests/github/interactive_api_director/experiments/tensorflow_mnist/director/config.yaml b/tests/github/interactive_api_director/experiments/tensorflow_mnist/director/config.yaml new file mode 100644 index 0000000000..7211f71161 --- /dev/null +++ b/tests/github/interactive_api_director/experiments/tensorflow_mnist/director/config.yaml @@ -0,0 +1,11 @@ +# Copyright (C) 2020-2021 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +# Director's config. +# Parameters: +# 1. sample_shape - sample shape interface unified across the Federation +# 2. 
target_shape - target shape interface unified across the Federation + +settings: + sample_shape: ['784'] + target_shape: [] \ No newline at end of file diff --git a/tests/github/interactive_api_director/experiments/tensorflow_mnist/director/start_director.sh b/tests/github/interactive_api_director/experiments/tensorflow_mnist/director/start_director.sh new file mode 100644 index 0000000000..8616553963 --- /dev/null +++ b/tests/github/interactive_api_director/experiments/tensorflow_mnist/director/start_director.sh @@ -0,0 +1,4 @@ +#!/bin/bash +set -e + +fx director start --disable-tls -c config.yaml \ No newline at end of file diff --git a/tests/github/interactive_api_director/experiments/tensorflow_mnist/envoy/shard_config.yaml b/tests/github/interactive_api_director/experiments/tensorflow_mnist/envoy/shard_config.yaml new file mode 100644 index 0000000000..d6dc9bccc6 --- /dev/null +++ b/tests/github/interactive_api_director/experiments/tensorflow_mnist/envoy/shard_config.yaml @@ -0,0 +1,3 @@ +template: shard_descriptor.MNISTShardDescriptor +params: + rank_worldsize: 1,90 \ No newline at end of file diff --git a/tests/github/interactive_api_director/experiments/tensorflow_mnist/envoy/shard_descriptor.py b/tests/github/interactive_api_director/experiments/tensorflow_mnist/envoy/shard_descriptor.py new file mode 100644 index 0000000000..e8b4ad8758 --- /dev/null +++ b/tests/github/interactive_api_director/experiments/tensorflow_mnist/envoy/shard_descriptor.py @@ -0,0 +1,92 @@ +# Copyright (C) 2020-2021 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 +"""MNIST shard descriptor.""" + +import numpy as np +from tensorflow import keras + +from openfl.interface.interactive_api.shard_descriptor import ShardDescriptor + + +class MNISTShardDescriptor(ShardDescriptor): + """Shard descriptor class.""" + + def __init__(self, rank_worldsize: str = '1,1') -> None: + """Initialize MNISTShardDescriptor.""" + super().__init__() + + (x_train, y_train), (x_test, y_test) = keras.datasets.mnist.load_data() + x_train = np.reshape(x_train, (-1, 784)) + x_test = np.reshape(x_test, (-1, 784)) + self.rank, self.worldsize = tuple(int(num) for num in rank_worldsize.split(',')) + + # Sharding + self.X_train = x_train[self.rank - 1::self.worldsize] + self.y_train = y_train[self.rank - 1::self.worldsize] + self.X_test = x_test[self.rank - 1::self.worldsize] + self.y_test = y_test[self.rank - 1::self.worldsize] + + + # Calculating data and target shapes + sample, target = self[0] + self._sample_shape = [str(dim) for dim in sample.shape] + self._target_shape = [str(dim) for dim in target.shape] + + + def __getitem__(self, index): + """Return an item by the index.""" + if index < len(self.X_train): + return self.X_train[index], self.y_train[index] + index -= len(self.X_train) + return self.X_test[index], self.y_test[index] + + def __len__(self): + """Return the length of the dataset.""" + return len(self.X_train) + len(self.X_test) + + @property + def sample_shape(self): + """Return the sample shape info.""" + return self._sample_shape + + @property + def target_shape(self): + """Return the target shape info.""" + return self._target_shape + + @property + def dataset_description(self) -> str: + """Return the dataset description.""" + return (f'MNIST dataset, shard number {self.rank}' + f' out of {self.worldsize}') + + +if __name__ == '__main__': + from openfl.interface.cli import setup_logging + setup_logging() + + data_folder = 'data' + rank_worldsize = '1,100' + + mnist_sd = MNISTShardDescriptor( 
rank_worldsize=rank_worldsize) + + print(mnist_sd.dataset_description) + print(mnist_sd.sample_shape, mnist_sd.target_shape) + + from openfl.component.envoy.envoy import Envoy + + shard_name = 'one' + director_uri = 'localhost:50051' + + keeper = Envoy( + shard_name=shard_name, + director_uri=director_uri, + shard_descriptor=mnist_sd, + tls=False, + root_ca='./cert/root_ca.crt', + key='./cert/one.key', + cert='./cert/one.crt', + ) + + keeper.start() diff --git a/tests/github/interactive_api_director/experiments/tensorflow_mnist/experiment.py b/tests/github/interactive_api_director/experiments/tensorflow_mnist/experiment.py index b0c30d1d0e..7d06ab4921 100644 --- a/tests/github/interactive_api_director/experiments/tensorflow_mnist/experiment.py +++ b/tests/github/interactive_api_director/experiments/tensorflow_mnist/experiment.py @@ -1,143 +1,127 @@ -import logging -from time import sleep -import getpass - +import time import tensorflow as tf - -from openfl.interface.interactive_api.experiment import ModelInterface, FLExperiment +# Create a federation from openfl.interface.interactive_api.federation import Federation -# from openfl.services.tests.experiment_runner import run_experiment -from tests.github.interactive_api_director.experiment_runner import run_federation -from tests.github.interactive_api_director.experiment_runner import stop_federation -from tests.github.interactive_api_director.experiment_runner import Shard -from tests.github.interactive_api_director.experiment_runner import create_federation -from openfl.transport.grpc.director_client import DirectorClient -from tests.github.interactive_api.experiments.tensorflow_mnist.settings import model, optimizer, X_train, y_train, X_valid, y_valid, batch_size -from tests.github.interactive_api.experiments.tensorflow_mnist.dataset import FedDataset -from tests.github.interactive_api.experiments.tensorflow_mnist.tasks import train, validate, task_interface - -logger = logging.getLogger(__name__) - - -# create federation -col_names = ['one', 'two'] -username = getpass.getuser() -director_path = f'/home/{username}/test/exp_1/director' - -director_host = 'localhost' -director_port = 50051 - -shards = { - f'/home/{username}/test/exp_1/{col_name}': - Shard( - shard_name=col_name, - director_host=director_host, - director_port=director_port, - data_path=f'/home/{username}/test/data/{col_name}' - ) - for col_name in col_names -} - -create_federation(director_path, shards.keys()) - -processes = run_federation(shards, director_path) - -input('Please enter to run first experiment') - -experiment_name = 'tensorflow_mnist' -# Describing FL experiment -framework_adapter = 'openfl.plugins.frameworks_adapters.keras_adapter.FrameworkAdapterPlugin' -model_interface = ModelInterface( - model=model, optimizer=optimizer, framework_plugin=framework_adapter) - -# Register dataset -fed_dataset = FedDataset(X_train, y_train, X_valid, y_valid, batch_size=batch_size) - -# Perform model warm up -# The model warmup is necessary to initialize weights when using Tensorflow Gradient Tape +from openfl.interface.interactive_api.experiment import TaskInterface, DataInterface, ModelInterface, FLExperiment +from tests.github.interactive_api_director.experiments.tensorflow_mnist.dataset import FedDataset +from tests.github.interactive_api_director.experiments.tensorflow_mnist.settings import model +from tests.github.interactive_api_director.experiments.tensorflow_mnist.settings import optimizer +from 
tests.github.interactive_api_director.experiments.tensorflow_mnist.settings import loss_fn +from tests.github.interactive_api_director.experiments.tensorflow_mnist.settings import train_acc_metric +from tests.github.interactive_api_director.experiments.tensorflow_mnist.settings import val_acc_metric +from tests.github.interactive_api_director.experiments.tensorflow_mnist.envoy.shard_descriptor import MNISTShardDescriptor +from copy import deepcopy + + +# Please use the same identifier that was used in the signed certificate +client_id = 'frontend' + +# 1) Run with API layer - Director mTLS +# If the user wants to enable mTLS, they must provide the CA root chain and a signed key pair to the federation interface +# cert_chain = 'cert/root_ca.crt' +# API_certificate = 'cert/frontend.crt' +# API_private_key = 'cert/frontend.key' + +# federation = Federation(client_id='frontend', director_node_fqdn='localhost', director_port='50051', +# cert_chain=cert_chain, api_cert=API_certificate, api_private_key=API_private_key) + +# -------------------------------------------------------------------------------------------------------------------- + +# 2) Run with TLS disabled (trusted environment) +# Federation can also determine the local fqdn automatically +federation = Federation(client_id=client_id, director_node_fqdn='localhost', director_port='50051', tls=False) + +shard_registry = federation.get_shard_registry() +print(shard_registry) +print(federation.target_shape) +fed_dataset = FedDataset(train_bs=4, valid_bs=8) +fed_dataset.shard_descriptor = MNISTShardDescriptor() +for batch in fed_dataset.get_train_loader(): + samples, _ = batch + for sample in samples: + print(sample.shape) -train(model, fed_dataset.get_train_loader(), optimizer, 'cpu', warmup=True) -#Make a copy of the model for later comparison +framework_adapter = 'openfl.plugins.frameworks_adapters.keras_adapter.FrameworkAdapterPlugin' +MI = ModelInterface(model=model, optimizer=optimizer, framework_plugin=framework_adapter) + + +def function_defined_in_notebook(some_parameter): + print(f'Also I accept a parameter and it is {some_parameter}') + + +TI = TaskInterface() +# Task interface currently supports only standalone functions. +@TI.register_fl_task(model='model', data_loader='train_dataset', + device='device', optimizer='optimizer') +def train(model, train_dataset, optimizer, device, loss_fn=loss_fn, warmup=False): + + # Iterate over the batches of the dataset. + for step, (x_batch_train, y_batch_train) in enumerate(train_dataset): + with tf.GradientTape() as tape: + logits = model(x_batch_train, training=True) + loss_value = loss_fn(y_batch_train, logits) + grads = tape.gradient(loss_value, model.trainable_weights) + optimizer.apply_gradients(zip(grads, model.trainable_weights)) + + # Update training metric. + train_acc_metric.update_state(y_batch_train, logits) + + # Log every 200 batches. + if step % 200 == 0: + print( + "Training loss (for one batch) at step %d: %.4f" + % (step, float(loss_value)) + ) + print("Seen so far: %d samples" % ((step + 1) * x_batch_train.shape[0])) + if warmup: + break + + # Display metrics at the end of each epoch. + train_acc = train_acc_metric.result() + print("Training acc over epoch: %.4f" % (float(train_acc),)) + + # Reset training metrics at the end of each epoch + train_acc_metric.reset_states() + + return {'train_acc': train_acc} + + +@TI.register_fl_task(model='model', data_loader='val_dataset', device='device') +def validate(model, val_dataset, device): + # Run a validation loop at the end of each epoch. 
+ for x_batch_val, y_batch_val in val_dataset: + val_logits = model(x_batch_val, training=False) + # Update val metrics + val_acc_metric.update_state(y_batch_val, val_logits) + val_acc = val_acc_metric.result() + val_acc_metric.reset_states() + print("Validation acc: %.4f" % (float(val_acc),)) + + return {'validation_accuracy': val_acc} +# Warm up the model and save its initial state +train(model, fed_dataset.get_train_loader(), optimizer, 'cpu', warmup=True) initial_model = tf.keras.models.clone_model(model) -# Prepare Federated Dataset for Serialization -# tf.data.DataSet does not serialize well with pickle. -# It will be recreated on the collaborators with the delayed init function -fed_dataset.train_dataset = None -fed_dataset.valid_dataset = None - -# Start a federated learning experiment +# The Interactive API supports registering functions defined in the main module or imported. -# Create a federation -# will determine fqdn by itself -federation = Federation(director_node_fqdn='localhost', tls=False) -# Datapath corresonds to 'RANK,WORLD_SIZE' -col_data_paths = { - 'one': '1,2', - 'two': '2,2' -} -# federation.register_collaborators(col_data_paths=col_data_paths) # create an experiment in federation -fl_experiment = FLExperiment(federation=federation) - +experiment_name = 'mnist_test_experiment' +fl_experiment = FLExperiment(federation=federation, experiment_name=experiment_name) # If I use autoreload, I get a pickling error -arch_path = fl_experiment.prepare_workspace_distribution( - model_provider=model_interface, - task_keeper=task_interface, - data_loader=fed_dataset, - rounds_to_train=7, - opt_treatment='CONTINUE_GLOBAL' -) - - -sleep(2) -director_client = DirectorClient( - director_host=director_host, - director_port=director_port -) -resp = director_client.set_new_experiment(experiment_name, col_names, arch_path, - model_interface, fl_experiment) -logger.info(f'Response from director: {resp}') - -# fl_experiment.start_experiment(model_interface) - -# best_model = fl_experiment.get_best_model() -# fed_dataset._delayed_init() -# -# logger.info('Validating initial model') -# validate(initial_model, fed_dataset.get_valid_loader(), 'cpu') -# -# logger.info('Validating trained model') -# validate(best_model, fed_dataset.get_valid_loader(), 'cpu') - -while True: - sleep(1) - -input('Press Enter to run second experiment') - -# Second experiment - -from tests.github.interactive_api_director.experiments.pytorch_kvasir_unet.experiment import ( - arch_path, validate, fed_dataset, fl_experiment, model_interface, initial_model) - -resp = director_client.set_new_experiment('pytorch_kvasir_unet', col_names, arch_path) -logger.info(f'Response from director: {resp}') - -fl_experiment.start_experiment(model_interface) +# The following command zips the workspace and Python requirements to be transferred to the collaborator nodes +fl_experiment.start(model_provider=MI, + task_keeper=TI, + data_loader=fed_dataset, + rounds_to_train=2, + opt_treatment='CONTINUE_GLOBAL') +fl_experiment.stream_metrics() best_model = fl_experiment.get_best_model() -fed_dataset._delayed_init() - -logger.info('Validating initial model') +fl_experiment.remove_experiment_data() validate(initial_model, fed_dataset.get_valid_loader(), 'cpu') -validate(initial_model, fed_dataset.get_valid_loader(), 'cpu') - -logger.info('Validating trained model') validate(best_model, fed_dataset.get_valid_loader(), 'cpu') - -stop_federation(processes) diff --git a/tests/github/interactive_api_director/experiments/tensorflow_mnist/run.sh 
b/tests/github/interactive_api_director/experiments/tensorflow_mnist/run.sh new file mode 100644 index 0000000000..635b111b57 --- /dev/null +++ b/tests/github/interactive_api_director/experiments/tensorflow_mnist/run.sh @@ -0,0 +1,10 @@ +cd director +bash start_director.sh & + +sleep 3 +cd ../envoy +python shard_descriptor.py & + +sleep 2 +cd ../../../../../.. +python -m tests.github.interactive_api_director.experiments.tensorflow_mnist.experiment diff --git a/tests/github/interactive_api_director/experiments/tensorflow_mnist/settings.py b/tests/github/interactive_api_director/experiments/tensorflow_mnist/settings.py index 9e530788d6..1ad7fa5151 100644 --- a/tests/github/interactive_api_director/experiments/tensorflow_mnist/settings.py +++ b/tests/github/interactive_api_director/experiments/tensorflow_mnist/settings.py @@ -16,7 +16,7 @@ optimizer = keras.optimizers.SGD(learning_rate=1e-3) # Instantiate a loss function. loss_fn = keras.losses.SparseCategoricalCrossentropy(from_logits=True) - +model.compile(optimizer, loss_fn) # Prepare the metrics. train_acc_metric = keras.metrics.SparseCategoricalAccuracy() val_acc_metric = keras.metrics.SparseCategoricalAccuracy()
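
The MNISTShardDescriptor added in this patch splits MNIST across envoys purely by index striding (`x[rank - 1::worldsize]`). Below is a minimal, self-contained sketch of that striding outside OpenFL; the helper name `shard` and the toy ten-element array are illustrative assumptions, not part of the patch:

    import numpy as np


    def shard(data, rank, worldsize):
        """Return the slice of `data` owned by 1-based shard `rank` out of `worldsize` shards.

        Mirrors the `x[rank - 1::worldsize]` striding used in the shard descriptor:
        shard 1 takes elements 0, worldsize, 2*worldsize, ...; shard 2 takes 1, worldsize + 1, ...
        """
        return data[rank - 1::worldsize]


    data = np.arange(10)                      # toy dataset of 10 samples
    print(shard(data, rank=1, worldsize=3))   # [0 3 6 9]
    print(shard(data, rank=2, worldsize=3))   # [1 4 7]
    print(shard(data, rank=3, worldsize=3))   # [2 5 8]
    # Every sample lands in exactly one shard, and shard sizes differ by at most one.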
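
The shard descriptor's `__getitem__` exposes the train and test partitions through one flat index space, so indices past the training partition must be shifted by exactly `len(self.X_train)`. A small plain-Python sketch, with toy lists standing in for the real arrays, shows the mapping:

    train = ['t0', 't1', 't2']   # stands in for (X_train, y_train)
    test = ['v0', 'v1']          # stands in for (X_test, y_test)


    def get(index):
        """Flat indexing over train followed by test."""
        if index < len(train):
            return train[index]
        index -= len(train)      # shift by the full train length, no extra -1
        return test[index]


    assert [get(i) for i in range(5)] == ['t0', 't1', 't2', 'v0', 'v1']
    # With `index -= len(train) + 1`, get(3) would wrap to test[-1] ('v1') instead of 'v0'.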
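
The `FedDataset.shard_descriptor` setter in this patch reserves the last tenth of each shard for validation by building two index ranges. A short NumPy sketch, assuming a hypothetical shard of 100 samples, makes the split explicit:

    import numpy as np

    shard_len = 100                            # assumed shard size, for illustration only
    validation_size = shard_len // 10          # last tenth held out for validation
    train_indices = np.arange(shard_len - validation_size)            # 0 .. 89
    val_indices = np.arange(shard_len - validation_size, shard_len)   # 90 .. 99

    assert len(train_indices) == 90 and len(val_indices) == 10
    assert set(train_indices).isdisjoint(val_indices)   # the two splits never overlap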