diff --git a/Dockerfile b/Dockerfile
index 7c7eb94..337ec44 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -2,8 +2,8 @@ FROM centos/python-36-centos7:latest
USER root
RUN ls
-WORKDIR /app
-COPY . /app
+WORKDIR /FL
+COPY . /FL
RUN pip3 install --upgrade pip
RUN pip3 install --upgrade setuptools
@@ -13,4 +13,4 @@ RUN source venv/bin/activate
RUN pip3 install -q -r requirements.txt
RUN pip3 install federated-learning-lib/federated_learning_lib-*-py3-none-any.whl
-CMD ["/bin/bash"]
+CMD ["/bin/bash"]
\ No newline at end of file
diff --git a/Notebooks/images/FL_Network.png b/Notebooks/images/FL_Network.png
index 709c8e9..d656229 100644
Binary files a/Notebooks/images/FL_Network.png and b/Notebooks/images/FL_Network.png differ
diff --git a/Notebooks/images/MnistExamples.png b/Notebooks/images/MnistExamples.png
index 7ddd3b1..af8c966 100644
Binary files a/Notebooks/images/MnistExamples.png and b/Notebooks/images/MnistExamples.png differ
diff --git a/Notebooks/images/adult8P.png b/Notebooks/images/adult8P.png
index 61617aa..627f3cb 100644
Binary files a/Notebooks/images/adult8P.png and b/Notebooks/images/adult8P.png differ
diff --git a/Notebooks/images/arch_aggregator.png b/Notebooks/images/arch_aggregator.png
index 5d548bb..c6114ff 100644
Binary files a/Notebooks/images/arch_aggregator.png and b/Notebooks/images/arch_aggregator.png differ
diff --git a/Notebooks/images/arch_party.png b/Notebooks/images/arch_party.png
index d979249..d7e6d8c 100644
Binary files a/Notebooks/images/arch_party.png and b/Notebooks/images/arch_party.png differ
diff --git a/Notebooks/sklearn_logclassification_rw/sklearn_logclassification_rw_agg.ipynb b/Notebooks/sklearn_logclassification_rw/sklearn_logclassification_rw_agg.ipynb
index 5622728..45ed288 100644
--- a/Notebooks/sklearn_logclassification_rw/sklearn_logclassification_rw_agg.ipynb
+++ b/Notebooks/sklearn_logclassification_rw/sklearn_logclassification_rw_agg.ipynb
@@ -74,17 +74,17 @@
"source": [
"## Fairness Techniques \n",
"\n",
- "We adapt a centralized pre-processing bias mitigation method [Reweighing](https://link.springer.com/article/10.1007/s10115-011-0463-8) into two federated learning techniques, Local Reweighing and Global Reweighing with Differential Privacy.\n",
+ "[Reweighing](https://link.springer.com/article/10.1007/s10115-011-0463-8) is a centralized pre-processing bias mitigation method, which works primarily by attaching weights to samples in the training dataset. This method accesses the entire training dataset and computes weights as the ratio of the expected probability to the observed probability of the sample, calculated based on the sensitive attribute/label pairing in question. We adapt this centralized method into two federated learning techniques, Local Reweighing and Global Reweighing with Differential Privacy.\n",
"\n",
"**Local reweighing**: To fully protect parties' data privacy, each party computes reweighing weights locally based on its own training dataset during pre-processing and then uses the reweighing dataset for its local training. Therefore, parties do not need to communicate with the aggregator or reveal their sensitive attributes and data sample information.\n",
"\n",
"**Global Reweighing with Differential Privacy**: If parties agree to share sensitive attributes and noisy data statistics, parties can employ this fairness method. During the pre-processing phase, the aggregator will collect statistics such as the noisy number of samples with privileged attribute values, compute global reweighing weights based on the collected statistics, and share them with parties. By adjusting the amount of noise injected via epsilon, parties can control their data leakage while still mitigating bias. \n",
"\n",
- "We also adapt an in-processing bias mitigation method into Federated Prejudice Remover.\n",
+ "[Prejudice Remover](https://github.com/algofairness/fairness-comparison/tree/master/fairness/algorithms/kamishima) is an in-processing bias mitigation method 440 proposed for centralized ML, which works by adding a fairness-aware regularizer to the regular logistic loss function. We adapt this centralized method into Federated Prejudice Remover.\n",
"\n",
- "**Federated Prejudice Removal**: Each party applies the [Prejudice Remover algorithm](https://github.com/algofairness/fairness-comparison/tree/master/fairness/algorithms/kamishima) to train a less biased local model, and shares only the model parameters with the aggregator. The aggregator can then employ existing FL algorithms, like simple average and FedAvg, etc., to update the global model.\n",
+ "**Federated Prejudice Removal**: Each party applies the Prejudice Remover algorithm to train a less biased local model, and shares only the model parameters with the aggregator. The aggregator can then employ existing FL algorithms, like simple average and FedAvg, etc., to update the global model.\n",
"\n",
- "Further details about the algorithms and datasets utilized, as well as experimental setup, are included in the paper titled [Mitigating Bias in Federated Learning](https://arxiv.org/abs/2012.02447)."
+ "Further details about the algorithms and datasets utilized, as well as experimental setup, are included in our [paper](https://arxiv.org/abs/2012.02447)."
]
},
{
@@ -958,7 +958,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
- "version": "3.6.10"
+ "version": "3.6.12"
}
},
"nbformat": 4,
diff --git a/Notebooks/sklearn_logclassification_rw/sklearn_logclassification_rw_p0.ipynb b/Notebooks/sklearn_logclassification_rw/sklearn_logclassification_rw_p0.ipynb
index 5af2b32..b9e0863 100644
--- a/Notebooks/sklearn_logclassification_rw/sklearn_logclassification_rw_p0.ipynb
+++ b/Notebooks/sklearn_logclassification_rw/sklearn_logclassification_rw_p0.ipynb
@@ -62,17 +62,17 @@
"source": [
"## Fairness Techniques \n",
"\n",
- "We adapt a centralized pre-processing bias mitigation method [Reweighing](https://link.springer.com/article/10.1007/s10115-011-0463-8) into two federated learning techniques, Local Reweighing and Global Reweighing with Differential Privacy.\n",
+ "[Reweighing](https://link.springer.com/article/10.1007/s10115-011-0463-8) is a centralized pre-processing bias mitigation method, which works primarily by attaching weights to samples in the training dataset. This method accesses the entire training dataset and computes weights as the ratio of the expected probability to the observed probability of the sample, calculated based on the sensitive attribute/label pairing in question. We adapt this centralized method into two federated learning techniques, Local Reweighing and Global Reweighing with Differential Privacy.\n",
"\n",
"**Local reweighing**: To fully protect parties' data privacy, each party computes reweighing weights locally based on its own training dataset during pre-processing and then uses the reweighing dataset for its local training. Therefore, parties do not need to communicate with the aggregator or reveal their sensitive attributes and data sample information.\n",
"\n",
"**Global Reweighing with Differential Privacy**: If parties agree to share sensitive attributes and noisy data statistics, parties can employ this fairness method. During the pre-processing phase, the aggregator will collect statistics such as the noisy number of samples with privileged attribute values, compute global reweighing weights based on the collected statistics, and share them with parties. By adjusting the amount of noise injected via epsilon, parties can control their data leakage while still mitigating bias. \n",
"\n",
- "We also adapt an in-processing bias mitigation method into Federated Prejudice Remover.\n",
+ "[Prejudice Remover](https://github.com/algofairness/fairness-comparison/tree/master/fairness/algorithms/kamishima) is an in-processing bias mitigation method 440 proposed for centralized ML, which works by adding a fairness-aware regularizer to the regular logistic loss function. We adapt this centralized method into Federated Prejudice Remover.\n",
"\n",
- "**Federated Prejudice Removal**: Each party applies the [Prejudice Remover algorithm](https://github.com/algofairness/fairness-comparison/tree/master/fairness/algorithms/kamishima) to train a less biased local model, and shares only the model parameters with the aggregator. The aggregator can then employ existing FL algorithms, like simple average and FedAvg, etc., to update the global model.\n",
+ "**Federated Prejudice Removal**: Each party applies the Prejudice Remover algorithm to train a less biased local model, and shares only the model parameters with the aggregator. The aggregator can then employ existing FL algorithms, like simple average and FedAvg, etc., to update the global model.\n",
"\n",
- "Further details about the algorithms and datasets utilized, as well as experimental setup, are included in the paper titled [Mitigating Bias in Federated Learning](https://arxiv.org/abs/2012.02447)."
+ "Further details about the algorithms and datasets utilized, as well as experimental setup, are included in our [paper](https://arxiv.org/abs/2012.02447)."
]
},
{
@@ -440,7 +440,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
- "version": "3.6.10"
+ "version": "3.6.12"
}
},
"nbformat": 4,
diff --git a/Notebooks/sklearn_logclassification_rw/sklearn_logclassification_rw_p1.ipynb b/Notebooks/sklearn_logclassification_rw/sklearn_logclassification_rw_p1.ipynb
index b693397..8ff2b0c 100644
--- a/Notebooks/sklearn_logclassification_rw/sklearn_logclassification_rw_p1.ipynb
+++ b/Notebooks/sklearn_logclassification_rw/sklearn_logclassification_rw_p1.ipynb
@@ -62,17 +62,17 @@
"source": [
"## Fairness Techniques \n",
"\n",
- "We adapt a centralized pre-processing bias mitigation method [Reweighing](https://link.springer.com/article/10.1007/s10115-011-0463-8) into two federated learning techniques, Local Reweighing and Global Reweighing with Differential Privacy.\n",
+ "[Reweighing](https://link.springer.com/article/10.1007/s10115-011-0463-8) is a centralized pre-processing bias mitigation method, which works primarily by attaching weights to samples in the training dataset. This method accesses the entire training dataset and computes weights as the ratio of the expected probability to the observed probability of the sample, calculated based on the sensitive attribute/label pairing in question. We adapt this centralized method into two federated learning techniques, Local Reweighing and Global Reweighing with Differential Privacy.\n",
"\n",
"**Local reweighing**: To fully protect parties' data privacy, each party computes reweighing weights locally based on its own training dataset during pre-processing and then uses the reweighing dataset for its local training. Therefore, parties do not need to communicate with the aggregator or reveal their sensitive attributes and data sample information.\n",
"\n",
"**Global Reweighing with Differential Privacy**: If parties agree to share sensitive attributes and noisy data statistics, parties can employ this fairness method. During the pre-processing phase, the aggregator will collect statistics such as the noisy number of samples with privileged attribute values, compute global reweighing weights based on the collected statistics, and share them with parties. By adjusting the amount of noise injected via epsilon, parties can control their data leakage while still mitigating bias. \n",
"\n",
- "We also adapt an in-processing bias mitigation method into Federated Prejudice Remover.\n",
+ "[Prejudice Remover](https://github.com/algofairness/fairness-comparison/tree/master/fairness/algorithms/kamishima) is an in-processing bias mitigation method 440 proposed for centralized ML, which works by adding a fairness-aware regularizer to the regular logistic loss function. We adapt this centralized method into Federated Prejudice Remover.\n",
"\n",
- "**Federated Prejudice Removal**: Each party applies the [Prejudice Remover algorithm](https://github.com/algofairness/fairness-comparison/tree/master/fairness/algorithms/kamishima) to train a less biased local model, and shares only the model parameters with the aggregator. The aggregator can then employ existing FL algorithms, like simple average and FedAvg, etc., to update the global model.\n",
+ "**Federated Prejudice Removal**: Each party applies the Prejudice Remover algorithm to train a less biased local model, and shares only the model parameters with the aggregator. The aggregator can then employ existing FL algorithms, like simple average and FedAvg, etc., to update the global model.\n",
"\n",
- "Further details about the algorithms and datasets utilized, as well as experimental setup, are included in the paper titled [Mitigating Bias in Federated Learning](https://arxiv.org/abs/2012.02447)."
+ "Further details about the algorithms and datasets utilized, as well as experimental setup, are included in our [paper](https://arxiv.org/abs/2012.02447)."
]
},
{
@@ -435,7 +435,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
- "version": "3.6.10"
+ "version": "3.6.12"
}
},
"nbformat": 4,
diff --git a/README.md b/README.md
index dc8e3ed..2177693 100644
--- a/README.md
+++ b/README.md
@@ -36,8 +36,9 @@ For a particular ML model, you can select multiple types of fusion algorithms:
| | Coordinate-wise median [Yin et al.](https://arxiv.org/pdf/1803.01498.pdf) |
| | Zeno [Xie et al.](https://arxiv.org/abs/1805.10032) |
| | SPAHM [Yurochkin et al.](https://arxiv.org/abs/1911.00218) |
-| | Fed+ [Yu et al.](https://arxiv.org/abs/2009.06303) |
-| | FedProx: [Tian Li et al.](https://arxiv.org/pdf/1812.06127.pdf). |
+| | Fed+ [Yu et al.](https://arxiv.org/abs/2009.06303) |
+| | FedProx: [Tian Li et al.](https://arxiv.org/pdf/1812.06127.pdf) |
+| | Shuffle Iterative Average [Cheng et al.](https://arxiv.org/pdf/2105.09400.pdf)|
| ID3 Decision Tree | ID3 fusion [Quinlan](https://link.springer.com/article/10.1007/BF00116251) |
| Reinforcement Learning RLLib models | Iterative Average |
| | FedAvg [McMahan et al.](https://arxiv.org/pdf/1602.05629.pdf) |
@@ -53,8 +54,9 @@ We also support the following fairness techniques that help to mitigate bias in
| Global Reweighing with Differential Privacy [Abay et al.](https://arxiv.org/abs/2012.02447)| Pre-processing | All ML models |
| Federated Prejudice Removal [Abay et al.](https://arxiv.org/abs/2012.02447) | In-processing | Logistic Regression |
-In order to aid orchestration of Federated Learning experiments using the IBMFL library, we also provide a Jupyter Notebook based UI interface, [Experiment Manager Dashboard](runner/exp_manager/Experiment_Manager_dashboard.ipynb) where users can choose the model, fusion algorithm, number of parties and other (hyper) parameters for a run. This orchestration can be done on the machine where the notebook is hosted, i.e., locally or even across remote machines. As of now, only limited models and datasets are supported, but more will be added in the near future.
-The dashboard uses a [runner](runner/) module, and there's a [usage guide](runner/exp_manager/usage_guide.md) on how to go about using the dashboard.
+In order to aid orchestration of Federated Learning experiments using the IBMFL library, we also provide a Jupyter Notebook-based UI, the [Experiment Manager Dashboard](experiment_manager/Experiment_Manager_dashboard.ipynb), where users can choose the model, fusion algorithm, number of parties, and other (hyper)parameters for a run. This orchestration can be done on the machine where the notebook is hosted, i.e., locally, or even across remote machines. A usage guide for the dashboard can be found [here](experiment_manager/usage_guide.md).
+
+The IBMFL Multi-Cloud and Hybrid Cloud Orchestrator automates the deployment and monitoring of aggregator and party processes, using the federated learning library Docker image, on OpenShift clusters set up in different cloud data center regions. For more information on how to use the OpenShift Orchestrator, please refer to the [README](openshift_fl/README.md).
## How to get started?
@@ -64,6 +66,10 @@ Try the [set-up guide](setup.md) for a single-node federated learning setup.
There are a number of [examples](examples/README.md) with explanation for different federated learning tasks with different model types to get started with.
+Try our experiment manager [here](experiment_manager).
+
+Try IBM FL with OpenShift [here](openshift_fl).
+
## How does it work?
There is a [docs folder](./docs) with tutorials and API documentation to learn how to use and extend IBM federated learning.
@@ -78,7 +84,7 @@ There is a [docs folder](./docs) with tutorials and API documentation to learn h
We appreciate feedback and questions. Please post issues when you encounter them.
-We have set up a Slack channel for ongoing discussion. Join the IBM federated learning workspace: https://ibm-fl.slack.com/
+We have set up a Slack channel for ongoing discussion. Join the IBM federated learning workspace: https://ibm-fl.slack.com/. If the previous link does not work for you, you can also use [this invitation link](https://join.slack.com/t/ibm-fl/shared_invite/zt-ff0k1xgh-IL9Aq6sW6rNny9gDdnEttQ).
## Citing IBM Federated Learning
diff --git a/examples/README.md b/examples/README.md
index e90f864..de1da6c 100644
--- a/examples/README.md
+++ b/examples/README.md
@@ -3,10 +3,13 @@
### Training Keras, TensorFlow 2.1 and PyTorch models with different fusion algorithms:
* [Running federated averaging (FedAvg)](fedavg)
* [Simple average](iter_avg)
-* [FedPlus with Keras](fedplus)
+* [Shuffle iterative average](shuffle_iter_avg)
+* [FedAvgPlus with Tensorflow and PyTorch](fedavgplus)
* [Gradient aggregation](gradient_aggregation)
* [PFNM with Keras](pfnm)
-* [Coordinate median with Keras](coordinate_median)
+* [Coordinate median](coordinate_median)
+* [Coordinate median plus](coordinate_median_plus)
+* [Geometric median plus](geometric_median_plus)
* [Krum with Keras](krum)
* [Zeno with Keras](zeno)
diff --git a/examples/constants.py b/examples/constants.py
index bfe41ce..a1d957d 100644
--- a/examples/constants.py
+++ b/examples/constants.py
@@ -2,7 +2,7 @@
DEFAULT_SERVER = 'default'
# Examples helper descriptions
-GENERATE_DATA_DESC = "generates data for running IBM FL examples"
+GENERATE_DATA_DESC = "generates data for running FL examples"
NUM_PARTIES_DESC = "the number of parties to split the data into"
DATASET_DESC = "which data set to use"
PATH_DESC = "directory to save the data"
@@ -19,18 +19,22 @@
MODEL_CONFIG_DESC = "which model to use for fusion example"
TASK_NAME_DESC = "task name, specified when using RabbitMQ connection"
-EXAMPLES_WARNING = "WARNING:: Usage of -m keras_classifier option is deprecated and replaced with -m keras -f iter_avg. Ref https://github.com/IBM/federated-learning-lib/blob/main/setup.md for more information"
-CONNECTION_TYPE_DESC = "type of connection to use; supported types are flask, rabbitmq and websockets"
+CONNECTION_TYPE_DESC = "type of connection to use; supported types are flask and rabbitmq"
+
+CONTEXT_PATH = "context directory to import the generate script from different folders other that examples"
# Integration
FL_DATASETS = ["default", "mnist", "nursery", "adult", "federated-clustering", "compas", "german",
- "higgs", "airline", "diabetes", "binovf", "multovf", "linovf", "femnist", "cifar10"]
-
+ "higgs", "airline", "diabetes", "binovf", "multovf", "linovf", "femnist", "cifar10", "custom_dataset"]
+
FL_EXAMPLES = ["iter_avg", "fedavg", "coordinate_median", "gradient_aggregation", "krum", "pfnm",
- "zeno", "fedprox", "fedplus", "differential_privacy_sgd", "rl_cartpole",
- "rl_pendulum", "sklearn_logclassification_rw", "spahm", "id3_dt", "prej_remover",
- "sklearn_logclassification_globalrw", "naive_bayes_dp"]
-
-FL_MODELS = ["keras", "pytorch", "tf", "sklearn", "None", "keras_classifier"]
+ "zeno", "fedprox", "fedavgplus",
+ "differential_privacy_sgd",
+ "rl_cartpole", "rl_pendulum", "sklearn_logclassification_rw", "spahm",
+ "sklearn_logclassification_globalrw", "naive_bayes_dp", "id3_dt", "prej_remover", "iter_avg_openshift", "shuffle_iter_avg",
+ "coordinate_median_plus", "geometric_median_plus"]
+FL_MODELS = ["keras", "pytorch", "tf", "sklearn", "None"]
FL_CONN_TYPES = ["flask", "rabbitmq"]
+
+FL_CONTEXT = {'openshift': 'openshift_fl.examples'}
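The new `FL_CONTEXT` mapping is consumed by `examples/generate_configs.py` later in this diff: when `--context_path openshift` is passed, the fusion's `generate_configs` module is imported from `openshift_fl.examples` instead of `examples`. Below is a small, self-contained illustration of that lookup, mirroring the logic added to `generate_configs.py`; the example values in the comment are hypothetical.

```python
FL_CONTEXT = {'openshift': 'openshift_fl.examples'}

def resolve_fusion_module(fusion, context_path=None):
    """Return the dotted module path of the generate_configs script for a fusion example."""
    if context_path is not None and context_path != 'None':
        context = FL_CONTEXT.get(context_path)
        if context is None:
            raise ValueError('Context path - {} is not correct'.format(context_path))
        return '{}.{}.generate_configs'.format(context, fusion)
    return 'examples.{}.generate_configs'.format(fusion)

# e.g. resolve_fusion_module('iter_avg_openshift', 'openshift')
# -> 'openshift_fl.examples.iter_avg_openshift.generate_configs',
# which generate_configs.py then loads via importlib.import_module(...)
```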
diff --git a/examples/coordinate_median/generate_configs.py b/examples/coordinate_median/generate_configs.py
index ac3a1fb..df49680 100644
--- a/examples/coordinate_median/generate_configs.py
+++ b/examples/coordinate_median/generate_configs.py
@@ -13,7 +13,7 @@ def get_fusion_config():
return fusion
-def get_local_training_config():
+def get_local_training_config(configs_folder=None):
local_training_handler = {
'name': 'LocalTrainingHandler',
'path': 'ibmfl.party.training.local_training_handler'
@@ -42,7 +42,7 @@ def get_hyperparams(model):
def get_data_handler_config(party_id, dataset, folder_data, is_agg=False, model='keras'):
- SUPPORTED_DATASETS = ['mnist', 'adult']
+ SUPPORTED_DATASETS = ['mnist', 'custom_dataset']
if dataset in SUPPORTED_DATASETS:
if model not in 'keras':
dataset = dataset + "_" + model
diff --git a/examples/coordinate_median/model_keras.py b/examples/coordinate_median/model_keras.py
index 7a40478..ae25c9a 100644
--- a/examples/coordinate_median/model_keras.py
+++ b/examples/coordinate_median/model_keras.py
@@ -10,9 +10,6 @@ def get_hyperparams():
local_params = {
'training': {
'epochs': 3
- },
- 'optimizer': {
- 'lr': 0.01
}
}
diff --git a/examples/coordinate_median_plus/README.md b/examples/coordinate_median_plus/README.md
new file mode 100644
index 0000000..41dfb2e
--- /dev/null
+++ b/examples/coordinate_median_plus/README.md
@@ -0,0 +1,63 @@
+# Running Coordinate Median Plus (coordinate_median_plus) with TensorFlow
+
+**Coordinate Median Plus is one variation of Fed+ fusion algorithms proposed here: [Fed+: A Unified Approach to Robust Personalized Federated Learning](https://arxiv.org/pdf/2009.06303.pdf)**
+
+
+More variations of Fed+ can be found at:
+
+1. [Fedavg_plus](../fedavgplus)
+2. [Geometric_median_plus](../geometric_median_plus)
+
+This example explains how to run the coordinate median plus algorithm on CNNs implemented with TensorFlow, training on
+[MNIST](http://yann.lecun.com/exdb/mnist/) data. Data in this example is preprocessed by scaling the pixel values down from the range `[0, 255]` to `[0, 1]`.
+No other preprocessing is performed.
+
+## Setup FL
+
+- Split data by running:
+
+ ```
+ python examples/generate_data.py -n <num_parties> -d mnist -pp <points_per_party>
+ ```
+For example, to generate sample data on the MNIST dataset, you could run:
+```
+python examples/generate_data.py -n 2 -d mnist -pp 200
+```
+
+Run `python examples/generate_data.py -h` for full descriptions
+of the different options.
+
+- Generate config files by running:
+ ```
+ python examples/generate_configs.py -n <num_parties> -f coordinate_median_plus -m tf -d <dataset> -p <party_data_path>
+ ```
+
+To run FL, you must have configuration files for the aggregator and for each party.
+
+You can generate these config files using the `generate_configs.py` script.
+
+For example, you could run:
+
+```
+python examples/generate_configs.py -f coordinate_median_plus -m tf -n 2 -d mnist -p examples/data/mnist/random
+```
+
+This command would generate the configs for the `tf_classifier_mnist` model, assuming 2 parties.
+You must also specify the party data path.
+
+Run `python examples/generate_configs.py -h` for full descriptions of the different options.
+
+- In a terminal running an activated IBM FL environment
+(refer to Quickstart on our website to learn more about how to set up the running environment), start the aggregator by running:
+ ```
+ python -m ibmfl.aggregator.aggregator
+ ```
+ Type `START` and press enter to start accepting connections
+- In a terminal running an activated IBM FL environment, start each party by running:
+ ```
+ python -m ibmfl.party.party
+ ```
+ Type `START` and press enter to start accepting connections.
+
+ Type `REGISTER` and press enter to register the party with the aggregator.
+- Finally, start training by entering `TRAIN` in the aggregator terminal.
\ No newline at end of file
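The Fed+ family this README describes changes the local update rather than only the fusion rule: each party is pulled toward the robust aggregate (here, the coordinate-wise median) by a proximal penalty instead of simply adopting it. The sketch below is a simplified, illustrative rendering of that idea in plain NumPy, assuming a penalty of the form `(rho / 2) * ||w - w_agg||^2`; the exact formulation, and the precise roles of the `alpha` and `rho` values set in this example's `generate_configs.py`, are given in the Fed+ paper linked above.

```python
import numpy as np

def coordinate_median_aggregate(party_weights):
    """Robust aggregate: coordinate-wise median of the party weight vectors."""
    return np.median(np.stack(party_weights), axis=0)

def fedplus_local_step(w, grad_fn, w_agg, lr, rho):
    """One gradient step on local_loss(w) + (rho / 2) * ||w - w_agg||^2 (illustrative form)."""
    return w - lr * (grad_fn(w) + rho * (w - w_agg))

# Toy run with 3 parties, each holding a quadratic local loss 0.5 * ||w - target||^2;
# the third party's target is an outlier.
targets = [np.array([1.0, 0.0]), np.array([0.8, 0.2]), np.array([5.0, 5.0])]
parties = [np.zeros(2) for _ in targets]
for _ in range(200):
    w_agg = coordinate_median_aggregate(parties)
    parties = [fedplus_local_step(w, lambda v, t=t: v - t, w_agg, lr=0.1, rho=1.0)
               for w, t in zip(parties, targets)]
print(coordinate_median_aggregate(parties))  # ~[1.0, 0.2]: close to the inlier targets, essentially ignoring the outlier
```

Each party also keeps its own (personalized) weights, so the outlier party is not forced onto the global solution; that trade-off between local fit and agreement with the aggregate is what `rho` controls.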
diff --git a/examples/coordinate_median_plus/generate_configs.py b/examples/coordinate_median_plus/generate_configs.py
new file mode 100644
index 0000000..28f1cee
--- /dev/null
+++ b/examples/coordinate_median_plus/generate_configs.py
@@ -0,0 +1,111 @@
+import os
+
+import tensorflow as tf
+from tensorflow.keras.layers import Dense, Flatten, Conv2D
+from tensorflow.keras import Model
+
+import examples.datahandlers as datahandlers
+
+
+def get_fusion_config():
+ fusion = {
+ 'name': 'CoordinateMedianFedplusFusionHandler',
+ 'path': 'ibmfl.aggregator.fusion.coordinate_median_fedplus_fusion_handler'
+ }
+ return fusion
+
+
+def get_local_training_config(configs_folder=None):
+ local_training_handler = {
+ 'name': 'CoordinateMedianFedPlusLocalTrainingHandler',
+ 'path': 'ibmfl.party.training.coordinate_median_fedplus_local_training_handler'
+ }
+ local_training_handler['info'] = {
+ 'alpha': 0.01,
+ 'rho': 10
+ }
+ return local_training_handler
+
+
+def get_hyperparams(model='keras'):
+ hyperparams = {
+ 'global': {
+ 'rounds': 3,
+ 'termination_accuracy': 0.83,
+ 'max_timeout': 600,
+ 'rho': 10
+ },
+ 'local': {
+ 'training': {
+ 'epochs': 10,
+ 'batch_size': 10
+ },
+ 'optimizer': {
+ 'lr': 0.0003
+ }
+ }
+ }
+
+ return hyperparams
+
+
+def get_data_handler_config(party_id, dataset, folder_data, is_agg=False, model='tf'):
+ SUPPORTED_DATASETS = ['mnist', 'custom_dataset']
+ if dataset in SUPPORTED_DATASETS:
+ dataset = dataset + "_" + model
+
+ data = datahandlers.get_datahandler_config(
+ dataset, folder_data, party_id, is_agg)
+ else:
+ raise Exception(
+ "The dataset {} is a wrong combination for fusion/model".format(dataset))
+ return data
+
+
+def get_model_config(folder_configs, dataset, is_agg=False, party_id=0, model='tf'):
+ if is_agg:
+ return None
+
+ if model is None or model == 'default':
+ model = 'tf'
+
+ class MyModel(Model):
+ def __init__(self):
+ super(MyModel, self).__init__()
+ self.conv1 = Conv2D(32, 3, activation='relu')
+ self.flatten = Flatten()
+ self.d1 = Dense(128, activation='relu')
+ self.d2 = Dense(10)
+
+ def call(self, x):
+ x = self.conv1(x)
+ x = self.flatten(x)
+ x = self.d1(x)
+ return self.d2(x)
+
+ # Create an instance of the model
+ model = MyModel()
+ loss_object = tf.keras.losses.SparseCategoricalCrossentropy(
+ from_logits=True)
+ optimizer = tf.keras.optimizers.Adam()
+ acc = tf.keras.metrics.SparseCategoricalAccuracy(name='accuracy')
+ model.compile(optimizer=optimizer, loss=loss_object, metrics=[acc])
+ img_rows, img_cols = 28, 28
+ input_shape = (None, img_rows, img_cols, 1)
+ model.compute_output_shape(input_shape=input_shape)
+
+ if not os.path.exists(folder_configs):
+ os.makedirs(folder_configs)
+
+ model.save(folder_configs)
+
+ spec = {'model_name': 'tf-cnn',
+ 'model_definition': folder_configs}
+
+ model = {
+ 'name': 'TensorFlowFLModel',
+ 'path': 'ibmfl.model.tensorflow_fl_model',
+ 'spec': spec
+ }
+
+ return model
diff --git a/examples/datahandlers.py b/examples/datahandlers.py
index e0cccde..e77a9e0 100644
--- a/examples/datahandlers.py
+++ b/examples/datahandlers.py
@@ -11,7 +11,18 @@ def get_datahandler_config(dh_name, folder_data, party_id, is_agg):
for folder in staging_dir_parts:
staging_dir = os.path.join(staging_dir, folder)
- if dh_name == 'mnist':
+ if dh_name in ('custom_dataset', 'custom_dataset_pytorch', 'custom_dataset_tf', 'custom_dataset_sklearn'):
+ data = {
+ 'name': 'MyDataHandler', # the datahandler class provided at runtime
+ 'path': 'custom_data_handler.py',
+ 'info': {
+ }
+ }
+ if is_agg:
+ return None
+
+ elif dh_name == 'mnist':
data = {
'name': 'MnistKerasDataHandler',
'path': 'ibmfl.util.data_handlers.mnist_keras_data_handler',
@@ -110,14 +121,7 @@ def get_datahandler_config(dh_name, folder_data, party_id, is_agg):
}
}
if is_agg:
- if os.path.exists(os.path.join(staging_dir, "datasets", "adult.data")):
- data['info'] = {
- 'txt_file': os.path.join(staging_dir, "datasets", "adult.data")
- }
- else:
- data['info'] = {
- 'txt_file': os.path.join("examples", "datasets", "adult.data")
- }
+ data['info'] = {}
elif dh_name == 'adult_pr':
data = {
@@ -146,14 +150,15 @@ def get_datahandler_config(dh_name, folder_data, party_id, is_agg):
}
}
if is_agg:
- if os.path.exists(os.path.join(staging_dir, "datasets", "adult.data")):
- data['info'] = {
- 'txt_file': os.path.join(staging_dir, "datasets", "adult.data")
- }
- else:
- data['info'] = {
- 'txt_file': os.path.join("examples", "datasets", "adult.data")
- }
+ return None
elif dh_name == 'adult_sklearn_grw':
data = {
@@ -165,14 +170,7 @@ def get_datahandler_config(dh_name, folder_data, party_id, is_agg):
}
}
if is_agg:
- if os.path.exists(os.path.join(staging_dir, "datasets", "adult.data")):
- data['info'] = {
- 'txt_file': os.path.join(staging_dir, "datasets", "adult.data")
- }
- else:
- data['info'] = {
- 'txt_file': os.path.join("examples", "datasets", "adult.data")
- }
+ return None
elif dh_name == 'nursery':
data = {
@@ -306,13 +304,49 @@ def get_datahandler_config(dh_name, folder_data, party_id, is_agg):
}
}
if is_agg:
- if os.path.exists(os.path.join(staging_dir, "datasets", "cifar10", "all_data")):
+ if os.path.exists(os.path.join(staging_dir, "datasets", "cifar10.npz")):
+ data['info'] = {
+ 'npz_file': os.path.join(staging_dir, "datasets", "cifar10.npz")
+ }
+ else:
+ data['info'] = {
+ 'data_folder': os.path.join("examples", "datasets", "cifar10.npz")
+ }
+
+ elif dh_name == 'cifar10_pytorch':
+ data = {
+ 'name': 'Cifar10PytorchDataHandler',
+ 'path': 'ibmfl.util.data_handlers.cifar10_pytorch_data_handler',
+ 'info': {
+ 'npz_file': os.path.join(folder_data, 'data_party' + str(party_id) + '.npz')
+ }
+ }
+ if is_agg:
+ if os.path.exists(os.path.join(staging_dir, "datasets", "cifar10.npz")):
+ data['info'] = {
+ 'data_folder': os.path.join(staging_dir, "datasets", "cifar10.npz")
+ }
+ else:
+ data['info'] = {
+ 'data_folder': os.path.join("examples", "datasets", "cifar10.npz")
+ }
+
+ elif dh_name == 'cifar10_tf':
+ data = {
+ 'name': 'Cifar10TFDataHandler',
+ 'path': 'ibmfl.util.data_handlers.cifar10_keras_data_handler',
+ 'info': {
+ 'npz_file': os.path.join(folder_data, 'data_party' + str(party_id) + '.npz')
+ }
+ }
+ if is_agg:
+ if os.path.exists(os.path.join(staging_dir, "datasets", "cifar10.npz")):
data['info'] = {
- 'data_folder': os.path.join(staging_dir, "datasets", "cifar10", "all_data")
+ 'data_folder': os.path.join(staging_dir, "datasets", "cifar10.npz")
}
else:
data['info'] = {
- 'data_folder': os.path.join("examples", "datasets", "cifar10", "all_data")
+ 'data_folder': os.path.join("examples", "datasets", "cifar10.npz")
}
elif dh_name == 'compas_sklearn':
@@ -324,13 +358,13 @@ def get_datahandler_config(dh_name, folder_data, party_id, is_agg):
}
}
if is_agg:
- if os.path.exists(os.path.join(staging_dir, "datasets", "compas")):
+ if os.path.exists(os.path.join(staging_dir, "datasets", "compas-scores-two-years.csv")):
data['info'] = {
- 'txt_file': os.path.join(staging_dir, "datasets", "compas")
+ 'txt_file': os.path.join(staging_dir, "datasets", "compas-scores-two-years.csv")
}
else:
data['info'] = {
- 'txt_file': os.path.join("examples", "datasets", "compas")
+ 'txt_file': os.path.join("examples", "datasets", "compas-scores-two-years.csv")
}
elif dh_name == 'compas_pr':
@@ -342,13 +376,13 @@ def get_datahandler_config(dh_name, folder_data, party_id, is_agg):
}
}
if is_agg:
- if os.path.exists(os.path.join(staging_dir, "datasets", "compas")):
+ if os.path.exists(os.path.join(staging_dir, "datasets", "compas-scores-two-years.csv")):
data['info'] = {
- 'txt_file': os.path.join(staging_dir, "datasets", "compas")
+ 'txt_file': os.path.join(staging_dir, "datasets", "compas-scores-two-years.csv")
}
else:
data['info'] = {
- 'txt_file': os.path.join("examples", "datasets", "compas")
+ 'txt_file': os.path.join("examples", "datasets", "compas-scores-two-years.csv")
}
elif dh_name == 'compas_sklearn_grw':
@@ -361,13 +395,13 @@ def get_datahandler_config(dh_name, folder_data, party_id, is_agg):
}
}
if is_agg:
- if os.path.exists(os.path.join(staging_dir, "datasets", "compas")):
+ if os.path.exists(os.path.join(staging_dir, "datasets", "compas-scores-two-years.csv")):
data['info'] = {
- 'txt_file': os.path.join(staging_dir, "datasets", "compas")
+ 'txt_file': os.path.join(staging_dir, "datasets", "compas-scores-two-years.csv")
}
else:
data['info'] = {
- 'txt_file': os.path.join("examples", "datasets", "compas")
+ 'txt_file': os.path.join("examples", "datasets", "compas-scores-two-years.csv")
}
elif dh_name == 'german_sklearn':
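The new `custom_dataset` branch in `datahandlers.py` points at a `MyDataHandler` class in `custom_data_handler.py` that the user supplies at runtime; that file is not part of this diff. The sketch below shows roughly what such a handler might look like. It assumes the usual IBM FL convention of subclassing a `DataHandler` base class and returning `((x_train, y_train), (x_test, y_test))` from `get_data()`; treat the import path, the constructor signature, and the hard-coded fallback file name as assumptions to verify against the library documentation, since the generated config leaves the handler's `info` block empty.

```python
# custom_data_handler.py -- hypothetical sketch, not part of this diff.
# Assumption: IBM FL exposes a DataHandler base class at this path; verify before use.
import numpy as np
from ibmfl.data.data_handler import DataHandler


class MyDataHandler(DataHandler):
    def __init__(self, data_config=None):
        super().__init__()
        # The generated config passes an empty 'info' dict for custom_dataset, so this sketch
        # falls back to a hard-coded per-party CSV such as the ones written by save_party_data()
        # in examples/generate_data.py (the path below is illustrative).
        self.file_name = (data_config or {}).get('txt_file', 'examples/data/custom/data_party0.csv')

    def get_data(self):
        """Return ((x_train, y_train), (x_test, y_test)) from a CSV with the label in the last column."""
        data = np.genfromtxt(self.file_name, delimiter=',')
        x, y = data[:, :-1], data[:, -1].astype(int)
        split = int(0.8 * len(x))  # simple 80/20 train/test split
        return (x[:split], y[:split]), (x[split:], y[split:])
```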
diff --git a/examples/fedavg/generate_configs.py b/examples/fedavg/generate_configs.py
index 88d5f4e..91d7831 100644
--- a/examples/fedavg/generate_configs.py
+++ b/examples/fedavg/generate_configs.py
@@ -13,7 +13,7 @@ def get_fusion_config():
return fusion
-def get_local_training_config():
+def get_local_training_config(configs_folder=None):
local_training_handler = {
'name': 'FedAvgLocalTrainingHandler',
'path': 'ibmfl.party.training.fedavg_local_training_handler'
@@ -42,7 +42,7 @@ def get_hyperparams(model):
def get_data_handler_config(party_id, dataset, folder_data, is_agg=False, model='keras'):
- SUPPORTED_DATASETS = ['mnist']
+ SUPPORTED_DATASETS = ['mnist', 'custom_dataset']
if dataset in SUPPORTED_DATASETS:
if model not in 'keras':
dataset = dataset + "_" + model
diff --git a/examples/fedavg/model_keras.py b/examples/fedavg/model_keras.py
index 7a40478..ae25c9a 100644
--- a/examples/fedavg/model_keras.py
+++ b/examples/fedavg/model_keras.py
@@ -10,9 +10,6 @@ def get_hyperparams():
local_params = {
'training': {
'epochs': 3
- },
- 'optimizer': {
- 'lr': 0.01
}
}
diff --git a/examples/fedavgplus/README.md b/examples/fedavgplus/README.md
new file mode 100644
index 0000000..adbb91a
--- /dev/null
+++ b/examples/fedavgplus/README.md
@@ -0,0 +1,72 @@
+# Running Federated Averaging Plus (fedavgplus) with TensorFlow and PyTorch
+
+**FedAvg Plus is one variation of Fed+ fusion algorithms proposed here: [Fed+: A Unified Approach to Robust Personalized Federated Learning](https://arxiv.org/pdf/2009.06303.pdf)**
+
+More variations of Fed+ can be found at:
+
+1. [Coordinate_median_plus](../coordinate_median_plus)
+2. [Geometric_median_plus](../geometric_median_plus)
+
+
+This example explains how to run the FedAvg Plus algorithm on CNNs implemented with TensorFlow, training on [MNIST](http://yann.lecun.com/exdb/mnist/) data. Data in this example is preprocessed by scaling the pixel values down from the range [0, 255] to [0, 1]. No other preprocessing is performed.
+
+## Model Setup
+
+This experiment can be run using models with different underlying frameworks. Configs for PyTorch or
+TensorFlow/Keras (TF 2.1) based models can be generated by changing the -m parameter.
+
+
+| Model Type | Params |
+|:--------------------------:|:--------: |
+| PyTorch | pytorch |
+| TensorFlow/Keras (TF 2.1) | tf |
+
+## Setup FL
+
+
+- Split data by running:
+
+ ```
+ python examples/generate_data.py -n <num_parties> -d mnist -pp <points_per_party>
+ ```
+For example, to generate sample data on the MNIST dataset, you could run:
+```
+python examples/generate_data.py -n 2 -d mnist -pp 200
+```
+
+Run `python examples/generate_data.py -h` for full descriptions
+of the different options.
+
+- Generate config files by running:
+ ```
+ python examples/generate_configs.py -n <num_parties> -f fedavgplus -m tf -d <dataset> -p <party_data_path>
+ ```
+To run FL, you must have configuration files for the aggregator and for each party.
+
+You can generate these config files using the `generate_configs.py` script.
+
+For example, you could run:
+
+```
+python examples/generate_configs.py -f fedavgplus -m tf -n 2 -d mnist -p examples/data/mnist/random
+```
+
+This command would generate the configs for the `tf_classifier_mnist` model, assuming 2 parties.
+You must also specify the party data path.
+
+Run `python examples/generate_configs.py -h` for full descriptions of the different options.
+
+- In a terminal running an activated IBM FL environment
+(refer to Quickstart on our website to learn more about how to set up the running environment), start the aggregator by running:
+ ```
+ python -m ibmfl.aggregator.aggregator
+ ```
+ Type `START` and press enter to start accepting connections
+- In a terminal running an activated IBM FL environment, start each party by running:
+ ```
+ python -m ibmfl.party.party
+ ```
+ Type `START` and press enter to start accepting connections.
+
+ Type `REGISTER` and press enter to register the party with the aggregator.
+- Finally, start training by entering `TRAIN` in the aggregator terminal.
\ No newline at end of file
diff --git a/examples/fedavgplus/generate_configs.py b/examples/fedavgplus/generate_configs.py
new file mode 100644
index 0000000..5ba76f7
--- /dev/null
+++ b/examples/fedavgplus/generate_configs.py
@@ -0,0 +1,68 @@
+from importlib import import_module
+import examples.datahandlers as datahandlers
+
+
+def get_fusion_config():
+ fusion = {
+ 'name': 'FedAvgFusionHandler',
+ 'path': 'ibmfl.aggregator.fusion.fedavg_fusion_handler'
+ }
+ return fusion
+
+
+def get_local_training_config(configs_folder=None):
+ local_training_handler = {
+ 'name': 'FedAvgFedPlusLocalTrainingHandler',
+ 'path': 'ibmfl.party.training.fedavg_fedplus_local_training_handler'
+ }
+ local_training_handler['info'] = {
+ 'alpha': 0.01,
+ 'rho': 1000
+ }
+ return local_training_handler
+
+
+def get_hyperparams(model):
+ hyperparams = {
+ 'global': {
+ 'rounds': 3,
+ 'termination_accuracy': 0.83,
+ 'max_timeout': 600,
+ 'rho': 1000
+ }
+ }
+ current_module = globals().get('__package__')
+
+ model_module = import_module('{}.model_{}'.format(current_module, model))
+ local_params_method = getattr(model_module, 'get_hyperparams')
+
+ local_params = local_params_method()
+ hyperparams['local'] = local_params
+ return hyperparams
+
+
+def get_data_handler_config(party_id, dataset, folder_data, is_agg=False, model='tf'):
+ SUPPORTED_DATASETS = ['mnist', 'custom_dataset', 'cifar10']
+ if dataset in SUPPORTED_DATASETS:
+ dataset = dataset + "_" + model
+
+ data = datahandlers.get_datahandler_config(
+ dataset, folder_data, party_id, is_agg)
+ else:
+ raise Exception(
+ "The dataset {} is a wrong combination for fusion/model".format(dataset))
+ return data
+
+
+def get_model_config(folder_configs, dataset, is_agg=False, party_id=0, model='tf'):
+ SUPPORTED_MODELS = ['pytorch', 'tf']
+
+ if model not in SUPPORTED_MODELS:
+ raise Exception("Invalid model config for this fusion algorithm")
+
+ current_module = globals().get('__package__')
+
+ model_module = import_module('{}.model_{}'.format(current_module, model))
+ method = getattr(model_module, 'get_model_config')
+
+ return method(folder_configs, dataset, is_agg=is_agg, party_id=0)
diff --git a/examples/fedavgplus/model_pytorch.py b/examples/fedavgplus/model_pytorch.py
new file mode 100644
index 0000000..28ddcbb
--- /dev/null
+++ b/examples/fedavgplus/model_pytorch.py
@@ -0,0 +1,50 @@
+import os
+import torch
+from torch import nn
+
+
+def get_hyperparams():
+ local_params = {
+ 'training': {
+ 'epochs': 10,
+ 'lr': 0.001,
+ 'batch_size': 32
+ },
+ 'optimizer': 'optim.Adam'
+ }
+
+ return local_params
+
+
+def get_model_config(folder_configs, dataset, is_agg=False, party_id=0):
+ if is_agg:
+ return None
+ model = nn.Sequential(nn.Conv2d(3, 32, 3, 1),
+ nn.ReLU(),
+ nn.Conv2d(32, 64, 3, 1),
+ nn.ReLU(),
+ nn.MaxPool2d(2, 2),
+ nn.Dropout2d(p=0.25),
+ nn.Flatten(),
+ nn.Linear(12544, 128),
+ nn.ReLU(),
+ nn.Dropout2d(p=0.5),
+ nn.Linear(128, 10),
+ nn.LogSoftmax(dim=1)
+ )
+ if not os.path.exists(folder_configs):
+ os.makedirs(folder_configs)
+
+ # Save model
+ fname = os.path.join(folder_configs, 'pytorch_sequence.pt')
+ torch.save(model, fname)
+ spec = {
+ 'model_name': 'pytorch-nn',
+ 'model_definition': fname
+ }
+ model = {
+ 'name': 'PytorchFLModel',
+ 'path': 'ibmfl.model.pytorch_fl_model',
+ 'spec': spec,
+ }
+ return model
diff --git a/examples/fedavgplus/model_tf.py b/examples/fedavgplus/model_tf.py
new file mode 100644
index 0000000..dd42647
--- /dev/null
+++ b/examples/fedavgplus/model_tf.py
@@ -0,0 +1,65 @@
+import os
+import numpy as np
+import tensorflow as tf
+from tensorflow.keras.layers import Dense, Flatten, Conv2D
+from tensorflow.keras import Model
+
+
+def get_hyperparams(model='keras'):
+ hyperparams = {
+ 'training': {
+ 'epochs': 10,
+ 'batch_size': 10
+ },
+ 'optimizer': {
+ 'lr': 0.0003
+ }
+ }
+
+ return hyperparams
+
+
+def get_model_config(folder_configs, dataset, is_agg=False, party_id=0):
+ if is_agg:
+ return None
+
+ class MyModel(Model):
+ def __init__(self):
+ super(MyModel, self).__init__()
+ self.conv1 = Conv2D(32, 3, activation='relu')
+ self.flatten = Flatten()
+ self.d1 = Dense(128, activation='relu')
+ self.d2 = Dense(10)
+
+ def call(self, x):
+ x = self.conv1(x)
+ x = self.flatten(x)
+ x = self.d1(x)
+ return self.d2(x)
+
+ # Create an instance of the model
+ model = MyModel()
+ loss_object = tf.keras.losses.SparseCategoricalCrossentropy(
+ from_logits=True)
+ optimizer = tf.keras.optimizers.Adam()
+ acc = tf.keras.metrics.SparseCategoricalAccuracy(name='accuracy')
+ model.compile(optimizer=optimizer, loss=loss_object, metrics=[acc])
+ img_rows, img_cols = 28, 28
+ input_shape = (None, img_rows, img_cols, 1)
+ model.compute_output_shape(input_shape=input_shape)
+
+ if not os.path.exists(folder_configs):
+ os.makedirs(folder_configs)
+
+ model.save(folder_configs)
+ model.summary()
+ spec = {'model_name': 'tf-cnn',
+ 'model_definition': folder_configs}
+
+ model = {
+ 'name': 'TensorFlowFLModel',
+ 'path': 'ibmfl.model.tensorflow_fl_model',
+ 'spec': spec
+ }
+
+ return model
diff --git a/examples/fedprox/generate_configs.py b/examples/fedprox/generate_configs.py
index da8867e..e092d1b 100644
--- a/examples/fedprox/generate_configs.py
+++ b/examples/fedprox/generate_configs.py
@@ -13,7 +13,7 @@ def get_fusion_config():
return fusion
-def get_local_training_config():
+def get_local_training_config(configs_folder=None):
local_training_handler = {
'name': 'LocalTrainingHandler',
'path': 'ibmfl.party.training.local_training_handler'
@@ -39,7 +39,7 @@ def get_hyperparams(model='tf'):
def get_data_handler_config(party_id, dataset, folder_data, is_agg=False, model='tf'):
- SUPPORTED_DATASETS = ['mnist']
+ SUPPORTED_DATASETS = ['mnist', 'custom_dataset']
if dataset in SUPPORTED_DATASETS:
if dataset == 'mnist':
dataset = 'mnist_tf'
diff --git a/examples/generate_configs.py b/examples/generate_configs.py
index 99b4b5b..b141708 100644
--- a/examples/generate_configs.py
+++ b/examples/generate_configs.py
@@ -8,10 +8,6 @@
import sys
from importlib import import_module
-import pycloudmessenger.ffl.abstractions as ffl
-import pycloudmessenger.ffl.fflapi as fflapi
-
-
fl_path = os.path.abspath('.')
if fl_path not in sys.path:
sys.path.append(fl_path)
@@ -19,7 +15,9 @@
from examples.constants import GENERATE_CONFIG_DESC, NUM_PARTIES_DESC, \
PATH_CONFIG_DESC, CONF_PATH, MODEL_CONFIG_DESC, NEW_DESC, NAME_DESC, \
FL_EXAMPLES, FL_CONN_TYPES, CONNECTION_TYPE_DESC, FL_MODELS, \
- FUSION_CONFIG_DESC, TASK_NAME_DESC, EXAMPLES_WARNING
+ FUSION_CONFIG_DESC, TASK_NAME_DESC, CONTEXT_PATH
+
+from examples.constants import FL_CONTEXT
def check_valid_folder_structure(p):
@@ -53,42 +51,17 @@ def setup_parser():
p.add_argument("--config_path", "-conf_path", help=CONF_PATH)
p.add_argument("--model", "-m", help=MODEL_CONFIG_DESC, choices=[os.path.basename(
d) for d in FL_MODELS], required=False, default=None)
- p.add_argument("--fusion", "-f", help=FUSION_CONFIG_DESC ,required=False, choices=[os.path.basename(
+ p.add_argument("--fusion", "-f", help=FUSION_CONFIG_DESC ,required=True, choices=[os.path.basename(
d) for d in FL_EXAMPLES])
p.add_argument("--create_new", "-new", action="store_true", help=NEW_DESC)
p.add_argument("--name", help=NAME_DESC)
p.add_argument("--connection", "-c", choices=[os.path.basename(
d) for d in FL_CONN_TYPES], help=CONNECTION_TYPE_DESC, required=False, default="flask")
p.add_argument("--task_name", "-t", help=TASK_NAME_DESC, required=False)
+ p.add_argument("--context_path", "-context", help=CONTEXT_PATH)
return p
-def rabbit_task(credentials: str, aggregator: str, password: str, task_name: str):
- try:
- ffl.Factory.register(
- 'cloud',
- fflapi.Context,
- fflapi.User,
- fflapi.Aggregator,
- fflapi.Participant
- )
-
- context = ffl.Factory.context(
- 'cloud',
- credentials,
- aggregator,
- password
- )
-
- user = ffl.Factory.user(context)
-
- with user:
- result = user.create_task(task_name, ffl.Topology.star, {})
- print(f"Task '{task_name}' created.")
- except Exception as err:
- print('error: %s', err)
- raise
-
def generate_connection_config(conn_type, party_id=0, is_party=False, task_name = None):
connection = {}
@@ -119,14 +92,8 @@ def generate_connection_config(conn_type, party_id=0, is_party=False, task_name
credentials = yaml.load(credentials)
if 'rabbit' in credentials:
- key = 'rabbit'
- elif 'connection' in credentials:
- key = 'connection'
- else:
- raise Exception("IBMFL_BROKER: environment variable not formatted correctly.")
-
- with open('ibmfl_broker_connection.json', 'w') as creds:
- creds.write(json.dumps(credentials[key]))
+ with open('ibmfl_broker_connection.json', 'w') as creds:
+ creds.write(json.dumps(credentials['rabbit']))
connection = {
'name': 'RabbitMQConnection',
@@ -155,8 +122,6 @@ def generate_connection_config(conn_type, party_id=0, is_party=False, task_name
'task_name': task_name
}
- rabbit_task('ibmfl_broker_connection.json', aggregator, password, task_name)
-
return connection
@@ -181,7 +146,7 @@ def get_privacy():
return privacy
-def generate_ph_config(module, conn_type, is_party=False, party_id=None):
+def generate_ph_config(conn_type, is_party=False):
if is_party:
protocol_handler = {
'name': 'PartyProtocolHandler',
@@ -194,8 +159,6 @@ def generate_ph_config(module, conn_type, is_party=False, party_id=None):
}
if conn_type == 'rabbitmq':
protocol_handler['name'] += 'RabbitMQ'
-
-
return protocol_handler
@@ -219,9 +182,10 @@ def generate_model_config(module, model, folder_configs, dataset, is_agg=False,
return model
-def generate_lt_config(module):
+def generate_lt_config(module, folder_configs=None, party_id=None):
get_local_training_config = getattr(module, 'get_local_training_config')
- return get_local_training_config()
+ lt = get_local_training_config(folder_configs)
+ return lt
def generate_datahandler_config(module, model, party_id, dataset, folder_data, is_agg=False):
@@ -232,8 +196,8 @@ def generate_datahandler_config(module, model, party_id, dataset, folder_data, i
return dh
-def generate_agg_config(module, model, num_parties, conn_type,
- dataset, folder_data, folder_configs, task_name = None):
+def generate_agg_config(module, model, num_parties, conn_type,
+ dataset, folder_data, folder_configs, task_name = None):
if not os.path.exists(folder_configs):
os.makedirs(folder_configs)
@@ -243,10 +207,9 @@ def generate_agg_config(module, model, num_parties, conn_type,
'connection': generate_connection_config(conn_type, task_name=task_name),
'fusion': generate_fusion_config(module),
'hyperparams': generate_hp_config(model, module, num_parties),
+ 'protocol_handler': generate_ph_config(conn_type),
}
- content['protocol_handler'] = generate_ph_config(module, conn_type, is_party=False)
-
model_config = generate_model_config(module, model, folder_configs, dataset, True)
data = generate_datahandler_config(module, model, 0, dataset, folder_data, True)
if model_config:
@@ -268,18 +231,18 @@ def generate_party_config(module, model, num_parties, conn_type,
config_file = os.path.join(
folder_configs, 'config_party' + str(i) + '.yml')
- ph = generate_ph_config(module, conn_type, is_party=True)
-
content = {
'connection': generate_connection_config(conn_type, i, True, task_name=task_name),
'data': generate_datahandler_config(module, model, i, dataset, folder_data),
'model': generate_model_config(module, model, folder_configs, dataset, party_id=i),
- 'protocol_handler': ph,
- 'local_training': generate_lt_config(module),
+ 'protocol_handler': generate_ph_config(conn_type, True),
'aggregator': get_aggregator_info(conn_type),
'privacy': get_privacy()
}
+ content['local_training'] = generate_lt_config(module=module,
+ folder_configs=folder_configs, party_id=i)
+
with open(config_file, 'w') as outfile:
yaml.dump(content, outfile)
@@ -287,7 +250,6 @@ def generate_party_config(module, model, num_parties, conn_type,
os.path.abspath(os.path.join(folder_configs, 'config_party*.yml')))
-
if __name__ == '__main__':
# Parse command line options
parser = setup_parser()
@@ -305,6 +267,7 @@ def generate_party_config(module, model, num_parties, conn_type,
exp_name = args.name
conn_type = args.connection
task_name = args.task_name
+ context_path = args.context_path
# Create folder to save configs
if config_path:
@@ -325,22 +288,22 @@ def generate_party_config(module, model, num_parties, conn_type,
if create_new:
folder_configs = os.path.join(
folder_configs, exp_name if exp_name else str(int(time.time())))
- elif model == 'keras_classifier':
- folder_configs = os.path.join(folder_configs, model)
else:
folder_configs = os.path.join(folder_configs, fusion, model)
- # To support tutorials which still have examples with old format
- if(model == 'keras_classifier'):
- model = 'keras'
- fusion = 'iter_avg'
- print(EXAMPLES_WARNING)
# Import and run generate_configs.py
- config_fusion = import_module('examples.{}.generate_configs'.format(fusion))
-
-
-
- generate_agg_config(config_fusion, model, num_parties, conn_type,
- dataset, party_data_path, folder_configs, task_name)
- generate_party_config(config_fusion, model, num_parties, conn_type,
- dataset, party_data_path, folder_configs, task_name)
+ if context_path is not None and context_path != 'None':
+ context = FL_CONTEXT.get(context_path) or None
+ if context is not None:
+ config_fusion = import_module('{}.{}.generate_configs'.format(context,fusion))
+ else:
+ print('Context path - {} is not correct, please check.'.format(context_path))
+ sys.exit(1)
+ else:
+ config_fusion = import_module('examples.{}.generate_configs'.format(fusion))
+ # if the crypto feature is enabled, generate crypto keys
+ # keys = {'keys_agg': None, 'keys_list': None}
+ generate_agg_config(config_fusion, model, num_parties, conn_type,
+ dataset, party_data_path, folder_configs, task_name)
+ generate_party_config(config_fusion, model, num_parties, conn_type,
+ dataset, party_data_path, folder_configs, task_name)
diff --git a/examples/generate_data.py b/examples/generate_data.py
index 180cad0..5dc02ba 100644
--- a/examples/generate_data.py
+++ b/examples/generate_data.py
@@ -5,6 +5,7 @@
import time
import argparse
import numpy as np
+import pandas as pd
fl_path = os.path.abspath('.')
if fl_path not in sys.path:
@@ -27,7 +28,7 @@ def setup_parser():
p = argparse.ArgumentParser(description=GENERATE_DATA_DESC)
p.add_argument("--num_parties", "-n", help=NUM_PARTIES_DESC,
type=int, required=True)
- p.add_argument("--dataset", "-d", choices=FL_DATASETS,
+ p.add_argument("--dataset", "-d",
help=DATASET_DESC, required=True)
p.add_argument("--data_path", "-p", help=PATH_DESC)
p.add_argument("--points_per_party", "-pp", help=PER_PARTY,
@@ -56,7 +57,7 @@ def save_nursery_party_data(nb_dp_per_party, should_stratify, party_folder, data
:type should_stratify: `bool`
:param party_folder: folder to save party data
:type party_folder: `str`
- :param dataset_foler: folder to save dataset
+ :param dataset_folder: folder to save dataset
:type dataset_folder: `str`
"""
if not os.path.exists(dataset_folder):
@@ -102,7 +103,7 @@ def save_adult_party_data(nb_dp_per_party, should_stratify, party_folder, datase
:type should_stratify: `bool`
:param party_folder: folder to save party data
:type party_folder: `str`
- :param dataset_foler: folder to save dataset
+ :param dataset_folder: folder to save dataset
:type dataset_folder: `str`
"""
if not os.path.exists(dataset_folder):
@@ -153,7 +154,7 @@ def save_german_party_data(nb_dp_per_party, should_stratify, party_folder, datas
:type should_stratify: `bool`
:param party_folder: folder to save party data
:type party_folder: `str`
- :param dataset_foler: folder to save dataset
+ :param dataset_folder: folder to save dataset
:type dataset_folder: `str`
"""
if not os.path.exists(dataset_folder):
@@ -198,7 +199,7 @@ def save_compas_party_data(nb_dp_per_party, should_stratify, party_folder, datas
:type should_stratify: `bool``
:param party_folder: folder to save party data
:type party_folder: `str`
- :param dataset_foler: folder to save dataset
+ :param dataset_folder: folder to save dataset
:type dataset_folder: `str`
"""
if not os.path.exists(dataset_folder):
@@ -242,10 +243,12 @@ def save_cifar10_party_data(nb_dp_per_party, should_stratify, party_folder, data
:type should_stratify: `bool`
:param party_folder: folder to save party data
:type party_folder: `str`
- :param dataset_foler: folder to save dataset
+ :param dataset_folder: folder to save dataset
:type dataset_folder: `str`
"""
- (x_train, y_train), (x_test, y_test) = load_cifar10()
+ if not os.path.exists(dataset_folder):
+ os.makedirs(dataset_folder)
+ (x_train, y_train), (x_test, y_test) = load_cifar10(download_dir=dataset_folder)
labels, train_counts = np.unique(y_train, return_counts=True)
te_labels, test_counts = np.unique(y_test, return_counts=True)
if np.all(np.isin(labels, te_labels)):
@@ -306,7 +309,7 @@ def save_mnist_party_data(nb_dp_per_party, should_stratify, party_folder, datase
:type party_folder: `str`
:param dataset_folder: folder to save dataset
:type data_path: `str`
- :param dataset_foler: folder to save dataset
+ :param dataset_folder: folder to save dataset
:type dataset_folder: `str`
"""
if not os.path.exists(dataset_folder):
@@ -371,7 +374,7 @@ def save_higgs_party_data(nb_dp_per_party, should_stratify, party_folder, datase
:type should_stratify: `bool`
:param party_folder: folder to save party data
:type party_folder: `str`
- :param dataset_foler: folder to save dataset
+ :param dataset_folder: folder to save dataset
:type dataset_folder: `str`
"""
if not os.path.exists(dataset_folder):
@@ -419,7 +422,7 @@ def save_airline_party_data(nb_dp_per_party, should_stratify, party_folder, data
:type should_stratify: `bool`
:param party_folder: folder to save party data
:type party_folder: `str`
- :param dataset_foler: folder to save dataset
+ :param dataset_folder: folder to save dataset
:type dataset_folder: `str`
"""
if not os.path.exists(dataset_folder):
@@ -482,7 +485,7 @@ def save_diabetes_party_data(nb_dp_per_party, should_stratify, party_folder, dat
:type should_stratify: `bool`
:param party_folder: folder to save party data
:type party_folder: `str`
- :param dataset_foler: folder to save dataset
+ :param dataset_folder: folder to save dataset
:type dataset_folder: `str`
"""
if not os.path.exists(dataset_folder):
@@ -536,7 +539,7 @@ def save_binovf_party_data(nb_dp_per_party, should_stratify, party_folder, datas
:type should_stratify: `bool`
:param party_folder: folder to save party data
:type party_folder: `str`
- :param dataset_foler: folder to save dataset
+ :param dataset_folder: folder to save dataset
:type dataset_folder: `str`
"""
if not os.path.exists(dataset_folder):
@@ -584,7 +587,7 @@ def save_multovf_party_data(nb_dp_per_party, should_stratify, party_folder, data
:type should_stratify: `bool`
:param party_folder: folder to save party data
:type party_folder: `str`
- :param dataset_foler: folder to save dataset
+ :param dataset_folder: folder to save dataset
:type dataset_folder: `str`
"""
if not os.path.exists(dataset_folder):
@@ -636,7 +639,7 @@ def save_linovf_party_data(nb_dp_per_party, party_folder, dataset_folder):
:type nb_dp_per_party: `list[int]`
:param party_folder: folder to save party data
:type party_folder: `str`
- :param dataset_foler: folder to save dataset
+ :param dataset_folder: folder to save dataset
:type dataset_folder: `str`
"""
if not os.path.exists(dataset_folder):
@@ -675,7 +678,7 @@ def save_femnist_party_data(nb_dp_per_party, should_stratify, party_folder, data
:type party_folder: `str`
:return: None
:rtype: None
- :param dataset_foler: folder to save dataset
+ :param dataset_folder: folder to save dataset
:type dataset_folder: `str`
"""
dataset_folder = os.path.join(dataset_folder, "femnist")
@@ -794,6 +797,69 @@ def save_federated_clustering_data(nb_dp_per_party, party_folder):
print('Finished! :) Data saved in ', party_folder)
+def save_party_data(nb_dp_per_party, should_stratify, party_folder, dataset_folder, dataset):
+ """
+    Loads a generated dataset saved in csv format and creates parties' local datasets
+ as specified.
+
+ :param nb_dp_per_party: the number of data points each party should have
+ :type nb_dp_per_party: `list[int]`
+ :param should_stratify: True if data should be assigned proportional to source class distributions
+ :type should_stratify: `bool`
+ :param party_folder: folder to save party data
+ :type party_folder: `str`
+ :param dataset_folder: folder to save dataset
+ :type dataset_folder: `str`
+ :param dataset: the name of the csv file
+ :type dataset: `str`
+ """
+ dataset_folder = os.path.join(dataset_folder, dataset) + '.csv'
+ print("Loading the original dataset from: " + dataset_folder)
+
+ try:
+ # if no header
+ data = pd.read_csv(dataset_folder, header=None).to_numpy()
+ X, y = data[:, :-1], data[:, -1].astype('int')
+ except Exception as ex:
+ print(ex)
+ print("Warning: please ensure the provided dataset is in .csv format.")
+ print("Please ensure that the class labels are provided in the last column.")
+ print("Warning: please ensure that the class labels are provided as numbers.")
+ print("Loading the dataset assuming the header is provided in the 1st column.")
+ data = pd.read_csv(dataset_folder, header=1).to_numpy()
+ X, y = data[:, :-1], data[:, -1].astype('int')
+
+ num_train = len(X)
+ labels, counts = np.unique(y, return_counts=True)
+
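+    # per-sample selection probability: proportional to the sample's class frequency when stratifying, uniform otherwise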
+ if should_stratify:
+ probs = {label: counts[np.where(labels == label)[
+ 0][0]] / float(num_train) for label in labels}
+ else:
+ probs = {label: 1.0 / num_train for label in labels}
+
+ for i, dp in enumerate(nb_dp_per_party):
+
+ # Regular Dataset
+ p_list = np.array([probs[y[idx]] for idx in range(num_train)])
+ p_list /= np.sum(p_list)
+
+ indices = np.random.choice(num_train, dp, p=p_list)
+ indices = indices.tolist()
+
+ # Use indices for data/classification subset
+ x_part = [','.join(item) for item in X[indices, :].astype(str)]
+ y_part = y[indices]
+
+        # Write each party's subset to its own CSV file
+        name_file = 'data_party' + str(i) + '.csv'
+        name_file = os.path.join(party_folder, name_file)
+        with open(name_file, 'w') as out:
+            for j in range(len(x_part)):
+                out.write(x_part[j] + ',' + str(int(y_part[j])) + '\n')
+
+ print('Finished! :) Data saved in', party_folder)
if __name__ == '__main__':
# Parse command line options
@@ -878,3 +944,6 @@ def save_federated_clustering_data(nb_dp_per_party, party_folder):
save_femnist_party_data(points_per_party, stratify, folder_party_data, folder_dataset)
elif dataset == 'cifar10':
save_cifar10_party_data(points_per_party, stratify, folder_party_data, folder_dataset)
+ else:
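+        # reached for any dataset without a dedicated loader; expects <dataset folder>/<dataset>.csv to already exist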
+ print("Loading a non-default dataset, redircting to general data split method...")
+ save_party_data(points_per_party, stratify, folder_party_data, folder_dataset, dataset)
diff --git a/examples/geometric_median_plus/README.md b/examples/geometric_median_plus/README.md
new file mode 100644
index 0000000..349de7f
--- /dev/null
+++ b/examples/geometric_median_plus/README.md
@@ -0,0 +1,62 @@
+# Running Geometric Median Plus (geometric_median_plus) with TensorFlow
+
+**Geometric Median Plus is one variation of Fed+ fusion algorithms proposed here: [Fed+: A Unified Approach to Robust Personalized Federated Learning](https://arxiv.org/pdf/2009.06303.pdf)**
+
+
+More variations of Fed+ can be found at:
+
+1. [Fedavg_plus](../fedavgplus)
+2. [Geometric_median_plus](../geometric_median_plus)
+
+This example explains how to run the Geometric Median Plus algorithm on CNNs implemented with TensorFlow, training on
+[MNIST](http://yann.lecun.com/exdb/mnist/) data. Data in this example is preprocessed by scaling pixel values down from the range `[0, 255]` to `[0, 1]`.
+No other preprocessing is performed.
+
+## Setup FL
+
+- Split data by running:
+
+ ```
+    python examples/generate_data.py -n <num_parties> -d mnist -pp <points_per_party>
+ ```
+For example, to generate sample data on MNIST dataset, you could run:
+```
+python examples/generate_data.py -n 2 -d mnist -pp 200
+```
+
+Run `python examples/generate_data.py -h` for full descriptions
+of the different options.
+
+- Generate config files by running:
+ ```
+    python examples/generate_configs.py -n <num_parties> -f geometric_median_plus -m tf -d <dataset> -p <path to party data>
+ ```
+To run FL, you must have configuration files for the aggregator and for each party.
+
+You can generate these config files using the `generate_configs.py` script.
+
+For example, you could run:
+
+```
+python examples/generate_configs.py -f geometric_median_plus -m tf -n 2 -d mnist -p examples/data/mnist/random
+```
+
+This command would generate the configs for the `tf_classifier_mnist` model, assuming 2 parties.
+You must also specify the party data path.
+
+Run `python examples/generate_configs.py -h` for full descriptions of the different options.
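+
+The generated party configs also include Fed+ specific settings for the local training handler, `alpha: 0.01` and `rho: 10`, as set in this example's `generate_configs.py`; you can adjust these values in the generated configs before training.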
+
+- In a terminal running an activated IBM FL environment
+(refer to Quickstart in our website to learn more about how to set up the running environment), start the aggregator by running:
+ ```
+ python -m ibmfl.aggregator.aggregator
+ ```
+ Type `START` and press enter to start accepting connections
+- In a terminal running an activated IBM FL environment, start each party by running:
+ ```
+ python -m ibmfl.party.party
+ ```
+ Type `START` and press enter to start accepting connections.
+
+ Type `REGISTER` and press enter to register the party with the aggregator.
+- Finally, start training by entering `TRAIN` in the aggregator terminal.
\ No newline at end of file
diff --git a/examples/geometric_median_plus/generate_configs.py b/examples/geometric_median_plus/generate_configs.py
new file mode 100644
index 0000000..1d3ceef
--- /dev/null
+++ b/examples/geometric_median_plus/generate_configs.py
@@ -0,0 +1,111 @@
+import os
+
+import tensorflow as tf
+from tensorflow.keras.layers import Dense, Flatten, Conv2D
+from tensorflow.keras import Model
+
+import examples.datahandlers as datahandlers
+
+
+def get_fusion_config():
+ fusion = {
+ 'name': 'GeometricMedianFedplusFusionHandler',
+ 'path': 'ibmfl.aggregator.fusion.geometric_median_fedplus_fusion_handler'
+ }
+ return fusion
+
+
+def get_local_training_config(configs_folder=None):
+ local_training_handler = {
+ 'name': 'GeometricMedianFedPlusLocalTrainingHandler',
+ 'path': 'ibmfl.party.training.geometric_median_fedplus_local_training_handler'
+ }
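+    # Fed+ specific parameters passed to this example's local training handler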
+ local_training_handler['info'] = {
+ 'alpha': 0.01,
+ 'rho': 10
+ }
+ return local_training_handler
+
+
+def get_hyperparams(model='keras'):
+ hyperparams = {
+ 'global': {
+ 'rounds': 3,
+ 'termination_accuracy': 0.83,
+ 'max_timeout': 600,
+ 'rho': 10
+ },
+ 'local': {
+ 'training': {
+ 'epochs': 10,
+ 'batch_size': 10
+ },
+ 'optimizer': {
+ 'lr': 0.0003
+ }
+ }
+ }
+
+ return hyperparams
+
+
+def get_data_handler_config(party_id, dataset, folder_data, is_agg=False, model='tf'):
+ SUPPORTED_DATASETS = ['mnist', 'custom_dataset']
+ if dataset in SUPPORTED_DATASETS:
+ dataset = dataset + "_" + model
+
+ data = datahandlers.get_datahandler_config(
+ dataset, folder_data, party_id, is_agg)
+ else:
+ raise Exception(
+ "The dataset {} is a wrong combination for fusion/model".format(dataset))
+ return data
+
+
+def get_model_config(folder_configs, dataset, is_agg=False, party_id=0, model='tf'):
+ if is_agg:
+ return None
+
+    if model is None or model == 'default':
+ model = 'tf'
+
+ class MyModel(Model):
+ def __init__(self):
+ super(MyModel, self).__init__()
+ self.conv1 = Conv2D(32, 3, activation='relu')
+ self.flatten = Flatten()
+ self.d1 = Dense(128, activation='relu')
+ self.d2 = Dense(10)
+
+ def call(self, x):
+ x = self.conv1(x)
+ x = self.flatten(x)
+ x = self.d1(x)
+ return self.d2(x)
+
+ # Create an instance of the model
+ model = MyModel()
+ loss_object = tf.keras.losses.SparseCategoricalCrossentropy(
+ from_logits=True)
+ optimizer = tf.keras.optimizers.Adam()
+ acc = tf.keras.metrics.SparseCategoricalAccuracy(name='accuracy')
+ model.compile(optimizer=optimizer, loss=loss_object, metrics=[acc])
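+    # build the subclassed model on a (None, 28, 28, 1) input shape so it can be saved below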
+ img_rows, img_cols = 28, 28
+ input_shape = (None, img_rows, img_cols, 1)
+ model.compute_output_shape(input_shape=input_shape)
+
+ if not os.path.exists(folder_configs):
+ os.makedirs(folder_configs)
+
+ model.save(folder_configs)
+
+ spec = {'model_name': 'tf-cnn',
+ 'model_definition': folder_configs}
+
+ model = {
+ 'name': 'TensorFlowFLModel',
+ 'path': 'ibmfl.model.tensorflow_fl_model',
+ 'spec': spec
+ }
+
+ return model
diff --git a/examples/gradient_aggregation/generate_configs.py b/examples/gradient_aggregation/generate_configs.py
index 1b0386b..ca92259 100644
--- a/examples/gradient_aggregation/generate_configs.py
+++ b/examples/gradient_aggregation/generate_configs.py
@@ -13,7 +13,7 @@ def get_fusion_config():
return fusion
-def get_local_training_config():
+def get_local_training_config(configs_folder=None):
local_training_handler = {
'name': 'GradientLocalTrainingHandler',
'path': 'ibmfl.party.training.gradient_local_training_handler'
@@ -42,7 +42,7 @@ def get_hyperparams(model):
def get_data_handler_config(party_id, dataset, folder_data, is_agg=False, model='keras'):
- SUPPORTED_DATASETS = ['mnist']
+ SUPPORTED_DATASETS = ['mnist', 'custom_dataset']
if dataset in SUPPORTED_DATASETS:
if model not in 'keras':
dataset = dataset + "_" + model
diff --git a/examples/gradient_aggregation/model_keras.py b/examples/gradient_aggregation/model_keras.py
index 9377b37..39d7214 100644
--- a/examples/gradient_aggregation/model_keras.py
+++ b/examples/gradient_aggregation/model_keras.py
@@ -10,9 +10,6 @@ def get_hyperparams():
local_params = {
'training': {
'epochs': 3
- },
- 'optimizer': {
- 'lr': 0.01
}
}
diff --git a/examples/id3_dt/README.md b/examples/id3_dt/README.md
index 5e27b95..f3d727f 100644
--- a/examples/id3_dt/README.md
+++ b/examples/id3_dt/README.md
@@ -75,7 +75,7 @@ The following preprocessing was performed before training:
```
- Generate config files by running:
```
- python examples/generate_configs.py -n -f id3_dt -d adult -p examples/data/adult/random
+ python examples/generate_configs.py -n 3 -f id3_dt -d adult -p examples/data/adult/random
```
- In a terminal running an activated IBM FL environment
(refer to Quickstart in our website to learn more about how to set up the running environment), start the aggregator by running:
diff --git a/examples/id3_dt/generate_configs.py b/examples/id3_dt/generate_configs.py
index fa35d80..ce2814a 100644
--- a/examples/id3_dt/generate_configs.py
+++ b/examples/id3_dt/generate_configs.py
@@ -15,7 +15,7 @@ def get_fusion_config():
return fusion
-def get_local_training_config():
+def get_local_training_config(configs_folder=None):
local_training_handler = {
'name': 'LocalTrainingHandler',
'path': 'ibmfl.party.training.local_training_handler'
@@ -35,7 +35,7 @@ def get_hyperparams(model=None):
def get_data_handler_config(party_id, dataset, folder_data, is_agg=False, model=None):
- SUPPORTED_DATASETS = ['adult', 'nursery']
+ SUPPORTED_DATASETS = ['adult', 'nursery', 'custom_dataset']
if dataset in SUPPORTED_DATASETS:
data = datahandlers.get_datahandler_config(
dataset, folder_data, party_id, is_agg)
diff --git a/examples/iter_avg/generate_configs.py b/examples/iter_avg/generate_configs.py
index 105e528..72c280e 100644
--- a/examples/iter_avg/generate_configs.py
+++ b/examples/iter_avg/generate_configs.py
@@ -13,7 +13,7 @@ def get_fusion_config():
return fusion
-def get_local_training_config():
+def get_local_training_config(configs_folder=None):
local_training_handler = {
'name': 'LocalTrainingHandler',
'path': 'ibmfl.party.training.local_training_handler'
@@ -42,7 +42,7 @@ def get_hyperparams(model):
def get_data_handler_config(party_id, dataset, folder_data, is_agg=False, model='keras'):
- SUPPORTED_DATASETS = ['mnist', 'adult', 'cifar10', 'femnist']
+ SUPPORTED_DATASETS = ['mnist', 'adult', 'cifar10', 'femnist', 'custom_dataset']
if dataset in SUPPORTED_DATASETS:
if model not in 'keras':
dataset = dataset + "_" + model
diff --git a/examples/iter_avg/model_keras.py b/examples/iter_avg/model_keras.py
index 042c339..47b04a6 100644
--- a/examples/iter_avg/model_keras.py
+++ b/examples/iter_avg/model_keras.py
@@ -10,9 +10,6 @@ def get_hyperparams():
local_params = {
'training': {
'epochs': 3
- },
- 'optimizer': {
- 'lr': 0.01
}
}
@@ -31,6 +28,10 @@ def get_model_config(folder_configs, dataset, is_agg=False, party_id=0):
elif dataset == 'femnist':
return get_femnist_model_config(folder_configs)
+
+ elif dataset == 'custom_dataset':
+        print('Using the same model as for MNIST; provide a model file to replace it if needed.')
+ return get_mnist_model_config(folder_configs)
else:
raise Exception(
"The dataset {} is a wrong combination for fusion/model".format(dataset))
diff --git a/examples/krum/generate_configs.py b/examples/krum/generate_configs.py
index 4cb246e..6d84c98 100644
--- a/examples/krum/generate_configs.py
+++ b/examples/krum/generate_configs.py
@@ -15,7 +15,7 @@ def get_fusion_config():
return fusion
-def get_local_training_config():
+def get_local_training_config(configs_folder=None):
local_training_handler = {
'name': 'LocalTrainingHandler',
'path': 'ibmfl.party.training.local_training_handler'
@@ -43,7 +43,7 @@ def get_hyperparams(model):
def get_data_handler_config(party_id, dataset, folder_data, is_agg=False, model='keras'):
- SUPPORTED_DATASETS = ['mnist']
+ SUPPORTED_DATASETS = ['mnist', 'adult', 'custom_dataset']
if dataset in SUPPORTED_DATASETS:
if model not in 'keras':
dataset = dataset + "_" + model
diff --git a/examples/krum/model_keras.py b/examples/krum/model_keras.py
index 7a40478..ae25c9a 100644
--- a/examples/krum/model_keras.py
+++ b/examples/krum/model_keras.py
@@ -10,9 +10,6 @@ def get_hyperparams():
local_params = {
'training': {
'epochs': 3
- },
- 'optimizer': {
- 'lr': 0.01
}
}
diff --git a/examples/naive_bayes_dp/generate_configs.py b/examples/naive_bayes_dp/generate_configs.py
index 0f33ddb..bb83f1d 100644
--- a/examples/naive_bayes_dp/generate_configs.py
+++ b/examples/naive_bayes_dp/generate_configs.py
@@ -15,7 +15,7 @@ def get_fusion_config():
return fusion
-def get_local_training_config():
+def get_local_training_config(configs_folder=None):
local_training_handler = {
'name': 'LocalTrainingHandler',
'path': 'ibmfl.party.training.local_training_handler'
diff --git a/examples/pfnm/generate_configs.py b/examples/pfnm/generate_configs.py
index bc09b15..2e19269 100644
--- a/examples/pfnm/generate_configs.py
+++ b/examples/pfnm/generate_configs.py
@@ -14,7 +14,7 @@ def get_fusion_config():
return fusion
-def get_local_training_config():
+def get_local_training_config(configs_folder=None):
local_training_handler = {
'name': 'PFNMLocalTrainingHandler',
'path': 'ibmfl.party.training.pfnm_local_training_handler'
@@ -43,7 +43,7 @@ def get_hyperparams(model):
def get_data_handler_config(party_id, dataset, folder_data, is_agg=False, model='keras'):
- SUPPORTED_DATASETS = ['mnist']
+ SUPPORTED_DATASETS = ['mnist', 'custom_dataset']
if dataset in SUPPORTED_DATASETS:
if model not in 'keras':
dataset = dataset + "_" + model
diff --git a/examples/pfnm/model_keras.py b/examples/pfnm/model_keras.py
index 2fc9a20..feafbcc 100644
--- a/examples/pfnm/model_keras.py
+++ b/examples/pfnm/model_keras.py
@@ -10,9 +10,6 @@ def get_hyperparams():
local_params = {
'training': {
'epochs': 3
- },
- 'optimizer': {
- 'lr': 0.01
}
}
diff --git a/examples/prej_remover/README.md b/examples/prej_remover/README.md
index 9ea7c26..8a15203 100644
--- a/examples/prej_remover/README.md
+++ b/examples/prej_remover/README.md
@@ -7,7 +7,8 @@ Currently, for Federated Prejudice Removal we support following datasets:
# Running Federated Prejudice Removal in FL
-This example provides the implementation for Federated Prejudice Removal presented in [Mitigating Bias in Federated Learning](https://arxiv.org/abs/2012.02447).
+This example explains how to run Federated Prejudice Removal, a federated learning implementation of the [Kamishima Algorithm](https://github.com/algofairness/fairness-comparison/tree/master/fairness/algorithms/kamishima). We use this algorithm in our paper on bias mitigation and federated learning; see it [here](https://arxiv.org/abs/2012.02447).
+
This example requires the `gensim` library. Run the following:
```
pip install gensim
diff --git a/examples/prej_remover/generate_configs.py b/examples/prej_remover/generate_configs.py
index e94f82c..022d3a8 100644
--- a/examples/prej_remover/generate_configs.py
+++ b/examples/prej_remover/generate_configs.py
@@ -8,7 +8,7 @@ def get_fusion_config():
return fusion
-def get_local_training_config():
+def get_local_training_config(configs_folder=None):
local_training_handler = {
'name': 'PRLocalTrainingHandler',
'path': 'ibmfl.party.training.pr_local_training_handler'
@@ -33,7 +33,7 @@ def get_hyperparams(model):
def get_data_handler_config(party_id, dataset, folder_data, is_agg=False, model='None'):
- SUPPORTED_DATASETS = ['adult', 'compas']
+ SUPPORTED_DATASETS = ['adult', 'compas', 'custom_dataset']
if dataset in SUPPORTED_DATASETS:
if dataset == 'adult':
diff --git a/examples/rl_cartpole/generate_configs.py b/examples/rl_cartpole/generate_configs.py
index 5e44a4c..bf55fa1 100644
--- a/examples/rl_cartpole/generate_configs.py
+++ b/examples/rl_cartpole/generate_configs.py
@@ -13,7 +13,7 @@ def get_fusion_config():
return fusion
-def get_local_training_config():
+def get_local_training_config(configs_folder=None):
local_training_handler = {
'name': 'RLLocalTrainingHandler',
'path': 'ibmfl.party.training.rl_local_training_handler'
diff --git a/examples/rl_pendulum/generate_configs.py b/examples/rl_pendulum/generate_configs.py
index d0ddbdd..473a2f5 100644
--- a/examples/rl_pendulum/generate_configs.py
+++ b/examples/rl_pendulum/generate_configs.py
@@ -12,7 +12,7 @@ def get_fusion_config():
return fusion
-def get_local_training_config():
+def get_local_training_config(configs_folder=None):
local_training_handler = {
'name': 'RLLocalTrainingHandler',
'path': 'ibmfl.party.training.rl_local_training_handler'
diff --git a/examples/shuffle_iter_avg/README.md b/examples/shuffle_iter_avg/README.md
new file mode 100644
index 0000000..dba9a34
--- /dev/null
+++ b/examples/shuffle_iter_avg/README.md
@@ -0,0 +1,58 @@
+# Running shuffled iterative averaging aggregation with Keras CNN in IBM FL library
+
+This example explains how to run federated learning using [shuffled iterative averaging aggregation](https://arxiv.org/pdf/2105.09400.pdf) on CNNs implemented with Keras, training on
+[MNIST](http://yann.lecun.com/exdb/mnist/) data. Data in this example is preprocessed by scaling pixel values down from the range `[0, 255]` to `[0, 1]`.
+No other preprocessing is performed.
+
+## Model Setup
+
+This experiment can be run using models with different underlying frameworks. By default, configs for a Keras (TF 1.15) based model are generated, but configs for other models, such as PyTorch, Scikit-learn and TensorFlow/Keras (TF 2.1), can be created by changing the `-m` param.
+
+
+| Model Type | Params |
+|:---------------------------:|:---------:|
+| Keras (with tf 1.15) | keras |
+| Pytorch | pytorch |
+| Scikit Learn | sklearn |
+| Tensorflow/keras( tf 2.1) | tf |
+
+
+## Dataset Setup
+Shuffled iterative averaging fusion can be run on different datasets by just changing the `-d` param while generating configs. The model definition changes as the dataset changes; we currently support only the combinations shown below.
+
+| Dataset | Params | Keras | Pytorch | TF | sklearn |
+|:------------------:|:--------: |:--------:|:--------:|:--------:|:--------:|
+| MNIST | mnist | YES | YES | YES | YES |
+| Adult Dataset | adult | NO | NO | NO | YES |
+| Cifar-10 | cifar10 | YES | NO | NO | NO |
+| FEMNIST | femnist | YES | NO | NO | NO |
+
+
+- Split data by running:
+ ```
+    python examples/generate_data.py -n <num_parties> -d <dataset> -pp <points_per_party>
+ ```
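+For example, to generate sample data on the MNIST dataset, you could run:
+```
+python examples/generate_data.py -n 2 -d mnist -pp 200
+```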
+- Generate config files by running:
+ ```
+    python examples/generate_configs.py -f shuffle_iter_avg -m keras -n <num_parties> -d <dataset> -p <path to party data>
+ ```
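+For example, assuming 2 parties and the MNIST dataset, you could run:
+```
+python examples/generate_configs.py -f shuffle_iter_avg -m keras -n 2 -d mnist -p examples/data/mnist/random
+```
+Run `python examples/generate_configs.py -h` for full descriptions of the different options.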
+- Specify the file path of the permutation seed in the party config:
+ ```
+ local_training:
+ info:
+ permute_secret:
+ ```
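+This example's `generate_configs.py` also writes a default seed file named `permute_secret.seed` into the generated configs folder, which you can point `permute_secret` at if you do not want to supply your own shared seed file.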
+- In a terminal running an activated IBM FL environment
+(refer to Quickstart in our website to learn more about how to set up the running environment), start the aggregator by running:
+ ```
+ python -m ibmfl.aggregator.aggregator
+ ```
+ Type `START` and press enter to start accepting connections
+- In a terminal running an activated IBM FL environment, start each party by running:
+ ```
+ python -m ibmfl.party.party
+ ```
+ Type `START` and press enter to start accepting connections.
+
+ Type `REGISTER` and press enter to register the party with the aggregator.
+- Finally, start training by entering `TRAIN` in the aggregator terminal.
\ No newline at end of file
diff --git a/examples/shuffle_iter_avg/generate_configs.py b/examples/shuffle_iter_avg/generate_configs.py
new file mode 100644
index 0000000..c0fc779
--- /dev/null
+++ b/examples/shuffle_iter_avg/generate_configs.py
@@ -0,0 +1,82 @@
+import os
+import random
+import sys
+from importlib import import_module
+
+import examples.datahandlers as datahandlers
+_g_seed = None
+
+def get_fusion_config():
+ fusion = {
+ 'name': 'ShuffleIterAvgFusionHandler',
+ 'path': 'ibmfl.aggregator.fusion.shuffle_iter_avg_fusion_handler'
+ }
+ return fusion
+
+
+def get_local_training_config(configs_folder=None):
+ global _g_seed
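+    # generate the seed once at module level so every party's config points to the same shared seed file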
+ # default seed file
+ seed_file = os.path.join(configs_folder, 'permute_secret.seed')
+
+ if not _g_seed:
+ _g_seed = random.randrange(sys.maxsize)
+ with open(seed_file, 'w') as outfile:
+ outfile.write(str(_g_seed))
+
+ local_training_handler = {
+ 'name': 'ShuffleLocalTrainingHandler',
+ 'path': 'ibmfl.party.training.shuffle_local_training_handler',
+ 'info': {
+ 'permute_secret': seed_file
+ }
+ }
+ return local_training_handler
+
+
+def get_hyperparams(model):
+ hyperparams = {
+ 'global': {
+ 'rounds': 3,
+ 'termination_accuracy': 0.9,
+ 'max_timeout': 60
+ }
+ }
+ current_module = globals().get('__package__')
+
+ model_module = import_module('{}.model_{}'.format(current_module, model))
+ local_params_method = getattr(model_module, 'get_hyperparams')
+
+ local_params = local_params_method()
+ hyperparams['local'] = local_params
+
+ return hyperparams
+
+
+def get_data_handler_config(party_id, dataset, folder_data, is_agg=False, model='keras'):
+ SUPPORTED_DATASETS = ['mnist', 'adult', 'cifar10', 'femnist', 'custom_dataset']
+ if dataset in SUPPORTED_DATASETS:
+ if model not in 'keras':
+ dataset = dataset + "_" + model
+
+ data = datahandlers.get_datahandler_config(
+ dataset, folder_data, party_id, is_agg)
+ else:
+ raise Exception(
+ "The dataset {} is a wrong combination for fusion/model".format(dataset))
+ return data
+
+
+def get_model_config(folder_configs, dataset, is_agg=False, party_id=0, model='keras'):
+ SUPPORTED_MODELS = ['keras', 'pytorch', 'tf', 'sklearn']
+
+ if model not in SUPPORTED_MODELS:
+ raise Exception("Invalid model config for this fusion algorithm")
+
+ current_module = globals().get('__package__')
+
+ model_module = import_module('{}.model_{}'.format(current_module, model))
+ method = getattr(model_module, 'get_model_config')
+
+ return method(folder_configs, dataset, is_agg=is_agg, party_id=0)
+
diff --git a/examples/shuffle_iter_avg/model_keras.py b/examples/shuffle_iter_avg/model_keras.py
new file mode 100644
index 0000000..027873a
--- /dev/null
+++ b/examples/shuffle_iter_avg/model_keras.py
@@ -0,0 +1,175 @@
+import os
+import keras
+from keras import backend as K
+from keras.layers import Conv2D, MaxPooling2D
+from keras.layers import Dense, Dropout, Flatten
+from keras.models import Sequential
+
+
+def get_hyperparams():
+ local_params = {
+ 'training': {
+ 'epochs': 3
+ }
+ }
+
+ return local_params
+
+def get_model_config(folder_configs, dataset, is_agg=False, party_id=0):
+
+ if is_agg:
+ return None
+
+ if dataset == 'mnist':
+ return get_mnist_model_config(folder_configs)
+
+ elif dataset == 'cifar10':
+ return get_cifar10_model_config(folder_configs)
+
+ elif dataset == 'femnist':
+ return get_femnist_model_config(folder_configs)
+ else:
+ raise Exception(
+ "The dataset {} is a wrong combination for fusion/model".format(dataset))
+
+
+def get_mnist_model_config(folder_configs):
+
+
+ num_classes = 10
+ img_rows, img_cols = 28, 28
+ if K.image_data_format() == 'channels_first':
+ input_shape = (1, img_rows, img_cols)
+ else:
+ input_shape = (img_rows, img_cols, 1)
+
+ model = Sequential()
+ model.add(Conv2D(32, kernel_size=(3, 3),
+ activation='relu',
+ input_shape=input_shape))
+ model.add(Conv2D(64, (3, 3), activation='relu'))
+ model.add(MaxPooling2D(pool_size=(2, 2)))
+ model.add(Dropout(0.25))
+ model.add(Flatten())
+ model.add(Dense(128, activation='relu'))
+ model.add(Dropout(0.5))
+ model.add(Dense(num_classes, activation='softmax'))
+
+ model.compile(loss=keras.losses.categorical_crossentropy,
+ optimizer=keras.optimizers.Adadelta(),
+ metrics=['accuracy'])
+
+ if not os.path.exists(folder_configs):
+ os.makedirs(folder_configs)
+
+ # Save model
+ fname = os.path.join(folder_configs, 'compiled_keras.h5')
+ model.save(fname)
+
+ K.clear_session()
+ # Generate model spec:
+ spec = {
+ 'model_name': 'keras-cnn',
+ 'model_definition': fname
+ }
+
+ model = {
+ 'name': 'KerasFLModel',
+ 'path': 'ibmfl.model.keras_fl_model',
+ 'spec': spec
+ }
+
+ return model
+
+
+def get_cifar10_model_config(folder_configs):
+
+ num_classes = 10
+ img_rows, img_cols = 32, 32
+ if K.image_data_format() == 'channels_first':
+ input_shape = (3, img_rows, img_cols)
+ else:
+ input_shape = (img_rows, img_cols, 3)
+
+ model = Sequential()
+ model.add(Conv2D(32, kernel_size=(3, 3), activation='relu', input_shape=input_shape))
+ model.add(MaxPooling2D(pool_size=(2, 2)))
+ model.add(Conv2D(64, kernel_size=(3, 3), activation='relu'))
+ model.add(MaxPooling2D(pool_size=(2, 2)))
+ model.add(Conv2D(128, kernel_size=(3, 3), activation='relu'))
+ model.add(MaxPooling2D(pool_size=(2, 2)))
+ model.add(Flatten())
+ model.add(Dense(256, activation='relu'))
+ model.add(Dense(128, activation='relu'))
+ model.add(Dense(num_classes, activation='softmax'))
+
+ model.compile(loss=keras.losses.categorical_crossentropy,
+ optimizer=keras.optimizers.Adam(),
+ metrics=['accuracy'])
+ if not os.path.exists(folder_configs):
+ os.makedirs(folder_configs)
+
+ # Save model
+ fname = os.path.join(folder_configs, 'compiled_cifar10_keras.h5')
+ model.save(fname)
+
+ K.clear_session()
+ # Generate model spec:
+ spec = {
+ 'model_name': 'keras-cnn-cifar10',
+ 'model_definition': fname
+ }
+
+ model = {
+ 'name': 'KerasFLModel',
+ 'path': 'ibmfl.model.keras_fl_model',
+ 'spec': spec
+ }
+
+ return model
+
+def get_femnist_model_config(folder_configs):
+
+ num_classes = 62
+ img_rows, img_cols = 28, 28
+ if K.image_data_format() == 'channels_first':
+ input_shape = (1, img_rows, img_cols)
+ else:
+ input_shape = (img_rows, img_cols, 1)
+
+ model = Sequential()
+ model.add(Conv2D(32, kernel_size=(5, 5),
+ activation='relu', padding='same',
+ input_shape=input_shape))
+ model.add(MaxPooling2D(pool_size=(2, 2), strides=2))
+
+ model.add(Conv2D(64, (5, 5), activation='relu', padding='same',))
+ model.add(MaxPooling2D(pool_size=(2, 2), strides=2))
+ model.add(Flatten())
+ model.add(Dense(2048, activation='relu'))
+ model.add(Dense(num_classes, activation='softmax'))
+
+ model.compile(loss=keras.losses.categorical_crossentropy,
+ optimizer=keras.optimizers.SGD(),
+ metrics=['accuracy'])
+ if not os.path.exists(folder_configs):
+ os.makedirs(folder_configs)
+
+ # Save model
+ fname = os.path.join(folder_configs, 'compiled_femnist_keras.h5')
+ model.save(fname)
+
+ K.clear_session()
+ # Generate model spec:
+ spec = {
+ 'model_name': 'keras-cnn',
+ 'model_definition': fname
+ }
+
+ model = {
+ 'name': 'KerasFLModel',
+ 'path': 'ibmfl.model.keras_fl_model',
+ 'spec': spec
+ }
+
+ return model
diff --git a/examples/shuffle_iter_avg/model_pytorch.py b/examples/shuffle_iter_avg/model_pytorch.py
new file mode 100644
index 0000000..e7b3009
--- /dev/null
+++ b/examples/shuffle_iter_avg/model_pytorch.py
@@ -0,0 +1,50 @@
+import os
+import torch
+from torch import nn
+
+
+def get_hyperparams():
+ local_params = {
+ 'training': {
+ 'epochs': 3,
+ 'lr': 1
+ },
+ 'optimizer': 'optim.Adadelta'
+ }
+
+ return local_params
+
+
+def get_model_config(folder_configs, dataset, is_agg=False, party_id=0):
+
+ if is_agg:
+ return None
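+    # a simple CNN for 28x28 single-channel inputs (e.g. MNIST); the final LogSoftmax outputs log-probabilities over 10 classes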
+ model = nn.Sequential(nn.Conv2d(1, 32, 3, 1),
+ nn.ReLU(),
+ nn.Conv2d(32, 64, 3, 1),
+ nn.ReLU(),
+ nn.MaxPool2d(2, 2),
+ nn.Dropout2d(p=0.25),
+ nn.Flatten(),
+ nn.Linear(9216, 128),
+ nn.ReLU(),
+ nn.Dropout2d(p=0.5),
+ nn.Linear(128, 10),
+ nn.LogSoftmax(dim=1)
+ )
+ if not os.path.exists(folder_configs):
+ os.makedirs(folder_configs)
+
+ # Save model
+ fname = os.path.join(folder_configs, 'pytorch_sequence.pt')
+ torch.save(model, fname)
+ spec = {
+ 'model_name': 'pytorch-nn',
+ 'model_definition': fname
+ }
+ model = {
+ 'name': 'PytorchFLModel',
+ 'path': 'ibmfl.model.pytorch_fl_model',
+ 'spec': spec,
+ }
+ return model
diff --git a/examples/shuffle_iter_avg/model_sklearn.py b/examples/shuffle_iter_avg/model_sklearn.py
new file mode 100644
index 0000000..1481ae3
--- /dev/null
+++ b/examples/shuffle_iter_avg/model_sklearn.py
@@ -0,0 +1,45 @@
+import os
+import joblib
+import numpy as np
+from sklearn.linear_model import SGDClassifier
+
+
+def get_hyperparams():
+ local_params = {
+ 'training': {
+ 'max_iter': 2
+ }
+ }
+ return local_params
+
+
+def get_model_config(folder_configs, dataset, is_agg=False, party_id=0):
+ if is_agg:
+ return None
+
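+    # with loss='log', SGDClassifier fits a logistic regression model via stochastic gradient descent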
+ model = SGDClassifier(loss='log', penalty='l2')
+ if dataset == 'adult':
+ model.classes_ = np.array([0, 1])
+ elif dataset == 'mnist':
+ model.classes_ = np.array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])
+
+ if not os.path.exists(folder_configs):
+ os.makedirs(folder_configs)
+
+ fname = os.path.join(folder_configs, 'model_architecture.pickle')
+
+ with open(fname, 'wb') as f:
+ joblib.dump(model, f)
+
+ # Generate model spec:
+ spec = {
+ 'model_definition': fname
+ }
+
+ model = {
+ 'name': 'SklearnSGDFLModel',
+ 'path': 'ibmfl.model.sklearn_SGD_linear_fl_model',
+ 'spec': spec
+ }
+
+ return model
\ No newline at end of file
diff --git a/examples/shuffle_iter_avg/model_tf.py b/examples/shuffle_iter_avg/model_tf.py
new file mode 100644
index 0000000..4700629
--- /dev/null
+++ b/examples/shuffle_iter_avg/model_tf.py
@@ -0,0 +1,61 @@
+import os
+import numpy as np
+import tensorflow as tf
+from tensorflow.keras.layers import Dense, Flatten, Conv2D
+from tensorflow.keras import Model
+
+
+def get_hyperparams():
+ local_params = {
+ 'training': {
+ 'epochs': 3
+ }
+ }
+
+ return local_params
+
+def get_model_config(folder_configs, dataset, is_agg=False, party_id=0):
+ if is_agg:
+ return None
+
+ class MyModel(Model):
+ def __init__(self):
+ super(MyModel, self).__init__()
+ self.conv1 = Conv2D(32, 3, activation='relu')
+ self.flatten = Flatten()
+ self.d1 = Dense(128, activation='relu')
+ self.d2 = Dense(10)
+
+ def call(self, x):
+ x = self.conv1(x)
+ x = self.flatten(x)
+ x = self.d1(x)
+ return self.d2(x)
+
+ # Create an instance of the model
+ model = MyModel()
+ loss_object = tf.keras.losses.SparseCategoricalCrossentropy(
+ from_logits=True)
+ optimizer = tf.keras.optimizers.Adam()
+ acc = tf.keras.metrics.SparseCategoricalAccuracy(name='accuracy')
+ model.compile(optimizer=optimizer, loss=loss_object, metrics=[acc])
+ img_rows, img_cols = 28, 28
+ input_shape = (None, img_rows, img_cols, 1)
+ model.compute_output_shape(input_shape=input_shape)
+
+ if not os.path.exists(folder_configs):
+ os.makedirs(folder_configs)
+
+ model.save(folder_configs)
+
+ spec = {'model_name': 'tf-cnn',
+ 'model_definition': folder_configs}
+
+ model = {
+ 'name': 'TensorFlowFLModel',
+ 'path': 'ibmfl.model.tensorflow_fl_model',
+ 'spec': spec
+ }
+
+ return model
+
diff --git a/examples/sklearn_logclassification_globalrw/README.md b/examples/sklearn_logclassification_globalrw/README.md
index 1dd36d6..5472540 100644
--- a/examples/sklearn_logclassification_globalrw/README.md
+++ b/examples/sklearn_logclassification_globalrw/README.md
@@ -74,4 +74,4 @@ The value of `epsilon` is, at default, set to 1, and can be modified via the dat
Type `START` and press enter to start accepting connections.
Type `REGISTER` and press enter to register the party with the aggregator.
-- Finally, start training by entering `TRAIN` in the aggregator terminal.
+- Finally, start training by entering `TRAIN` in the aggregator terminal.
\ No newline at end of file
diff --git a/examples/sklearn_logclassification_globalrw/generate_configs.py b/examples/sklearn_logclassification_globalrw/generate_configs.py
index 425730a..9c3921f 100644
--- a/examples/sklearn_logclassification_globalrw/generate_configs.py
+++ b/examples/sklearn_logclassification_globalrw/generate_configs.py
@@ -13,7 +13,7 @@ def get_fusion_config():
return fusion
-def get_local_training_config():
+def get_local_training_config(configs_folder=None):
local_training_handler = {
'name': 'ReweighLocalTrainingHandler',
'path': 'ibmfl.party.training.reweigh_local_training_handler'
@@ -39,7 +39,7 @@ def get_hyperparams(model):
def get_data_handler_config(party_id, dataset, folder_data, is_agg=False, model='sklearn'):
- SUPPORTED_DATASETS = ['adult', 'compas', 'german']
+ SUPPORTED_DATASETS = ['adult', 'compas', 'german', 'custom_dataset']
if dataset in SUPPORTED_DATASETS:
if dataset == 'adult':
diff --git a/examples/sklearn_logclassification_rw/README.md b/examples/sklearn_logclassification_rw/README.md
index 9db34d4..f6bcc9f 100644
--- a/examples/sklearn_logclassification_rw/README.md
+++ b/examples/sklearn_logclassification_rw/README.md
@@ -8,7 +8,8 @@ Currently, for Logistic Regression Reweighing we support following datasets:
# Running Scikitlearn Logistic Classifier in FL with Local Reweighing
-This example explains how to run federated learning on a Logistic Classifier, implemented with Scikit-Learn with Local Reweighing. Local Reweighing method is presented in [Mitigating Bias in Federated Learning](https://arxiv.org/abs/2012.02447).
+This example explains how to run federated learning on a Logistic Classifier implemented with Scikit-Learn, using Local Reweighing. Local Reweighing is an implementation of the [Reweighing](https://link.springer.com/article/10.1007/s10115-011-0463-8) method. We use this algorithm in our paper on bias mitigation and federated learning; see it [here](https://arxiv.org/abs/2012.02447).
+
The following preprocessing was performed in `AdultSklearnDataHandler` on the original dataset:
* Drop following features: `workclass`, `fnlwgt`, `education`, `marital-status`, `occupation`, `relationship`, `capital-gain`, `capital-loss`, `hours-per-week`, `native-country`
diff --git a/examples/sklearn_logclassification_rw/generate_configs.py b/examples/sklearn_logclassification_rw/generate_configs.py
index 77e7f8e..3e8710a 100644
--- a/examples/sklearn_logclassification_rw/generate_configs.py
+++ b/examples/sklearn_logclassification_rw/generate_configs.py
@@ -13,7 +13,7 @@ def get_fusion_config():
return fusion
-def get_local_training_config():
+def get_local_training_config(configs_folder=None):
local_training_handler = {
'name': 'ReweighLocalTrainingHandler',
'path': 'ibmfl.party.training.reweigh_local_training_handler'
@@ -39,7 +39,7 @@ def get_hyperparams(model):
def get_data_handler_config(party_id, dataset, folder_data, is_agg=False, model='sklearn'):
- SUPPORTED_DATASETS = ['adult', 'compas', 'german']
+ SUPPORTED_DATASETS = ['adult', 'compas', 'german', 'custom_dataset']
if dataset in SUPPORTED_DATASETS:
if dataset == 'adult':
diff --git a/examples/spahm/generate_configs.py b/examples/spahm/generate_configs.py
index a7de7f1..c1ec783 100644
--- a/examples/spahm/generate_configs.py
+++ b/examples/spahm/generate_configs.py
@@ -14,7 +14,7 @@ def get_fusion_config():
return fusion
-def get_local_training_config():
+def get_local_training_config(configs_folder=None):
local_training_handler = {
'name': 'LocalTrainingHandler',
'path': 'ibmfl.party.training.local_training_handler'
@@ -42,7 +42,7 @@ def get_hyperparams(model='sklearn'):
def get_data_handler_config(party_id, dataset, folder_data, is_agg=False, model='sklearn'):
- SUPPORTED_DATASETS = ['federated-clustering']
+ SUPPORTED_DATASETS = ['federated-clustering', 'custom_dataset']
if dataset in SUPPORTED_DATASETS:
data = datahandlers.get_datahandler_config(
dataset, folder_data, party_id, is_agg)
diff --git a/examples/zeno/generate_configs.py b/examples/zeno/generate_configs.py
index 2a88fc1..2a82308 100644
--- a/examples/zeno/generate_configs.py
+++ b/examples/zeno/generate_configs.py
@@ -13,7 +13,7 @@ def get_fusion_config():
return fusion
-def get_local_training_config():
+def get_local_training_config(configs_folder=None):
local_training_handler = {
'name': 'GradientLocalTrainingHandler',
'path': 'ibmfl.party.training.gradient_local_training_handler'
@@ -42,7 +42,7 @@ def get_hyperparams(model):
def get_data_handler_config(party_id, dataset, folder_data, is_agg=False, model='keras'):
- SUPPORTED_DATASETS = ['mnist']
+ SUPPORTED_DATASETS = ['mnist', 'custom_dataset']
if dataset in SUPPORTED_DATASETS:
if dataset == 'mnist' and model == 'tf':
dataset = 'mnist_tf'
diff --git a/examples/zeno/model_keras.py b/examples/zeno/model_keras.py
index 9377b37..3422b2b 100644
--- a/examples/zeno/model_keras.py
+++ b/examples/zeno/model_keras.py
@@ -10,16 +10,13 @@ def get_hyperparams():
local_params = {
'training': {
'epochs': 3
- },
- 'optimizer': {
- 'lr': 0.01
}
}
return local_params
-def get_model_config(folder_configs, dataset, is_agg=False, party_id=0):
+def get_model_config(folder_configs, dataset, is_agg=False, party_id=0):
num_classes = 10
img_rows, img_cols = 28, 28
if K.image_data_format() == 'channels_first':
diff --git a/examples/zeno/model_pytorch.py b/examples/zeno/model_pytorch.py
index c6c53fe..438f2d4 100644
--- a/examples/zeno/model_pytorch.py
+++ b/examples/zeno/model_pytorch.py
@@ -16,7 +16,6 @@ def get_hyperparams():
def get_model_config(folder_configs, dataset, is_agg=False, party_id=0):
-
model = nn.Sequential(nn.Conv2d(1, 32, 3, 1),
nn.ReLU(),
nn.Conv2d(32, 64, 3, 1),
diff --git a/experiment_manager/Experiment_Manager_dashboard.ipynb b/experiment_manager/Experiment_Manager_dashboard.ipynb
new file mode 100644
index 0000000..c0083c4
--- /dev/null
+++ b/experiment_manager/Experiment_Manager_dashboard.ipynb
@@ -0,0 +1,537 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Dashboard\n",
+ "\n",
+    "In this Notebook, we interact with the Experiment Manager to configure, set up and run experiments."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### Import dependencies, initialise configs"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "code_folding": [
+ 0
+ ]
+ },
+ "outputs": [],
+ "source": [
+ "## Imports and such\n",
+ "import sys\n",
+ "sys.path.append('../')\n",
+ "%config Completer.use_jedi = False # to avoid autocomplete errors in Jupyter server\n",
+ "from ipywidgets import GridspecLayout, GridBox, Layout, Output\n",
+ "import dashboard_ui\n",
+ "\n",
+ "dashboard_ui = dashboard_ui.DashboardUI()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### Choose Model, Dataset and Fusion Algorithm"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "#### Provide Data Handler\n",
+ "- Only if you wish to use a Custom Dataset\n",
+ "- Choose Yes in the `Custom Dataset?` option below\n",
+ "\n",
+ "Populate and then run the cell below to save the provided Data Handler class to file."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "code_folding": [
+ 0
+ ]
+ },
+ "outputs": [],
+ "source": [
+ "%%writefile custom_data_handler.py\n",
+ "### YOUR DATAHANDLER code goes below\n",
+ "\n",
+ "import logging\n",
+ "\n",
+ "import numpy as np\n",
+ "\n",
+ "from ibmfl.data.data_handler import DataHandler\n",
+ "from ibmfl.util.datasets import load_mnist\n",
+ "\n",
+ "logger = logging.getLogger(__name__)\n",
+ "\n",
+ "\n",
+ "class MnistKerasDataHandler(DataHandler):\n",
+ " \"\"\"\n",
+ " Data handler for MNIST dataset.\n",
+ " \"\"\"\n",
+ "\n",
+ " def __init__(self, data_config=None, channels_first=False):\n",
+ " super().__init__()\n",
+ "\n",
+ " self.file_name = None\n",
+ " if data_config is not None:\n",
+    "            # Ensure your data files are either npz or txt\n",
+ " if 'npz_file' in data_config:\n",
+ " self.file_name = data_config['npz_file']\n",
+ " elif 'txt_file' in data_config:\n",
+ " self.file_name = data_config['txt_file']\n",
+ " self.channels_first = channels_first\n",
+ "\n",
+ " # load the datasets\n",
+ " (self.x_train, self.y_train), (self.x_test, self.y_test) = self.load_dataset()\n",
+ "\n",
+ " # pre-process the datasets\n",
+ " self.preprocess()\n",
+ "\n",
+ " def get_data(self):\n",
+ " \"\"\"\n",
+ " Gets pre-process mnist training and testing data.\n",
+ "\n",
+ " :return: the training and testing data.\n",
+ " :rtype: `tuple`\n",
+ " \"\"\"\n",
+ " return (self.x_train, self.y_train), (self.x_test, self.y_test)\n",
+ "\n",
+ " def load_dataset(self, nb_points=500):\n",
+ " \"\"\"\n",
+ " Loads the training and testing datasets from a given local path. \\\n",
+ " If no local path is provided, it will download the original MNIST \\\n",
+ " dataset online, and reduce the dataset size to contain \\\n",
+ " 500 data points per training and testing dataset. \\\n",
+ " Because this method \\\n",
+ " is for testing it takes as input the number of datapoints, nb_points, \\\n",
+ " to be included in the training and testing set.\n",
+ "\n",
+ " :param nb_points: Number of data points to be included in each set if\n",
+ " no local dataset is provided.\n",
+ " :type nb_points: `int`\n",
+ " :return: training and testing datasets\n",
+ " :rtype: `tuple`\n",
+ " \"\"\"\n",
+ " if self.file_name is None:\n",
+ " (x_train, y_train), (x_test, y_test) = load_mnist()\n",
+ " # Reduce datapoints to make test faster\n",
+ " x_train = x_train[:nb_points]\n",
+ " y_train = y_train[:nb_points]\n",
+ " x_test = x_test[:nb_points]\n",
+ " y_test = y_test[:nb_points]\n",
+ " else:\n",
+ " try:\n",
+ " logger.info('Loaded training data from ' + str(self.file_name))\n",
+ " data_train = np.load(self.file_name)\n",
+ " x_train = data_train['x_train']\n",
+ " y_train = data_train['y_train']\n",
+ " x_test = data_train['x_test']\n",
+ " y_test = data_train['y_test']\n",
+ " except Exception:\n",
+ " raise IOError('Unable to load training data from path '\n",
+ " 'provided in config file: ' +\n",
+ " self.file_name)\n",
+ " return (x_train, y_train), (x_test, y_test)\n",
+ "\n",
+ " def preprocess(self):\n",
+ " \"\"\"\n",
+ " Preprocesses the training and testing dataset, \\\n",
+ " e.g., reshape the images according to self.channels_first; \\\n",
+ " convert the labels to binary class matrices.\n",
+ "\n",
+ " :return: None\n",
+ " \"\"\"\n",
+ " num_classes = 10\n",
+ " img_rows, img_cols = 28, 28\n",
+ "\n",
+ " if self.channels_first:\n",
+ " self.x_train = self.x_train.reshape(self.x_train.shape[0], 1, img_rows, img_cols)\n",
+ " self.x_test = self.x_test.reshape(self.x_test.shape[0], 1, img_rows, img_cols)\n",
+ " else:\n",
+ " self.x_train = self.x_train.reshape(self.x_train.shape[0], img_rows, img_cols, 1)\n",
+ " self.x_test = self.x_test.reshape(self.x_test.shape[0], img_rows, img_cols, 1)\n",
+ "\n",
+ " # convert class vectors to binary class matrices\n",
+ " self.y_train = np.eye(num_classes)[self.y_train]\n",
+ " self.y_test = np.eye(num_classes)[self.y_test]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "code_folding": [
+ 0
+ ],
+ "scrolled": false
+ },
+ "outputs": [],
+ "source": [
+ "## Model, Dataset and Fusion Algorithm\n",
+ "\n",
+ "components = dashboard_ui.generate_model_dataset_fusion_ui()\n",
+ "\n",
+ "# GridBox layout for UI\n",
+ "grid = GridspecLayout(2,2)\n",
+ "\n",
+ "grid[0,:] = GridBox(children=list(components[:-4]),\n",
+ " layout=Layout(\n",
+ " width='100%',\n",
+ " grid_template_rows='auto auto',\n",
+ " grid_template_columns='48% 48%',\n",
+ " grid_template_areas='''\n",
+ " \"model_header model_header\"\n",
+ " \"model_dr model_upload\"\n",
+ " \"dataset_header dataset_header\"\n",
+ " \"dataset dataset_spl\"\n",
+ " \"ppp ppp\"\n",
+ " '''\n",
+ "# ,border='0.5px solid black'\n",
+ " ))\n",
+ "\n",
+ "grid[1,:] = GridBox(children=list(components[-4:]),\n",
+ " layout=Layout(\n",
+ " height='150px',\n",
+ " width='100%',\n",
+ " grid_template_rows='auto auto',\n",
+ " grid_template_columns='48% 48%',\n",
+ " grid_gap = '0px 0px',\n",
+ " grid_template_areas='''\n",
+ " \"custom_data custom_data_html\"\n",
+ " \"fusion_dr metrics_choice\"\n",
+ " '''\n",
+ "# , border='0.5px solid black'\n",
+ " ))\n",
+ "# grid[2,:] = GridBox(children=list(components[-1:]),\n",
+ "# layout=Layout(\n",
+ "# height='55px',\n",
+ "# width='auto',\n",
+ "# grid_template_rows='100%',\n",
+ "# grid_template_columns='100%',\n",
+ "# grid_template_areas='''\n",
+ "# \"fusion_dr\"\n",
+ "# ''',\n",
+ "# border='0.5px solid black'\n",
+ "# ))\n",
+ "grid"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### Choose number of parties and hyperparameters\n",
+ "Ensure you click `Confirm Hyperparameters` when done!"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "code_folding": [
+ 0
+ ],
+ "scrolled": false
+ },
+ "outputs": [],
+ "source": [
+ "## Parties and Hyperparameters\n",
+ "\n",
+ "components = list(dashboard_ui.generate_parties_hyperparams_ui())\n",
+ "\n",
+ "# GridBox layout for UI\n",
+ "grid = GridspecLayout(2,3)\n",
+ "\n",
+ "grid[0,:] = GridBox(children=components[:-2],\n",
+ " layout = Layout(\n",
+ " width='100%',\n",
+ " grid_template_rows='auto auto',\n",
+ " grid_template_columns='48% 48%',\n",
+ " grid_template_areas='''\n",
+ " \"header_parties header_parties\"\n",
+ " \"parties parties\"\n",
+ " \"header_hyperparams header_hyperparams\"\n",
+ " ''')\n",
+ " )\n",
+ "# Nested grid to vary spacing across various widgets\n",
+ "sub_grid_hyperparams = GridspecLayout(2,3)\n",
+ "sub_grid_hyperparams[0,:] = components[-1]\n",
+ "sub_grid_hyperparams[1,1] = components[-2]\n",
+ "\n",
+ "grid[1, :] = sub_grid_hyperparams\n",
+ "\n",
+ "party_hyperparam_ui = Output()\n",
+ "\n",
+ "with party_hyperparam_ui:\n",
+ " display(grid)\n",
+ "party_hyperparam_ui"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "#### Provide Party specific data files\n",
+ "\n",
+ "- Only if you wish to use a Custom Dataset\n",
+    "- Choose Yes in the `Custom Dataset?` option in Step 1.2 above"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "code_folding": [
+ 0
+ ]
+ },
+ "outputs": [],
+ "source": [
+ "## Upload party data files for each party:\n",
+ "if 'custom_data' in dashboard_ui.mgr.nb_config:\n",
+ " upload_boxes = dashboard_ui.generate_custom_party_data_ui()\n",
+ " for each in upload_boxes:\n",
+ " display(each)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### Choose whether to run locally or on remote machines"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "code_folding": [
+ 0
+ ],
+ "scrolled": false
+ },
+ "outputs": [],
+ "source": [
+ "## Local or Remote run\n",
+ "\n",
+ "components = dashboard_ui.generate_local_remote_ui()\n",
+ "# grid for displaying networking fields -- IP addr, port, ssh user, paths\n",
+ "partyDetails_grid = GridspecLayout(1,3)\n",
+ "partyDetails_grid[0, :] = components[1] # networking_deets_box \n",
+ "\n",
+ "display(components[0])\n",
+ "partyDetails_grid"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### Generate and View Aggregator and Party Config"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "code_folding": [
+ 0
+ ]
+ },
+ "outputs": [],
+ "source": [
+ "## Generate Configs and Display them\n",
+ "\n",
+ "components = dashboard_ui.generate_display_configs_ui()\n",
+ "\n",
+ "# grid for displaying generated configurations\n",
+ "display_grid_1 = GridspecLayout(1,3)\n",
+ "display_grid_1[0, :] = components[1] # config_box\n",
+ "\n",
+ "display_grid_1"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### Run the Experiment and Visualise Metrics\n",
+ "If the configs above look alright, go ahead and run the cell below to run the experiment!"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "code_folding": [
+ 0
+ ],
+ "scrolled": false
+ },
+ "outputs": [],
+ "source": [
+ "## Run the experiment and see charts\n",
+ "\n",
+ "import ibmfl_cli_automator.run as ibmfl_runner\n",
+ "from ipywidgets import Button, VBox, Output\n",
+ "\n",
+ "exp_runner = ibmfl_runner.Runner()\n",
+ "\n",
+ "monitoring_box = VBox()\n",
+ "\n",
+ "no_plots_for_these = ['Federated Averaging', 'Gradient Averaging', 'Probabilistic Federated Neural Matching', 'Zeno', 'Shuffled Iterative Avg']\n",
+ "\n",
+ "plot_button = Button(\n",
+ " description='Show Charts',\n",
+ " disabled=False,\n",
+ " button_style='warning', # 'success', 'info', 'warning', 'danger' or ''\n",
+ " tooltip='Displays the various plots for the experiment that ran',\n",
+ " layout = Layout(width='120px', height='40px', margin='5px 50px 5px 400px') ## margin to position button centrally\n",
+ " )\n",
+ "\n",
+ "\n",
+ "def invoke_runner():\n",
+ " monitoring_out = Output(layout={'border': '0.5px solid black'})\n",
+ " monitoring_box.children = [monitoring_out]\n",
+ " display(display_grid_2)\n",
+ "\n",
+ " # some values needed by the Runner; there's only one trial for now\n",
+ " dashboard_ui.mgr.run_details['experiments'][0]['shuffle_party_machines'] = False\n",
+ " dashboard_ui.mgr.run_details['experiments'][0]['n_trials'] = 1\n",
+ " dashboard_ui.mgr.run_details['experiments'][0]['n_parties'] = dashboard_ui.mgr.nb_config['global']['num_parties']\n",
+ " dashboard_ui.mgr.run_details['experiments'][0]['n_rounds'] = dashboard_ui.mgr.nb_config['global']['rounds']\n",
+ "\n",
+ " # values for postprocessing and showing default metrics\n",
+ " if dashboard_ui.mgr.nb_config['record_metrics']:\n",
+ " dashboard_ui.mgr.run_details['experiments'][0]['postproc_fn'] = {}\n",
+ " dashboard_ui.mgr.run_details['experiments'][0]['postproc_fn'] = 'gen_reward_vs_time_plots'\n",
+ " dashboard_ui.mgr.run_details['experiments'][0]['postproc_x_key'] = 'post_train:ts'\n",
+ " dashboard_ui.mgr.run_details['experiments'][0]['postproc_y_keys'] = ['post_train:eval:loss', 'post_train:eval:acc']#, 'post_train:eval:precision weighted', 'post_train:eval:recall weighted']\n",
+ "\n",
+ " exp_machines = exp_runner.convert_machine_dict_from_nb_to_cli(dashboard_ui.mgr.run_details['machines'])\n",
+ "\n",
+ " for exp_info in dashboard_ui.mgr.run_details['experiments']:\n",
+ " with open('{}/config_agg.yml'.format(dashboard_ui.mgr.nb_config['local_conf_dir']), 'r') as config_agg_file:\n",
+ " config_agg = config_agg_file.read()\n",
+ " config_parties = []\n",
+ " for pi in range(exp_info['n_parties']):\n",
+ " with open('{}/config_party{}.yml'.format(dashboard_ui.mgr.nb_config['local_conf_dir'], pi), 'r') as config_party_file:\n",
+ " config_parties += [config_party_file.read()]\n",
+ " with monitoring_out:\n",
+ " display(exp_runner.run_experiment(exp_info, dashboard_ui.mgr.run_details['machines'],\n",
+ " config_agg, config_parties, ui_mode='nb', ts=dashboard_ui.mgr.nb_config['timestamp_str']) \\\n",
+ " or 'Finished!')\n",
+ "\n",
+ " if dashboard_ui.mgr.nb_config['record_metrics']:\n",
+ " if 'Keras' in dashboard_ui.mgr.nb_config['model'] and dashboard_ui.mgr.nb_config['fusion'] not in no_plots_for_these:\n",
+ " # only some Keras models have plots currently\n",
+ " monitoring_box.children = monitoring_box.children + (plot_button,)\n",
+ " else:\n",
+ " with monitoring_out:\n",
+ " display('Plots for chosen model/fusion algorithm are not supported yet') # metrics processing not in place\n",
+ " else:\n",
+ " with monitoring_out:\n",
+ " display('No metrics were recorded, so no plots to show')\n",
+ "\n",
+ "plots_box = VBox()\n",
+ "\n",
+ "def get_plots(b):\n",
+ " b.disabled = True\n",
+ " plots_out = Output(layout={'border': '0.5px solid black'})\n",
+ " plots_box.children = [plots_out]\n",
+ " display(display_grid_3)\n",
+ " # generate the plot(s)\n",
+ " with plots_out:\n",
+    "        display(exp_runner.call_postproc_fn())\n",
+ "\n",
+ "plot_button.on_click(get_plots)\n",
+ "\n",
+ "# grid for displaying progress of running experiment\n",
+ "display_grid_2 = GridspecLayout(1,1)\n",
+ "display_grid_2[0, :] = monitoring_box\n",
+ "\n",
+ "# grid for displaying charts from collected metrics\n",
+ "display_grid_3 = GridspecLayout(1,1)\n",
+ "display_grid_3[0, :] = plots_box\n",
+ "\n",
+ "invoke_runner()"
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "ibmfl-venv",
+ "language": "python",
+ "name": "ibmfl-venv"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.6.8"
+ },
+ "toc": {
+ "base_numbering": 1,
+ "nav_menu": {},
+ "number_sections": true,
+ "sideBar": true,
+ "skip_h1_title": false,
+ "title_cell": "Table of Contents",
+ "title_sidebar": "Contents",
+ "toc_cell": false,
+ "toc_position": {},
+ "toc_section_display": true,
+ "toc_window_display": false
+ },
+ "varInspector": {
+ "cols": {
+ "lenName": 16,
+ "lenType": 16,
+ "lenVar": 40
+ },
+ "kernels_config": {
+ "python": {
+ "delete_cmd_postfix": "",
+ "delete_cmd_prefix": "del ",
+ "library": "var_list.py",
+ "varRefreshCmd": "print(var_dic_list())"
+ },
+ "r": {
+ "delete_cmd_postfix": ") ",
+ "delete_cmd_prefix": "rm(",
+ "library": "var_list.r",
+ "varRefreshCmd": "cat(var_dic_list()) "
+ }
+ },
+ "types_to_exclude": [
+ "module",
+ "function",
+ "builtin_function_or_method",
+ "instance",
+ "_Feature"
+ ],
+ "window_display": false
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 4
+}
diff --git a/runner/exp_manager/README.md b/experiment_manager/README.md
similarity index 50%
rename from runner/exp_manager/README.md
rename to experiment_manager/README.md
index ac92b90..cee5524 100644
--- a/runner/exp_manager/README.md
+++ b/experiment_manager/README.md
@@ -1,19 +1,33 @@
## Experiment Manager Dashboard
-Jupyter Notebook frontend for orchestrating Federated Learning experiments
-
+Developers and Federated Learning researchers often need to experiment with combinations of models, datasets and fusion algorithms. While a command line interface provides access to low-level granularities, the Experiment Manager Dashboard facilitates setting up an FL experiment, orchestrating it on the local machine or on remote VMs, and finally collating results from the experiment, all through a single Jupyter Notebook interface.
+
+### Features:
+- Set up FL experiments using Keras, PyTorch, Scikit-learn or TensorFlow models
+- Aggregate during FL training using fusion algorithms such as _Iterative Averaging_, _Coordinate Median_, _Krum_, _Zeno_ and several others, including the _Fed+ family_ of algorithms
+- Use pre-populated or custom datasets for parties to train on
+- Orchestrate experiments on local machine or on remote VM(s)
+- Visualise metrics after training completes, or collect logs for your own postprocessing
### Setup
+
+**Note:**
+There are limitations to using `conda` when spawning the many FL processes involved in orchestrating an experiment. This is also noted in [Issue #7980](https://github.com/conda/conda/issues/7980).
+
+A virtual environment set up through `venv`, a [module](https://docs.python.org/3/library/venv.html) from the standard library, has no such issues. Therefore, until there is a native cross-platform fix for this, the dashboard only supports `venv`.
+
+---
+
The UI components in the Notebook use [ipywidgets](https://ipywidgets.readthedocs.io/en/stable/user_install.html) which can be installed using:
`pip install ipywidgets`
-or
-`conda install -c conda-forge ipywidgets`
In order to hide away the source code, as you interact with the dashboard, install [Jupyter notebook extensions](https://github.com/ipython-contrib/jupyter_contrib_nbextensions#jupyterlab) as
`pip install jupyter_contrib_nbextensions`
-or
-`conda install -c conda-forge jupyter_contrib_nbextensions`
-Once installed, follow the instructiions at their Github page to configure it to work on your end.
+Once installed, follow the instructions at their GitHub page to configure it to work on your end.
+
+The dashboard functionality is split between two classes:
+- `DashboardUI`: includes all Notebook widgets, their handlers and some related logic
+- `ConfigManager`: includes all configuration manipulation logic, as well as various objects that are populated to pass onto the runner module for running experiments
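+
+A minimal sketch of how the notebook wires these together (method and attribute names as used in the dashboard notebook above):
+
+```
+import dashboard_ui
+
+ui = dashboard_ui.DashboardUI()                      # builds the dashboard widgets
+components = ui.generate_model_dataset_fusion_ui()   # widgets for model/dataset/fusion selection
+print(ui.mgr.nb_config)                              # the ConfigManager's record of the choices made so far
+```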
### Run the notebook
Finally, running `jupyter notebook Experiment_Manager_Dashboard.ipynb` from within the IBMFL virtual environment should open up the Notebook dashboard in your web browser. Once the notebook is up and running, you may follow the steps as shown [here](usage_guide.md).
@@ -44,4 +58,4 @@ Finally, running `jupyter notebook Experiment_Manager_Dashboard.ipynb` from with
Note that while this is the general set of steps, some OSes may require additional settings to be tweaked.
-For MacOS, you may have to addtionally go to _System Preferences_ > _Sharing_ > _Remote Login_ and add your user to the list of users allowed remote access into the machine (in case you get a permission error)
+For macOS, you may have to additionally go to _System Preferences_ > _Sharing_ > _Remote Login_ and add your user to the list of users allowed remote access into the machine (in case you get a permission error)
diff --git a/experiment_manager/config_manager.py b/experiment_manager/config_manager.py
new file mode 100644
index 0000000..156b12f
--- /dev/null
+++ b/experiment_manager/config_manager.py
@@ -0,0 +1,391 @@
+"""
+Licensed Materials - Property of IBM
+Restricted Materials of IBM
+20190891
+© Copyright IBM Corp. 2021 All Rights Reserved.
+"""
+import sys
+sys.path.append('../')
+import experiment_manager.ibmfl_cli_automator.run as ibmfl_runner
+import json
+import os
+import subprocess
+import yaml
+import pandas as pd
+
+
+class ConfigManager:
+ """
+    The ConfigManager class contains all of the dashboard's non-UI logic and populates the objects
+    needed for invoking the runner module.
+ """
+
+ def __init__(self):
+ self.file_for_supported_combinations = 'supported_models.csv'
+ self.file_for_hyperparams = 'hyperparams_to_models_map.json'
+
+ # dict to store choices made via Notebook UI
+ self.nb_config = {'split': {}}
+ # set defaults
+ self.nb_config['split']['ppp'] = 100
+ self.nb_config['split']['method'] = 'Uniform Random Sampling'
+ self.nb_config['parties'] = 5
+ self.nb_config['quorum'] = 1
+ self.nb_config['record_metrics'] = False
+
+ # Store all supported datasets, models and algorithms in a pandas dataframe
+ self.df = pd.read_csv(filepath_or_buffer=self.file_for_supported_combinations, header=0,
+ names=['fusion_identifier', 'fusion_algo', 'dataset', 'model_spec_name', 'fl_model',
+ 'model_ui'], skipinitialspace=True)
+ self.df_hyperparams = pd.read_json(path_or_buf=self.file_for_hyperparams)
+
+ self.uimodel_modelid_dict = {
+ 'Keras': 'keras',
+ 'PyTorch': 'pytorch',
+ 'TensorFlow': 'tf',
+ 'Scikit-learn': 'sklearn',
+ 'None': 'None'
+ }
+
+ # dict to store details such as machines to run on, paths etc
+ self.run_details = {}
+
+ self.exp_runner = ibmfl_runner.Runner()
+
+ def generate_update_configs(self):
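+        """
+        Create the trial staging directory, generate (or stage) the party datasets,
+        invoke examples/generate_configs.py, and then patch the generated aggregator
+        and party YAML configs with the values captured from the dashboard UI.
+
+        :return: (path to aggregator config, glob-style path to party configs),
+                 or (None, None) if any step fails
+        """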
+ # Get timestamp and add it to the given local staging directory:
+ self.nb_config['timestamp_str'] = self.exp_runner.generate_timestamp()
+ trial_dir = self.run_details['experiments'][0]['local_staging_dir'] + '/' + self.nb_config['timestamp_str']
+
+ # Create the staging_directory:
+ mkdir_cmd = 'mkdir -p ' + trial_dir
+ process = subprocess.run(mkdir_cmd, shell=True,
+ stdout=subprocess.PIPE,
+ stderr=subprocess.PIPE)
+ if process.returncode != 0:
+ print('Erred: ', process.stderr)
+ return None, None
+
+ if 'custom_data' in self.nb_config:
+ self.move_uploaded_files_to_trial_dir(trial_dir)
+
+ if 'custom_data' not in self.nb_config:
+ # Generate Data
+ print('Generating Data...')
+
+ cmd_to_run = 'cd ../; python3 examples/generate_data.py --num_parties ' + str(
+ self.nb_config['parties']) + ' -d ' + self.nb_config['dataset'] + ' -pp ' + str(
+ self.nb_config['split']['ppp']) + ' -p ' + trial_dir # there's only one trial for now
+ if 'Stratified' in self.nb_config['split']['method']:
+ cmd_to_run = cmd_to_run + ' --stratify'
+
+ # print('Executing {}'.format(cmd_to_run))
+ process = subprocess.run(cmd_to_run, shell=True,
+ stdout=subprocess.PIPE,
+ stderr=subprocess.PIPE)
+ if process.returncode != 0:
+ print('Erred: ', process.stderr)
+ return None, None
+
+ # path to get datasets from
+ data_path = str(process.stdout).split('Data saved in')[-1].strip().replace('\\n\'', '')
+ print('Data files saved to: {}'.format(data_path))
+
+ # Generate Configs:
+ print('Generating Configs...')
+ if 'crypto' in self.nb_config['fusion_identifier']:
+ # if it has either of crypto keras or crypto_multiclass_keras, we need -crypto flags:
+ # Todo: Need to let user pick one of {Paillier, ThresholdPaillier}
+ cmd_to_run = 'cd ../; python3 examples/generate_configs.py' + \
+ ' --num_parties ' + str(self.nb_config['parties']) + \
+ ' -f ' + self.nb_config['fusion_identifier'] + \
+ ' -m ' + self.uimodel_modelid_dict[self.nb_config['model']] + \
+ ' -crypto Paillier' + \
+ ' --config_path ' + trial_dir # there's only one trial for now
+ else:
+ cmd_to_run = 'cd ../; python3 examples/generate_configs.py' + \
+ ' --num_parties ' + str(self.nb_config['parties']) + \
+ ' -f ' + self.nb_config['fusion_identifier'] + \
+ ' -m ' + self.uimodel_modelid_dict[self.nb_config['model']] + \
+ ' --config_path ' + trial_dir # there's only one trial for now
+
+ # add -d and -p flags accordingly
+ if 'custom_data' in self.nb_config:
+            cmd_to_run = cmd_to_run + ' -d custom_dataset -p "" '  # we replace the path below anyway
+ else:
+ cmd_to_run = cmd_to_run + ' -d ' + self.nb_config['dataset'] + ' -p ' + data_path
+
+ # print('Executing {}'.format(cmd_to_run))
+ process = subprocess.run(cmd_to_run, shell=True,
+ stdout=subprocess.PIPE,
+ stderr=subprocess.PIPE,
+ universal_newlines=True)
+ if process.returncode == 0:
+ # save agg and party configs path
+ configs_path = os.path.dirname(process.stdout.split('\n')[0].split(':')[1].strip())
+ path_to_save_agg_configs = configs_path + '/config_agg.yml'
+ print('Aggregator configs saved to: {}'.format(path_to_save_agg_configs))
+ path_to_save_party_configs = configs_path + '/config_party*.yml'
+ print('Party configs saved to: {}'.format(path_to_save_party_configs))
+ else:
+ print('Erred: ', process.stderr)
+ return None, None
+
+ # modify hyperparameter text to fix quotes
+ hyp_text = self.nb_config['global']
+ # Python uses True/False, while JSON does true/false
+        hyp_text = json.loads(hyp_text.replace('\'', '"').replace('True', 'true').replace('False', 'false'))
+ if 'plus' in self.nb_config['fusion_identifier']:
+ rho = hyp_text['rho']
+ self.nb_config['global'] = hyp_text
+
+ if 'local' in self.nb_config.keys():
+ hyp_text = self.nb_config['local']
+ # Python uses True/False, while JSON does true/false
+            hyp_text = json.loads(hyp_text.replace('\'', '"').replace('True', 'true').replace('False', 'false'))
+
+ if 'plus' in self.nb_config['fusion_identifier']:
+ alpha = hyp_text['training'].pop('alpha')
+ self.nb_config['local'] = hyp_text
+
+ # add num_parties as a key under global, to match the structure in the agg yaml configs
+ val = self.nb_config.pop('parties')
+ self.nb_config['global']['num_parties'] = val
+ val = self.nb_config.pop('quorum')
+ self.nb_config['global']['perc_quorum'] = val
+
+ # Load Aggregator Config
+ with open(path_to_save_agg_configs, 'r') as stream:
+ try:
+ agg_config = yaml.safe_load(stream)
+ except yaml.YAMLError as e:
+ print(e)
+ return None, None
+
+ # for local runs, update the dirs to all the "machines" (they're all local)
+ if self.run_details['isLocalRun']:
+ self.run_details['machines']['ibmfl_dir'] = self.run_details['experiments'][0]['local_ibmfl_dir']
+ self.run_details['machines']['staging_dir'] = self.run_details['experiments'][0]['local_staging_dir']
+
+ # Modify aggregator config with values captured from the UI:
+ # - update the hyperparameters object with newer global and local objects as updated above
+ # - update ip and port from the run_details object
+ # - update data handler path to reflect custom datahandler code, if chosen/provided
+ agg_config['hyperparams']['global'] = self.nb_config['global']
+ if 'local' in self.nb_config.keys():
+ agg_config['hyperparams']['local'] = self.nb_config['local']
+ agg_machine = self.run_details['experiments'][0]['agg_machine'] # there's only one trial for now
+
+ if not self.run_details['isLocalRun']:
+ agg_config['connection']['info']['ip'] = self.run_details['machines'][agg_machine]['ip_address']
+ agg_config['connection']['info']['port'] = int(self.run_details['machines'][agg_machine]['port_number'])
+ # Todo: support custom dataset for remote runs
+ else:
+ self.run_details['machines'][agg_machine]['ip_address'] = agg_config['connection']['info']['ip']
+ self.run_details['machines'][agg_machine]['port_number'] = agg_config['connection']['info']['port']
+ self.run_details['machines'][agg_machine]['ssh_username'] = os.getenv('USER')
+
+ if 'custom_model' in self.nb_config and 'model' in agg_config:
+ dst = self.move_model_file_to_trial_dir(agg_config)
+ agg_config['model']['spec']['model_definition'] = dst
+
+ # Write this updated yaml to file
+ with open(path_to_save_agg_configs, 'w') as out:
+ yaml.safe_dump(agg_config, out, default_flow_style=False)
+ print('Updated Aggregator config at {}'.format(path_to_save_agg_configs))
+
+ # Modify party config with values accepted from the UI
+ # - update IP address, port for agg and party as received from the UI (only remote runs)
+ # - add metrics section (both remote and local run) -- if needed
+ # - add alpha, if model chosen is Fed+
+ # - update data handler path to reflect custom datahandler code, if chosen/provided
+ if not self.run_details['isLocalRun']:
+ currParty = 0
+ for eachMachine in self.run_details['experiments'][0]['party_machines']: # there's only one trial for now
+ # Load
+ with open(path_to_save_party_configs.replace('*', str(currParty))) as stream:
+ try:
+ party_config = yaml.safe_load(stream)
+ except yaml.YAMLError as e:
+ print(e)
+ return None, None
+
+ agg_machine = self.run_details['experiments'][0]['agg_machine'] # there's only one trial for now
+ # Modify
+ party_config['aggregator']['ip'] = self.run_details['machines'][agg_machine]['ip_address']
+ party_config['aggregator']['port'] = self.run_details['machines'][agg_machine]['port_number']
+
+ party_config['connection']['info']['ip'] = self.run_details['machines'][eachMachine]['ip_address']
+                party_config['connection']['info']['port'] = int(
+                    self.run_details['machines'][eachMachine]['port_number'])
+ # Todo: DRY!
+ if self.nb_config['record_metrics']:
+ # Metrics section to add to each party config
+ party_config['metrics_recorder'] = {}
+ party_config['metrics_recorder']['name'] = 'MetricsRecorder'
+ party_config['metrics_recorder']['path'] = 'ibmfl.party.metrics.metrics_recorder'
+ party_config['metrics_recorder']['output_file'] = '${config_dir}/metrics_party${id}'.replace(
+ '${config_dir}', self.run_details['machines'][eachMachine]['staging_dir']).replace('${id}',
+ str(currParty))
+ party_config['metrics_recorder']['output_type'] = 'json'
+ party_config['metrics_recorder']['compute_pre_train_eval'] = False
+ party_config['metrics_recorder']['compute_post_train_eval'] = True
+
+ if self.nb_config['fusion_identifier'] == 'fedavgplus': # Todo: CoMed+ and GeoMed+?
+ party_config['local_training']['info']['alpha'] = alpha
+ party_config['local_training']['info']['rho'] = rho
+
+ if 'custom_data' in self.nb_config.keys():
+ party_config['data']['name'] = self.nb_config['custom_data']['name']
+ party_config['data']['path'] = self.nb_config['custom_data']['dh_path']
+ file_ext = self.nb_config['custom_data']['data_path']['party' + str(currParty)].split('.')[-1]
+ if file_ext == 'npz':
+ party_config['data']['info']['npz_file'] = \
+ self.nb_config['custom_data']['data_path']['party' + str(currParty)]
+ else:
+ party_config['data']['info']['txt_file'] = \
+ self.nb_config['custom_data']['data_path']['party' + str(currParty)]
+ if 'custom_model' in self.nb_config:
+ # assuming all generated party configs have a model section
+ dst = self.move_model_file_to_trial_dir(party_config)
+ party_config['model']['spec']['model_definition'] = dst
+
+ # Finally, write updated party config to file
+ with open(path_to_save_party_configs.replace('*', str(currParty)), 'w') as out:
+ yaml.safe_dump(party_config, out, default_flow_style=False)
+ currParty += 1
+ # Todo: support custom dataset for remote runs
+ else:
+ currParty = 0
+ for eachMachine in self.run_details['experiments'][0]['party_machines']: # there's only one trial for now
+ # Load
+ with open(path_to_save_party_configs.replace('*', str(currParty))) as stream:
+ try:
+ party_config = yaml.safe_load(stream)
+ except yaml.YAMLError as e:
+ print(e)
+ return None, None
+
+ # save IP addr and port number from the party config, into `run_details` dict, for runner's use
+ self.run_details['machines'][eachMachine]['ip_address'] = party_config['connection']['info']['ip']
+ self.run_details['machines'][eachMachine]['port_number'] = party_config['connection']['info']['port']
+ self.run_details['machines'][eachMachine]['ssh_username'] = os.getenv('USER')
+
+ if self.nb_config['record_metrics']:
+ # Metrics section to add to each party config
+ party_config['metrics_recorder'] = {}
+ party_config['metrics_recorder']['name'] = 'MetricsRecorder'
+ party_config['metrics_recorder']['path'] = 'ibmfl.party.metrics.metrics_recorder'
+ party_config['metrics_recorder']['output_file'] = '${config_dir}/metrics_party${id}'.replace(
+ '${config_dir}', trial_dir).replace('${id}', str(currParty))
+ party_config['metrics_recorder']['output_type'] = 'json'
+ party_config['metrics_recorder']['compute_pre_train_eval'] = False
+ party_config['metrics_recorder']['compute_post_train_eval'] = True
+
+ if self.nb_config['fusion_identifier'] == 'fedplus':
+ party_config['local_training']['info']['alpha'] = alpha
+
+ if 'custom_data' in self.nb_config.keys():
+ party_config['data']['name'] = self.nb_config['custom_data']['name']
+ party_config['data']['path'] = self.nb_config['custom_data']['dh_path']
+ file_ext = self.nb_config['custom_data']['data_path']['party' + str(currParty)].split('.')[-1]
+ if file_ext == 'npz':
+ party_config['data']['info']['npz_file'] = \
+ self.nb_config['custom_data']['data_path']['party' + str(currParty)]
+ else:
+ party_config['data']['info']['txt_file'] = \
+ self.nb_config['custom_data']['data_path']['party' + str(currParty)]
+ if 'custom_model' in self.nb_config:
+ # assuming all generated party configs have a model section
+ dst = self.move_model_file_to_trial_dir(party_config)
+ party_config['model']['spec']['model_definition'] = dst
+
+ # Finally, write updated party config to file
+ with open(path_to_save_party_configs.replace('*', str(currParty)), 'w') as out:
+ yaml.safe_dump(party_config, out, default_flow_style=False)
+
+ currParty += 1
+
+ print('Updated Party configs at {}'.format(path_to_save_party_configs))
+
+ self.nb_config['local_conf_dir'] = str(os.path.dirname(path_to_save_agg_configs))
+
+ return path_to_save_agg_configs, path_to_save_party_configs
+
+ def move_uploaded_files_to_trial_dir(self, trial_directory):
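+        """
+        Move the user-supplied dataset files and data handler into
+        <trial_directory>/datasets and update their paths in nb_config.
+
+        :param trial_directory: staging directory created for the current trial
+        """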
+ # trial dir was created in caller, so skipping check
+
+ # move provided dataset files:
+ dst = os.path.join(trial_directory, 'datasets')
+ if not os.path.exists(dst):
+ os.makedirs(dst)
+ for key in self.nb_config['custom_data']['data_path']:
+ src = self.nb_config['custom_data']['data_path'][key]
+ os.rename(src, os.path.join(dst, src.split('/')[-1]))
+ print('Moved {} to {}'.format(src, dst))
+
+ # update path in nb_config dict:
+ self.nb_config['custom_data']['data_path'][key] = os.path.join(dst, src.split('/')[-1])
+ # move provided datahandler file, as it doesn't get moved by runner
+ src_dh = self.nb_config['custom_data']['dh_path']
+ dst_dh = os.path.join(dst, src_dh.split('/')[-1])
+ os.rename(src_dh, dst_dh)
+ print('Moved {} to {}'.format(src_dh, dst_dh))
+
+ # update path in nb_config dict:
+ self.nb_config['custom_data']['dh_path'] = dst_dh
+
+ def move_model_file_to_trial_dir(self, some_config):
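+        """
+        Copy the user-provided model (a single .h5/.pt/.pickle file, or a TF SavedModel
+        directory) over the model definition referenced in the given config.
+
+        :param some_config: aggregator or party config dict containing a model spec
+        :return: path to the model definition to record in the config
+        """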
+ existing_model_def = some_config['model']['spec']['model_definition']
+ if os.path.isdir(existing_model_def):
+ # as in the case of TF2
+ existing_model_file_path = existing_model_def
+ # move user provided model file here
+ src = self.nb_config['custom_model']
+ # print('user provided model_file_path:', src)
+ # look for assets/, variables/ and saved_model.pb files in this folder
+ assets_dir = os.path.join(src, 'assets')
+ variables_dir = os.path.join(src, 'variables')
+ model_file_path = os.path.join(src, 'saved_model.pb')
+ # copy each of them
+ from distutils.dir_util import copy_tree
+ copy_tree(assets_dir, os.path.join(existing_model_file_path, 'assets'))
+ copy_tree(variables_dir, os.path.join(existing_model_file_path, 'variables'))
+
+ from shutil import copyfile
+ copyfile(model_file_path, os.path.join(existing_model_file_path, 'saved_model.pb'))
+ print('Contents of {} written to {}'.format(src, existing_model_file_path))
+
+ # return path for the model file to update respective config (same as before in this case)
+ return existing_model_file_path
+
+ else:
+ # for .h5, .pt, .pickle files
+ existing_model_file_path = existing_model_def[:existing_model_def.rfind('/')]
+ # remove existing model file
+ if os.path.exists(existing_model_def):
+ os.remove(existing_model_def)
+ # move user provided model file here
+ dst = existing_model_file_path
+ # print('dst:', dst)
+ src = self.nb_config['custom_model']
+ # print('src:', src)
+ dst = os.path.join(dst, src.split('/')[-1])
+ from shutil import copyfile
+ copyfile(src, dst)
+ print('Moved {} to {}'.format(src, dst))
+
+ # return new path for the model file to update respective config
+ return dst
diff --git a/experiment_manager/dashboard_ui.py b/experiment_manager/dashboard_ui.py
new file mode 100644
index 0000000..9d7d4d1
--- /dev/null
+++ b/experiment_manager/dashboard_ui.py
@@ -0,0 +1,712 @@
+"""
+Licensed Materials - Property of IBM
+Restricted Materials of IBM
+20190891
+© Copyright IBM Corp. 2021 All Rights Reserved.
+"""
+import os
+import shutil
+import sys
+sys.path.append('../')
+import experiment_manager.ibmfl_cli_automator.run as ibmfl_runner
+import config_manager
+
+import json
+from json import JSONDecodeError
+import yaml
+import ast
+
+# widget imports
+from IPython.display import display
+from ipywidgets import Dropdown, Layout, IntSlider, Label, Box, VBox, HTML, HBox, Output, Textarea, Button, Text, \
+ RadioButtons
+
+
+class DashboardUI:
+ """
+ The DashboardUI class contains all widgets required in the dashboard, as well as their event handler methods.
+ """
+
+ def __init__(self):
+ self.mgr = config_manager.ConfigManager()
+ self.exp_runner = ibmfl_runner.Runner()
+
+ self.params_widgets = []
+ self.hyperparams_dict = {}
+
+ def generate_model_dataset_fusion_ui(self):
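+        """
+        Build the widgets for choosing the model, an optional custom model file,
+        the dataset (built-in or custom), the data split, the fusion algorithm and
+        whether to record metrics, wiring up their event handlers.
+
+        :return: tuple of widgets to be displayed by the notebook
+        """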
+ ui_model_choices = self.mgr.df.model_ui.unique()
+ model_header = HTML(
+ value='<{size}>Model details'.format(size='h4'),
+ layout=Layout(width='auto', grid_area='model_header'))
+
+ # Model Selection:
+ model_dropdown = Dropdown(
+ options=['Choose your model'] + list(ui_model_choices),
+ description='Model:',
+ disabled=False,
+ layout=Layout(width='60%', grid_area='model_dr')
+ )
+
+ def model_dropdown_eventhandler(change):
+ model_chosen = change.new
+ self.mgr.nb_config['model'] = model_chosen
+ # metrics are only supported for (some) Keras models
+ if model_chosen == 'Keras':
+ metrics_or_not.children[1].disabled = False
+ else:
+ metrics_or_not.children[1].value = 'No'
+ metrics_or_not.children[1].disabled = True
+ self.mgr.nb_config['record_metrics'] = False
+
+ model_dropdown.observe(model_dropdown_eventhandler, names='value')
+
+ dataset_header = HTML(value='<{size}>Dataset details'.format(size='h4'),
+ layout=Layout(width='auto', grid_area='dataset_header'))
+
+ # path for model file
+ custom_model_filepath = Text(value='', placeholder='Paste path to model file (optional)',
+ description='Model File:', grid_area='custom_model')
+
+ interim_dir = os.getcwd() # gets moved to staging_dir later
+ data_dir = os.path.join(interim_dir, 'uploadedFiles')
+ if os.path.exists(data_dir):
+ shutil.rmtree(path=data_dir)
+ os.makedirs(data_dir)
+
+ def upload_model_path_handler(change):
+ model_file_path = change.new
+ # print(model_file_path + ' received!')
+ if not os.path.exists(model_file_path):
+ print(model_file_path + ' does not exist!')
+ return
+
+ if os.path.isfile(model_file_path):
+ # .h5, .pt or .pickle files
+ filename = model_file_path.split('/')[-1]
+ # copy model file to data_dir:
+ from shutil import copyfile
+ # copy model file to this dir
+ copyfile(model_file_path, os.path.join(data_dir, filename))
+ print(filename + ' written to ' + data_dir + '/' + filename)
+ self.mgr.nb_config['custom_model'] = os.path.join(data_dir, filename)
+ elif os.path.isdir(model_file_path):
+ # TF SavedModel format uses a directory
+ # copy dir to data_dir:
+ assets_dir = os.path.join(model_file_path, 'assets')
+ variables_dir = os.path.join(model_file_path, 'variables')
+ model_file = os.path.join(model_file_path, 'saved_model.pb')
+ dirname = model_file_path.split('/')[-1]
+ if os.path.isdir(assets_dir) and os.path.isdir(variables_dir) and os.path.isfile(model_file):
+ # adheres to TF SavedModel format, so we copy
+ from distutils.dir_util import copy_tree
+ copy_tree(model_file_path, os.path.join(data_dir, dirname))
+ print(model_file_path + ' written to ' + data_dir + '/' + dirname)
+ self.mgr.nb_config['custom_model'] = os.path.join(data_dir, dirname)
+
+ custom_model_filepath.observe(upload_model_path_handler, names='value')
+
+ dataset_dropdown = Dropdown(
+ options=['Choose your dataset'], # + determine_allowed_datasets(),
+ description='Dataset:',
+ disabled=False,
+ layout=Layout(width='80%', grid_area='dataset')
+ )
+
+ def update_supported_datasets(change):
+ model_chosen = change.new
+ rows_for_model = self.mgr.df[self.mgr.df.model_ui == model_chosen]
+ dataset_dropdown.options = list(rows_for_model['dataset'].unique())
+
+ model_dropdown.observe(update_supported_datasets, 'value')
+
+ def dataset_dropdown_eventhandler(change):
+ dataset_chosen = change.new
+ self.mgr.nb_config['dataset'] = dataset_chosen
+
+ dataset_dropdown.observe(dataset_dropdown_eventhandler, names='value')
+
+ # Data Splitting Strategy:
+ splitting_dropdown = Box([
+ Label(
+ value='Data Split:',
+ layout=Layout(width='auto')
+ ),
+ Dropdown(
+ options=['Uniform Random Sampling', 'Stratified Sampling (per source class)'],
+ disabled=False,
+ layout=Layout(width='auto'),
+ value='Uniform Random Sampling'
+ )
+ ], grid_area='dataset_spl')
+
+ def splitting_dropdown_eventhandler(change):
+ split_chosen = change.new
+ self.mgr.nb_config['split']['method'] = split_chosen
+
+ splitting_dropdown.children[1].observe(splitting_dropdown_eventhandler, names='value')
+
+ # Points per party when splitting data:
+ points_slider = Box([
+ Label(
+ value='Points from each party:',
+ layout=Layout(width='auto')
+ ),
+ IntSlider(
+ min=100,
+ max=1000,
+ layout=Layout(width='50%'),
+ value=100
+ )
+ ], grid_area='ppp')
+
+ def points_slider_eventhandler(change):
+ # print(change)
+ ppp = change.new
+ self.mgr.nb_config['split']['ppp'] = ppp
+
+ points_slider.children[1].observe(points_slider_eventhandler, names='value')
+
+ # Add choice to bring custom dataset:
+ custom_data = Box([
+ HTML(value='<{size}>OR'.format(size='h4'),
+ layout=Layout(width='25%')),
+ HTML(value='<{size}>Custom Dataset?'.format(size='h4'),
+ layout=Layout(width='35%')),
+ RadioButtons(options=['Yes', 'No'],
+ value='No',
+ disabled=False,
+ layout=Layout(width='40%'))
+ ], layout=Layout(width='100%', height='100%'), grid_area='custom_data')
+
+ custom_data_html = HTML(
+ value='<{size} style="color:red;">Choosing Yes requires you to provide a custom data handler and party '
+ 'data files'.format(
+ size='h5'),
+ layout=Layout(width='auto', grid_area='custom_data_html'))
+
+ def custom_data_handler(change):
+ if 'custom_data' not in self.mgr.nb_config:
+ self.mgr.nb_config['custom_data'] = {}
+ custom_data.children[2].disabled = True
+ if change.new == 'Yes':
+ # disable built-in dataset UI widgets, purge their key/value from the config dict
+ dataset_dropdown.disabled = True
+ splitting_dropdown.children[1].disabled = True
+ points_slider.children[1].disabled = True
+ self.mgr.nb_config.pop('split', None)
+ self.mgr.nb_config.pop('dataset', None)
+ dh_path = os.path.join(os.getcwd(), 'custom_data_handler.py')
+ self.mgr.nb_config['custom_data']['dh_path'] = dh_path
+ # get class name from data handler file:
+ as_tree = ast.parse(open(dh_path).read())
+ classes = []
+ for i in as_tree.body:
+ if isinstance(i, ast.ClassDef):
+ classes.append(i.name)
+ if len(classes) == 1:
+ print('Found class {} in the data handler provided!'.format(classes[0]))
+ self.mgr.nb_config['custom_data']['name'] = classes[0]
+ else:
+ print('Found {} class(es) in the data handler provided, expected exactly 1. Aborting!'
+ .format(len(classes)))
+
+ # else: # no need as the widget is disabled after interaction
+ # dataset_dropdown.disabled = False
+ # splitting_dropdown.children[1].disabled = False
+ # points_slider.children[1].disabled = False
+ #
+ # # purge custom_data dict
+ # self.mgr.nb_config.pop('custom_data', None)
+
+ custom_data.children[2].observe(custom_data_handler, 'value')
+
+ fusion_dropdown = Box([
+ HTML(value='<{size}>Fusion Algorithm'.format(size='h4'),
+ layout=Layout(width='auto')),
+ Dropdown(
+ options=['Choose your Fusion Algorithm'], disabled=False,
+ layout=Layout(width='auto'))
+ ], grid_area='fusion_dr')
+
+ def update_potential_fusion_algorithm(change):
+ model_chosen = self.mgr.nb_config['model']
+ if 'custom_data' in self.mgr.nb_config:
+ potential_algo = list(self.mgr.df[(self.mgr.df.model_ui == model_chosen)]['fusion_algo'].unique())
+ else:
+ dataset_chosen = self.mgr.nb_config['dataset']
+ potential_algo = list(self.mgr.df[(self.mgr.df.model_ui == model_chosen) &
+ (self.mgr.df.dataset == dataset_chosen)]['fusion_algo'].unique())
+ fusion_dropdown.children[1].options = potential_algo
+
+ model_dropdown.observe(update_potential_fusion_algorithm, 'value')
+ dataset_dropdown.observe(update_potential_fusion_algorithm, 'value')
+
+ def fusion_dropdown_eventhandler(change):
+ fusion_algo_chosen = change.new
+ self.mgr.nb_config['fusion'] = fusion_algo_chosen
+
+ fusion_dropdown.children[1].observe(fusion_dropdown_eventhandler, names='value')
+
+ metrics_or_not = Box([
+ HTML(value='<{size}>Record Metrics?'.format(size='h4'),
+ layout=Layout(width='45%')),
+ RadioButtons(options=['Yes', 'No'],
+ value='No',
+ disabled=False,
+ layout=Layout(width='20%')),
+ HTML(value='<{size}>May not be supported for all models'.format(size='h5'),
+ layout=Layout(width='35%'))
+ ], layout=Layout(width='100%', height='100%'), grid_area='metrics_choice')
+
+ def metrics_choice_handler(change):
+ metrics_or_not.children[1].disabled = True
+ if change.new == 'Yes':
+ self.mgr.nb_config['record_metrics'] = True
+
+ metrics_or_not.children[1].observe(metrics_choice_handler, names='value')
+
+ return model_header, model_dropdown, custom_model_filepath, dataset_header, dataset_dropdown, \
+ splitting_dropdown, points_slider, custom_data, custom_data_html, fusion_dropdown, metrics_or_not
+
+ def generate_parties_hyperparams_ui(self):
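+        """
+        Build the widgets for the number of parties, the quorum size and the
+        hyperparameters of the chosen fusion algorithm, plus a button that saves
+        the entered values into nb_config.
+
+        :return: tuple of widgets to be displayed by the notebook
+        """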
+ header_parties = HTML(value='<{size}>Participants'.format(size='h4'),
+ layout=Layout(width='auto', grid_area='header_parties'))
+
+ num_parties = Box([
+ Label(
+ value='Number of parties:',
+ layout=Layout(width='auto')
+ ),
+ IntSlider(
+ min=2,
+ max=100,
+ value=5,
+ layout=Layout(width='50%')
+ )
+ ], grid_area='parties')
+
+ def num_parties_eventhandler(change):
+ # print(change)
+ parties = change.new
+ self.mgr.nb_config['parties'] = parties
+
+ num_parties.children[1].observe(num_parties_eventhandler, names='value')
+
+ parties_in_quorum = Box([
+ Label(
+ value='Number of parties in quorum',
+ layout=Layout(width='auto')
+ ),
+ IntSlider(
+ min=2,
+ max=5,
+ value=5,
+ layout=Layout(width='50%')
+ )
+ ], grid_area='parties')
+
+        # quorum can have at most all parties
+ def update_quorum_range(*args):
+ parties_in_quorum.children[1].max = num_parties.children[1].value
+ parties_in_quorum.children[1].value = num_parties.children[1].value
+
+ num_parties.children[1].observe(update_quorum_range, 'value')
+
+ def parties_in_quorum_eventhandler(change):
+ # print(change)
+ quorum = change.new
+ self.mgr.nb_config['quorum'] = round(quorum / float(self.mgr.nb_config['parties']), 2)
+
+ parties_in_quorum.children[1].observe(parties_in_quorum_eventhandler, names='value')
+
+ header_hyperparams = HTML(value='<{size}>Hyperparameters'.format(size='h4'),
+ layout=Layout(width='auto', grid_area='header_hyperparams'))
+
+ confirmation_box = Box()
+
+ hyperparams_text = Box()
+
+ self.determine_hyperparams()
+ self.params_widgets.clear()
+ self.generate_hyperparam_ui()
+ hyperparams_text.children = self.params_widgets
+
+ def confirmation_button_handler(b):
+ b.disabled = True
+ b.description = 'Hyperparams Saved'
+ num_parties.children[1].disabled = True
+ parties_in_quorum.children[1].disabled = True
+ for i in range(len(hyperparams_text.children)):
+ hyperparams_text.children[i].disabled = True
+
+ for widget in self.params_widgets:
+ self.mgr.nb_config[widget.description] = widget.value
+
+ confirm_butn = Button(
+ description='Confirm Hyperparameters',
+ disabled=False,
+ button_style='warning',
+ tooltip='Saves the hyperparameter changes',
+ layout=Layout(width='auto', height='40px'))
+
+ confirmation_box.children = (confirm_butn,)
+ [confirmation_box.children[i].on_click(confirmation_button_handler) for i in
+ range(len(confirmation_box.children))]
+
+ return header_parties, num_parties, parties_in_quorum, header_hyperparams, confirmation_box, hyperparams_text
+
+ def determine_hyperparams(self):
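+        """
+        Resolve the fusion identifier for the chosen model/dataset/fusion combination
+        and load its default hyperparameters from the hyperparams map into
+        self.hyperparams_dict.
+        """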
+ if 'custom_data' in self.mgr.nb_config:
+ exp_df = self.mgr.df[(self.mgr.df.model_ui == self.mgr.nb_config['model']) &
+ (self.mgr.df.fusion_algo == self.mgr.nb_config['fusion'])
+ ]
+ else:
+ exp_df = self.mgr.df[(self.mgr.df.model_ui == self.mgr.nb_config['model']) &
+ (self.mgr.df.dataset == self.mgr.nb_config['dataset']) &
+ (self.mgr.df.fusion_algo == self.mgr.nb_config['fusion'])
+ ]
+ if len(exp_df) != 1:
+ # pick the first matching fusion algorithm
+ # print('Found multiple matches, picking the first one')
+ firstMatch = exp_df.iloc[0]
+ # print(firstMatch)
+            self.mgr.nb_config['fusion_identifier'] = firstMatch['fusion_identifier']
+ else:
+ # print(exp_df)
+ self.mgr.nb_config['fusion_identifier'] = list(exp_df.fusion_identifier)[0]
+
+ # print('fusion_id:', self.mgr.nb_config['fusion_identifier'])
+ model_hyperparams_key = self.mgr.nb_config['fusion_identifier'] + '_' + self.mgr.uimodel_modelid_dict[
+ self.mgr.nb_config['model']] # to get hyperparams from df
+ self.hyperparams_dict = \
+ self.mgr.df_hyperparams[
+ self.mgr.df_hyperparams['model_identifier'] == model_hyperparams_key].hyperparams.values[0]
+
+ def generate_hyperparam_ui(self):
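+        """
+        Create one Textarea widget per (possibly nested) hyperparameter in
+        self.hyperparams_dict and append them to self.params_widgets.
+        """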
+ # every model has at most two keys: global and local:
+ # print(self.hyperparams_dict)
+ params_dict = self.hyperparams_dict
+
+ def inner_generate_hyperparam_ui(params_dict):
+ for key in params_dict:
+                if isinstance(params_dict[key], dict):
+ inner_generate_hyperparam_ui(params_dict[key])
+ else:
+ self.params_widgets.append(Textarea(description=key, value=str(params_dict[key]),
+ layout=Layout(width='400px', height='100px'),
+ grid_area='hyperparams'))
+ inner_generate_hyperparam_ui(params_dict)
+
+ def generate_local_remote_ui(self):
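+        """
+        Build the local-vs-remote dropdown and, based on the selection, the widgets
+        for machine, networking and path details; the entered values are stored in
+        self.mgr.run_details for the runner module.
+
+        :return: (dropdown box, box that gets populated with the detail widgets)
+        """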
+ local_or_remote = Box([
+ HTML(value='<{size}>Run this experiment locally or on remote machines?'.format(size='h4'),
+ layout=Layout(width='auto')),
+ Dropdown(
+ options=['Choose your option', 'Run Locally', 'Run on Remote Machines'],
+ description='',
+ disabled=False,
+ layout=Layout(width='200px')
+ )
+ ])
+
+ def network_details_tracker(change):
+ value = change.new
+ subkey = change.owner.description.split(':')[0].replace(' ', '_').lower()
+ machine_key = change.owner.placeholder.split(' ')[-1]
+ # update the run_details dict, depending on whether it already has some details:
+ if len(self.mgr.run_details['machines'][machine_key].keys()) == 0:
+ temp_dict = {}
+ temp_dict[subkey] = value
+ self.mgr.run_details['machines'][machine_key] = temp_dict
+ else:
+ temp_dict = self.mgr.run_details['machines'][machine_key]
+ temp_dict[subkey] = value
+ self.mgr.run_details['machines'][machine_key] = temp_dict
+
+ def get_IPaddr_port(party_index=None):
+ placeholder_suffix = ' for machine' + str(party_index)
+
+ ip_addr = Text(value='', placeholder='IP Address' + placeholder_suffix, description='IP Address:')
+ port_num = Text(value='', placeholder='Port Number' + placeholder_suffix, description='Port Number:')
+ ssh_user = Text(value='', placeholder='ssh username' + placeholder_suffix, description='SSH Username:')
+
+ machine_detail_vbox = VBox(children=[ip_addr, port_num, ssh_user])
+ [machine_detail_vbox.children[i].observe(network_details_tracker, 'value') for i in
+ range(len(machine_detail_vbox.children))]
+ return machine_detail_vbox
+
+ def path_details_tracker(change):
+ value = change.new
+ subkey = change.owner.description.split(':')[0].replace(' ', '_').lower()
+ if 'local' in change.owner.placeholder:
+ # this is a local path, put within `experiments` key
+ local_subkey = 'local_' + subkey
+ self.mgr.run_details['experiments'][0][local_subkey] = value # there's only one trial for now
+ else:
+ # this is a machine path
+ # update the run_details dict, depending on whether it already has some details:
+ machine_key = change.owner.placeholder.split(' ')[-1] # to figure which machine is this for
+ if len(self.mgr.run_details['machines'][machine_key].keys()) == 0:
+ temp_dict = {}
+ temp_dict[subkey] = value
+ self.mgr.run_details['machines'][machine_key] = temp_dict
+ else:
+ temp_dict = self.mgr.run_details['machines'][machine_key]
+ temp_dict[subkey] = value
+ self.mgr.run_details['machines'][machine_key] = temp_dict
+
+ def get_paths(party_index=None):
+ if party_index is None:
+ placeholder_suffix = ' for local machine'
+ else:
+ placeholder_suffix = ' for machine' + str(party_index)
+
+ config_path = Text(value='', placeholder='Staging Dir' + placeholder_suffix, description='Staging Dir:')
+ code_path = Text(value='', placeholder='IBMFL Dir' + placeholder_suffix, description='IBMFL Dir:')
+
+ machine_detail_vbox = VBox(children=[config_path, code_path])
+ [machine_detail_vbox.children[i].observe(path_details_tracker, 'value') for i in
+ range(len(machine_detail_vbox.children))]
+ return machine_detail_vbox
+
+ networking_deets_box = VBox()
+
+ def venv_box_isConda_handler(change):
+ if change.new == 'Yes':
+ self.mgr.run_details['machines']['venv_uses_conda'] = True
+ else:
+ self.mgr.run_details['machines']['venv_uses_conda'] = False
+
+ def venv_box_venvPath_handler(change):
+ self.mgr.run_details['machines']['venv_dir'] = change.new
+
+ def display_conda_venv_fields():
+ venv_box = HBox([
+ RadioButtons(
+ options=['No', 'Yes'],
+ description='Use conda?'
+ ),
+ Text(
+ value='',
+ placeholder='venv name',
+ description='virtual env:',
+ layout=Layout(width='300px', height='auto')
+ )
+ ])
+ venv_box.children[0].disabled = True # No support for conda: https://github.com/conda/conda/issues/7980
+ venv_box.children[0].observe(venv_box_isConda_handler, 'value')
+ venv_box.children[1].observe(venv_box_venvPath_handler, 'value')
+ return venv_box
+
+ def run_details_text_handler(change):
+ # print(change.new)
+ try:
+ self.mgr.run_details = json.loads(change.new)
+ except JSONDecodeError:
+ if change.new == '':
+ pass
+ else:
+ print('Incorrect JSON passed for remote details, check and retry!')
+
+ def machines_dropdown_eventhandler(change):
+ # print(change.new)
+ agg_machine = change.new.lower()
+ self.mgr.run_details['experiments'][0]['agg_machine'] = agg_machine # there is only one trial for now
+ party_machines = []
+ for machine in self.mgr.run_details['machines']:
+ party_machines.append(machine)
+
+ # now remove the agg machine from the dict
+ party_machines.remove(agg_machine)
+ # remove other extra keys if included
+ if 'venv_dir' in party_machines:
+ party_machines.remove('venv_dir')
+ if 'venv_uses_conda' in party_machines:
+ party_machines.remove('venv_uses_conda')
+ self.mgr.run_details['experiments'][0]['party_machines'] = party_machines # there is only one trial for now
+
+ def display_run_details(change):
+ change.owner.disabled = True
+ self.mgr.run_details['machines'] = {}
+ self.mgr.run_details['machines']['venv_uses_conda'] = False
+ self.mgr.run_details['machines']['venv_dir'] = '.venv'
+ self.mgr.run_details['experiments'] = []
+
+ temp_exp_dict = {}
+ temp_exp_dict['local_staging_dir'] = ''
+ temp_exp_dict['local_ibmfl_dir'] = ''
+ conda_fields = display_conda_venv_fields()
+
+ if 'Remote' in change.new:
+ # remote execution
+ # initialise the run_details dictionary
+ self.mgr.run_details['isLocalRun'] = False
+
+ temp_exp_dict['agg_machine'] = ''
+ temp_exp_dict['party_machines'] = []
+
+ for eachMachine in range(self.mgr.nb_config['parties'] + 1):
+ self.mgr.run_details['machines']['machine' + str(eachMachine + 1)] = {}
+ self.mgr.run_details['machines']['machine' + str(eachMachine + 1)]['ip_address'] = ''
+ self.mgr.run_details['machines']['machine' + str(eachMachine + 1)]['port_number'] = ''
+ self.mgr.run_details['machines']['machine' + str(eachMachine + 1)]['ssh_username'] = ''
+ self.mgr.run_details['machines']['machine' + str(eachMachine + 1)]['staging_dir'] = ''
+ self.mgr.run_details['machines']['machine' + str(eachMachine + 1)]['ibmfl_dir'] = ''
+
+ networking_header_1 = HTML(
+ value='<{size}>Details for remote execution: Fill details into the textbox on the left or in '
+ 'individual fields on the right'.format(size='h4'), layout=Layout(width='auto'))
+
+ run_details_box = VBox([
+ Label(value='Machine details:', layout=Layout(width='auto')),
+ Textarea(value=json.dumps(self.mgr.run_details, indent=4), layout=Layout(width='300px',
+ height='700px'))
+ ])
+ run_details_box.children[1].observe(run_details_text_handler, 'value')
+
+ networking_header_2 = HTML(value='<{size}>OR'.format(size='h3'),
+ layout=Layout(width='auto', margin='5px 15px 5px 15px'))
+
+ all_machines_tuple = ()
+ for eachMachine in range(self.mgr.nb_config['parties'] + 1):
+ machine_header = HTML(value='<{size}>Machine{id}'.format(size='h4', id=str(eachMachine + 1)))
+ temp_machine_box = VBox()
+ machine_IP = get_IPaddr_port(eachMachine + 1)
+ machine_paths = get_paths(eachMachine + 1)
+ temp_machine_box.children = (machine_header, HBox(children=[machine_IP, machine_paths]))
+ all_machines_tuple = all_machines_tuple + (temp_machine_box,)
+
+ machines_dropdown = Box([
+ Label(
+ value='Pick machine for running Aggregator:',
+ layout=Layout(width='auto')
+ ),
+ Dropdown(
+ options=[''] + ['Machine{id}'.format(id=i + 1) for i in
+ range(self.mgr.nb_config['parties'] + 1)],
+ layout=Layout(width='auto')
+ )])
+
+ machines_dropdown.children[1].observe(machines_dropdown_eventhandler, 'value')
+
+ temp_local_vbox = VBox()
+ local_header = HTML(value='<{size}>Local Directories'.format(size='h4'))
+ local_path_fields = get_paths()
+ temp_local_vbox.children = (local_header, local_path_fields)
+
+ networking_fields_vbox = VBox(layout=Layout(width='auto', border='0.5px solid black'))
+ networking_fields_vbox.children = (conda_fields,) + all_machines_tuple + (
+ machines_dropdown, temp_local_vbox,)
+ networking_deets_hbox = HBox(children=[run_details_box, networking_header_2, networking_fields_vbox])
+ # save_generate_butn.layout = Layout(width='185px', height='40px', margin='5px 50px 5px 400px')
+ networking_deets_box.children = (networking_header_1, networking_deets_hbox) # , save_generate_butn,)
+ self.mgr.run_details['experiments'].append(temp_exp_dict)
+
+ else:
+ # local execution
+ self.mgr.run_details['isLocalRun'] = True
+ temp_exp_dict['agg_machine'] = 'local0'
+ temp_exp_dict['party_machines'] = ['local{id}'.format(id=i + 1) for i in
+ range(self.mgr.nb_config['parties'])]
+
+ # setup dicts to populate IP addr and port number from generated configs later
+ self.mgr.run_details['machines']['local0'] = {}
+ for party in temp_exp_dict['party_machines']:
+ self.mgr.run_details['machines'][party] = {}
+
+ networking_header = HTML(value='<{size}>Details for local execution'.format(size='h4'),
+ layout=Layout(width='auto'))
+
+ local_paths = get_paths()
+ # save_generate_butn.layout = Layout(width='185px', height='40px', margin='5px 50px 5px 50px')
+ networking_deets_box.children = (networking_header, conda_fields, local_paths) # , save_generate_butn)
+
+ self.mgr.run_details['experiments'].append(temp_exp_dict)
+
+ local_or_remote.children[1].observe(display_run_details, 'value')
+
+ return (local_or_remote, networking_deets_box)
+
+ def generate_custom_party_data_ui(self):
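+        """
+        Create one text field per party for the path to that party's dataset file;
+        each file is copied into the local 'uploadedFiles' directory and its new
+        path recorded under nb_config['custom_data']['data_path'].
+
+        :return: list of Text widgets, one per party
+        """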
+ def custom_data_filepath_handler(change):
+ # print(change)
+ party_data_filepath = change.new
+ if not os.path.exists(party_data_filepath):
+ print(party_data_filepath + ' does not exist!')
+ return
+            # description looks like 'For partyN:'; extract N so it also works for 10+ parties
+            party_idx = change['owner'].description.split()[-1].rstrip(':').replace('party', '')
+ filename = party_data_filepath.split('/')[-1]
+ # copy model file to data_dir
+ from shutil import copyfile
+ # copy model file to this dir
+ copyfile(party_data_filepath, os.path.join(data_dir, filename))
+ print(filename + ' written to ' + data_dir + '/' + filename)
+ self.mgr.nb_config['custom_data']['data_path']['party' + str(party_idx)] = os.path.join(data_dir, filename)
+
+ custom_data_paths = []
+ interim_dir = os.getcwd()
+ data_dir = os.path.join(interim_dir, 'uploadedFiles')
+ if not os.path.exists(data_dir):
+ os.makedirs(data_dir)
+ self.mgr.nb_config['custom_data']['data_path'] = {}
+ # path boxes for party specific files
+ for each_party in range(self.mgr.nb_config['parties']):
+ custom_data_filepath = Text(value='', placeholder='Paste path to dataset file for party',
+ description='For party{}:'.format(each_party))
+ custom_data_filepath.observe(custom_data_filepath_handler, names='value')
+ custom_data_paths.append(custom_data_filepath)
+ return custom_data_paths
+
+ def generate_display_configs_ui(self):
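+        """
+        Generate the aggregator and party configs via ConfigManager and display the
+        aggregator config and party0's config in the notebook.
+
+        :return: (Output widget, VBox populated with the rendered configs)
+        """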
+ def display_configs(agg_conf_path, party_conf_path):
+ # Display aggregator and party* configs
+ display_header = HTML(value='<{size}>Configs Generated:'.format(size='h4'), layout=Layout(width='auto'))
+
+ agg_conf_header = HTML(value='<{size}>Aggregator Config'.format(size='h4'), layout=Layout(width='auto'))
+ agg_conf = Output(layout={'border': '0.5px solid black'})
+
+ # read agg config from filesystem:
+ with open(agg_conf_path) as stream:
+ try:
+ agg_config = yaml.safe_load(stream)
+ except yaml.YAMLError as e:
+ print(e)
+
+ with agg_conf:
+ display(agg_config)
+
+ party_conf_header = HTML(value='<{size}>Party0 Config'.format(size='h4'), layout=Layout(width='auto'))
+ party_conf = Output(layout={'border': '0.5px solid black'})
+
+ # read party0 from filesystem:
+ with open(party_conf_path.replace('*', '0')) as stream:
+ try:
+ party_config = yaml.safe_load(stream)
+ except yaml.YAMLError as e:
+ print(e)
+
+ # display
+ with party_conf:
+ display(party_config)
+
+ agg_box = HBox(children=[agg_conf_header, agg_conf], layout=Layout(width='auto', padding='20px'))
+ party_box = HBox(children=[party_conf_header, party_conf], layout=Layout(width='auto', padding='10px'))
+ party_disclmr_1 = HTML(
+ value='Other parties follow config similar to Party0, except connection.info.[ip,port] '
+ 'and paths', layout=Layout(width='auto'))
+ party_disclmr_2 = HTML(
+ value='Also, each party gets a separate dataset file, split from the chosen dataset',
+ layout=Layout(width='auto'))
+ config_box.children = [display_header, agg_box, party_box, party_disclmr_1, party_disclmr_2]
+
+ config_ui = Output()
+ config_box = VBox(layout=Layout(width='auto'))
+ agg_conf_path, party_conf_path = self.mgr.generate_update_configs()
+ if agg_conf_path is None or party_conf_path is None:
+ print('Error generating configs. Exiting...')
+ else:
+ display_configs(agg_conf_path, party_conf_path)
+
+ return (config_ui, config_box)
diff --git a/runner/exp_manager/hyperparams_to_models_map.json b/experiment_manager/hyperparams_to_models_map.json
similarity index 78%
rename from runner/exp_manager/hyperparams_to_models_map.json
rename to experiment_manager/hyperparams_to_models_map.json
index dd3f40a..3613e7f 100644
--- a/runner/exp_manager/hyperparams_to_models_map.json
+++ b/experiment_manager/hyperparams_to_models_map.json
@@ -9,9 +9,6 @@
"local": {
"training": {
"epochs": 3
- },
- "optimizer": {
- "lr": 0.01
}
}
}
@@ -60,6 +57,48 @@
}
}
},
+ {
+ "model_identifier": "coordinate_median_plus_tf",
+ "hyperparams": {
+ "global": {
+ "rounds": 3,
+ "termination_accuracy": 0.8,
+ "max_timeout": 600,
+ "rho": 10
+ },
+ "local": {
+ "training": {
+ "epochs": 10,
+ "batch_size": 10,
+ "alpha": 0.01
+ },
+ "optimizer": {
+ "lr": 0.003
+ }
+ }
+ }
+ },
+ {
+ "model_identifier": "geometric_median_plus_tf",
+ "hyperparams": {
+ "global": {
+ "rounds": 3,
+ "termination_accuracy": 0.8,
+ "max_timeout": 600,
+ "rho": 10
+ },
+ "local": {
+ "training": {
+ "epochs": 10,
+ "batch_size": 10,
+ "alpha": 0.01
+ },
+ "optimizer": {
+ "lr": 0.003
+ }
+ }
+ }
+ },
{ "model_identifier": "crypto_iter_avg_keras",
"hyperparams": {
"global": {
@@ -69,9 +108,6 @@
"local": {
"training": {
"epochs": 3
- },
- "optimizer": {
- "lr": 0.01
}
}
}
@@ -95,9 +131,6 @@
"training": {
"epochs": 3,
"privacy": {}
- },
- "optimizer": {
- "lr": 0.01
}
}
}
@@ -112,9 +145,6 @@
"local": {
"training": {
"epochs": 3
- },
- "optimizer": {
- "lr": 0.01
}
}
}
@@ -144,7 +174,7 @@
},
"local": {
"training": {
- "epochs": 3
+ "max_iter": 2
}
}
}
@@ -165,21 +195,22 @@
}
}
},
- { "model_identifier": "fedplus_keras",
+ { "model_identifier": "fedavgplus_tf",
"hyperparams": {
"global": {
- "rounds": 2000,
- "termination_accuracy": 0.83,
- "max_timeout": 600
+ "rounds": 3,
+ "termination_accuracy": 0.8,
+ "max_timeout": 600,
+ "rho": 1000
},
"local": {
"training": {
"epochs": 10,
"batch_size": 10,
- "alpha":1
+ "alpha": 0.01
},
"optimizer": {
- "lr": 0.0003
+ "lr": 0.003
}
}
}
@@ -208,9 +239,6 @@
"local": {
"training": {
"epochs": 3
- },
- "optimizer":{
- "lr": 0.01
}
}
}
@@ -245,7 +273,7 @@
}
}
},
- { "model_identifier": "id3_dt_none",
+ { "model_identifier": "id3_dt_None",
"hyperparams": {
"global": {
"max_depth": 3
@@ -262,9 +290,6 @@
"local": {
"training": {
"epochs": 3
- },
- "optimizer": {
- "lr": 0.01
}
}
}
@@ -323,9 +348,6 @@
"local": {
"training": {
"epochs": 3
- },
- "optimizer": {
- "lr": 0.01
}
}
}
@@ -370,9 +392,6 @@
"local": {
"training": {
"epochs": 3
- },
- "optimizer": {
- "lr": 0.01
}
}
}
@@ -408,7 +427,7 @@
}
},
{
- "model_identifier": "prej_remover_none",
+ "model_identifier": "prej_remover_None",
"hyperparams": {
"global": {
"rounds": 3,
@@ -421,7 +440,70 @@
}
}
},
- { "model_identifier": "sklearn_logclassificaton_globalrw_sklearn",
+ {
+ "model_identifier": "shuffle_iter_avg_keras",
+ "hyperparams": {
+ "global": {
+ "rounds": 3,
+ "termination_accuracy": 0.9,
+ "max_timeout": 60
+ },
+ "local": {
+ "training": {
+ "epochs": 3
+ }
+ }
+ }
+ },
+ {
+ "model_identifier": "shuffle_iter_avg_pytorch",
+ "hyperparams": {
+ "global": {
+ "rounds": 3,
+ "termination_accuracy": 0.9,
+ "max_timeout": 60
+ },
+ "local": {
+ "training": {
+ "epochs": 3,
+ "lr": 1
+ },
+ "optimizer": "optim.Adadelta"
+ }
+ }
+ },
+ {
+ "model_identifier": "shuffle_iter_avg_sklearn",
+ "hyperparams": {
+ "global": {
+ "rounds": 3,
+ "termination_accuracy": 0.9,
+ "max_timeout": 60
+ },
+ "local": {
+ "training": {
+ "max_iter": 2
+ }
+ }
+ }
+ },
+ {
+ "model_identifier": "shuffle_iter_avg_tf",
+ "hyperparams": {
+ "global": {
+ "rounds": 3,
+ "termination_accuracy": 0.9,
+ "max_timeout": 60
+ },
+ "local": {
+ "training": {
+ "epochs": 3
+ }
+ }
+ }
+
+ },
+ { "model_identifier": "sklearn_logclassification_globalrw_sklearn",
"hyperparams": {
"global": {
"rounds": 3,
@@ -462,7 +544,7 @@
}
}
},
- { "model_identifier": "xgboost_none",
+ { "model_identifier": "xgboost_None",
"hyperparams": {
"global": {
"learning_rate": 0.1,
@@ -488,9 +570,6 @@
"local": {
"training": {
"epochs": 3
- },
- "optimizer": {
- "lr": 0.01
}
}
}
@@ -525,4 +604,4 @@
}
}
}
-]
\ No newline at end of file
+]
diff --git a/runner/runner_cli.md b/experiment_manager/ibmfl_cli_automator/README.md
similarity index 74%
rename from runner/runner_cli.md
rename to experiment_manager/ibmfl_cli_automator/README.md
index c9a781a..63fcf65 100644
--- a/runner/runner_cli.md
+++ b/experiment_manager/ibmfl_cli_automator/README.md
@@ -1,28 +1,33 @@
+# IBMFL CLI Automator
+
+Here you can find scripts that make it easy to set up and run tests for IBMFL in an automated manner. It is possible to run a series of trials, identical or varying in a parameterized way, without any manual command entry. The necessary additional libraries are listed in `test_requirements.txt`.
+
+
## Getting Started
-There is a ready-to-run example for mnist in `runner/examples/mnist` as well as a reinforcement learning "pendulum" OpenAI Gym environment in `runner/examples/pendulum/`, and all the supplemental files needed to use the testing framework are there as well.
+There is a ready-to-run example for mnist in `ibmfl_cli_automator/examples/mnist`, along with all the supplemental files needed.
Trying out this example is as simple as running:
```
-runner/run.py runner/examples/pendulum
+ibmfl_cli_automator/run.py ibmfl_cli_automator/examples/mnist
```
-and this is the only parameter that the runner takes, namely, a path to the folder that contains the config files.
+and this is the only parameter the automator takes: the path to the folder that contains the config files.
This will print some useful info to stdout to help you track your IBMFL run's progress, including the timestamp that is also used to uniquely identify each run you trigger. It will also write a metadata file on your local machine (inside `local_staging_dir` as specified in your config file) with further info. This metadata can be used to monitor your remote jobs with further detail, or kill them if something has gone wrong, e.g.:
```
-runner/monitor.py --config ${local_staging_dir}/latest/trial1/metadata.yml kill
+ibmfl_cli_automator/monitor.py --config ${local_staging_dir}/latest/trial1/metadata.yml kill
```
If you realize you need to make a few changes to your code, you can easily sync your changes to your remote machines:
```
-runner/sync_repo.sh slaguna sylvester23.sl.cloud9.ibm.com . /data/slaguna/repos/IBMFL
+ibmfl_cli_automator/sync_repo.sh slaguna sylvester23.sl.cloud9.ibm.com . /data/slaguna/repos/IBMFL
```
It will use `rsync` to only send changed files, and will avoid sending any git, virtualenv, pycache, etc. folders that would cause problems.
-## Runner Config Parameters
+## Automator Config Parameters
A description of this config file and the supplemental files, and some guidance on how to use them is as follows:
@@ -36,9 +41,9 @@ A description of this config file and the supplemental files, and some guidance
### `config_runner.yml`
-This is the primary runner configuration file which references the others. Its main job is to specify how the *trials* (one execution of FL) will be structured. Many of the configurations here will be used as values for template parameters for the aggregator and party config files that will be generated as part of the testing framework's execution. Currently, the `${agg_ip}`, `${agg_port}`, `${party_ip}`, `${party_port}`, `${n_parties}`, and `${n_rounds}` are filled with the appropriate values from the runner config file, based on the machines and values specified for each _experiment_. An experiment is a set of identical trials that all use the same specification. The runner config file takes in a list of experiments that are executed one-by-one.
+This is the primary automator configuration file which references the others. Its main job is to specify how the *trials* (one execution of FL) will be structured. Many of the configurations here will be used as values for template parameters for the aggregator and party config files that will be generated as part of the testing framework's execution. Currently, the `${agg_ip}`, `${agg_port}`, `${party_ip}`, `${party_port}`, `${n_parties}`, and `${n_rounds}` are filled with the appropriate values from the config file, based on the machines and values specified for each _experiment_. An experiment is a set of identical trials that all use the same specification. The config file takes in a list of experiments that are executed one-by-one.
-The machines used for each experiment are specified using the `agg_machine` (value) and `party_machines` (list) keys in the runner config file. The machines are specified by their key, which can be used to obtain the machine's details from the `machines` dictionary also present in the runner config file. The keys that can be specified per machine are as follows:
+The machines used for each experiment are specified using the `agg_machine` (value) and `party_machines` (list) keys in the `config_runner.yml` file. The machines are specified by their key, which can be used to obtain the machine's details from the `machines` dictionary also present in the config file. The keys that can be specified per machine are as follows:
key | description
--------------- | -----------
@@ -54,7 +59,7 @@ There are additional experiment config values that can be specified:
key | description
------------------------ | -----------
-`local_staging_dir` | where to stage all supplementary files (data, etc) for the experiment, and where to store any output on the local machine (i.e. the machine where the runner is called)
+`local_staging_dir` | where to stage all supplementary files (data, etc) for the experiment, and where to store any output on the local machine (i.e. the machine where the automator is called)
`local_ibmfl_dir` | absolute path to IBMFL repository on local machine (currently unused)
`n_parties` | number of parties to use; can be less-than or equal-to the number in the `party_machines` list; filled into the aggregator config
`n_rounds` | number of rounds to train for; filled directly into the aggregator config
@@ -65,7 +70,7 @@ key | description
### `config_agg_tmpl.yml` and `config_party_tmpl.yml`
-These are templates for the aggregator and party configuration files. When the runner is triggered, it will fill in these template parameters with values from the runner config file. The template parameter values should be pretty self-explanatory with respect to how they are filled based on the runner config. In the future, it will be possible to easily specify your own pairs of values to fill programmatically from the runner config to the agg and party configs, so you can parameterize your experiments in any way you'd like.
+These are templates for the aggregator and party configuration files. When the automator is triggered, it will fill in these template parameters with values from the automator config file. The template parameter values should be pretty self-explanatory with respect to how they are filled based on the automator config. In the future, it will be possible to easily specify your own pairs of values to fill programmatically from the automator config to the agg and party configs, so you can parameterize your experiments in any way you'd like.
## Additional Scripts
@@ -75,7 +80,7 @@ There are some additional scripts that can (hopefully) make development and test
### `monitor.py`
-The monitor script takes a config file by named argument `--config` and an action: daemonize, list, or kill. The config is the metadata written automatically by the runner script. Or, if you want, you can write your own config, which will let you view all FL-related processes on a list of machines: see `runner/examples/metadata_global.yml` for a self-explanatory example. The action can be one of the following:
+The monitor script takes a config file by named argument `--config` and an action: `daemonize`, `list`, or `kill`. The config is the metadata written automatically by the automator's `run.py` script. Or, if you want, you can write your own config, which will let you view all FL-related processes on a list of machines: see `ibmfl_cli_automator/examples/metadata_global.yml` for an example. The action can be one of the following:
action | description
----------- | -----------
@@ -88,7 +93,7 @@ This can come in handy when debugging and you want to ensure no straggling proce
### `sync_repo.sh`
-This simple script takes 4 parameters in the following order: `username`, `address`, `local_dir`, and `remote_dir`, and syncs your local directory with the remote directory. Before using, __double-check that your virtual environment directory is excluded by the rsync command__ as syncing your virtual environment folders can corrupt the remote machine's virtual environment! It should ideally use the config files to automate syncing all relevant machines for a runner config file, but this preliminary version is provided for convenience anyway.
+This simple script takes 4 parameters in the following order: `username`, `address`, `local_dir`, and `remote_dir`, and syncs your local directory with the remote directory. Before using it, __double-check that your virtual environment directory is excluded by the rsync command__, as syncing your virtual environment folders can corrupt the remote machine's virtual environment! Ideally this script would use the config files to automatically sync every machine listed in the automator config, but this preliminary version is provided for convenience in the meantime.
## Metrics and Post-Processing
@@ -99,7 +104,7 @@ The party's config file can contain a `metrics` section which lets you load a c
Metrics are logged every round. The metrics can be observed while the run is taking place and used as real-time feedback to the user as to the run's progress.
-The runner's config contains a few options to let you configure how the metrics are used during post-processing on a per-experiment basis. The `postproc_fn` key lets you specify a function to run on the metrics collected from that experiment, and the `postproc_x_key` and `postproc_y_keys` let you configure the way that plots are generated. The signature that should be used for custom post-processing functions is written below:
+The automator's config contains a few options to let you configure how the metrics are used during post-processing on a per-experiment basis. The `postproc_fn` key lets you specify a function to run on the metrics collected from that experiment, and the `postproc_x_key` and `postproc_y_keys` let you configure the way that plots are generated. The signature that should be used for custom post-processing functions is written below:
```
def postprocess(metrics_file_tmpl, n_trials, n_parties, y_keys)
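
# Illustrative only -- one shape a custom post-processing function with the above
# signature could take. The metrics file naming via str.format and the
# one-JSON-object-per-line layout are assumptions of this sketch, not something the
# automator guarantees; see the existing functions in postprocess.py for real examples.
import json

def postprocess_print_last_round(metrics_file_tmpl, n_trials, n_parties, y_keys):
    for trial in range(1, n_trials + 1):
        for party in range(n_parties):
            with open(metrics_file_tmpl.format(trial, party)) as fh:
                last_round = json.loads(fh.readlines()[-1])
            print(trial, party, {k: last_round.get(k) for k in y_keys})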
@@ -112,7 +117,7 @@ There are existing functions for generating some standard plots and also some he
### Plot-generating Functions
-These can be specified directly in the runner config.
+These can be specified directly in the automator config.
#### `gen_reward_vs_time_plots`
Produces plots of the progress of metrics provided in the `reward_keys` parameter (corresponding to the more generic `postproc_y_keys` above) versus either the round number or time (where a key for the timing parameter to be used must be specified as an optional parameter).
@@ -142,4 +147,4 @@ Because we collect one file per trial, data for each trial is initially stored s
In many cases, experiments will run multiple trials to produce confidence bounds on data that we want to measure. This helper function takes a dictionary as returned by `group_by_iter`, and aggregates over the trials (i.e. compresses the data so that analogous datapoints for individual trials are reduced to a single datapoint) using each of the functions provided. The functions are to be passed in a dictionary with a key that will serve as a label for storing the output in the same, passed-in dictionary. The functions on the dictionary should take a single argument of a list of floating-point values, and return a single floating-point value. For an example of this, please see line 266-271 of `postprocess.py`.
#### `offset_vals` and `offset_vals_cycle`
-These functions operate on the data coming straight out of `parse_party_data`. The first will offset each metric specified in `offset_keys` using each function in `offset_methods_dict`, which uses a dictionary just as in `aggregate_over_trials` to specify the functions used. In this case, the functions take a list and return another list, and transform the list using other values in the list in some way. Obvious, useful operations can be found on line 96 and line 102 of `postprocess.py`, which offset each value in the list by the first value and by the previous value, respectively. This is useful for making all timestamps reference a certain point in time, or for computing durations from timestamps. The second function, `offset_vals_cycle`, does the same thing except instead of individually operating on `offset_keys` it treats them as a sequential set of timestamps for computing durations; this is what is used in `gen_timing_plots` to produce the intra-round durations.
\ No newline at end of file
+These functions operate on the data coming straight out of `parse_party_data`. The first will offset each metric specified in `offset_keys` using each function in `offset_methods_dict`, which uses a dictionary just as in `aggregate_over_trials` to specify the functions used. In this case, the functions take a list and return another list, and transform the list using other values in the list in some way. Obvious, useful operations can be found on line 96 and line 102 of `postprocess.py`, which offset each value in the list by the first value and by the previous value, respectively. This is useful for making all timestamps reference a certain point in time, or for computing durations from timestamps. The second function, `offset_vals_cycle`, does the same thing except instead of individually operating on `offset_keys` it treats them as a sequential set of timestamps for computing durations; this is what is used in `gen_timing_plots` to produce the intra-round durations.
diff --git a/runner/examples/metadata_global.yml b/experiment_manager/ibmfl_cli_automator/examples/metadata_global.yml
similarity index 100%
rename from runner/examples/metadata_global.yml
rename to experiment_manager/ibmfl_cli_automator/examples/metadata_global.yml
diff --git a/runner/examples/metadata_localhost.yml b/experiment_manager/ibmfl_cli_automator/examples/metadata_localhost.yml
similarity index 100%
rename from runner/examples/metadata_localhost.yml
rename to experiment_manager/ibmfl_cli_automator/examples/metadata_localhost.yml
diff --git a/runner/examples/mnist/config_agg_tmpl.yml b/experiment_manager/ibmfl_cli_automator/examples/mnist/config_agg_tmpl.yml
similarity index 100%
rename from runner/examples/mnist/config_agg_tmpl.yml
rename to experiment_manager/ibmfl_cli_automator/examples/mnist/config_agg_tmpl.yml
diff --git a/runner/examples/mnist/config_party_tmpl.yml b/experiment_manager/ibmfl_cli_automator/examples/mnist/config_party_tmpl.yml
similarity index 92%
rename from runner/examples/mnist/config_party_tmpl.yml
rename to experiment_manager/ibmfl_cli_automator/examples/mnist/config_party_tmpl.yml
index 9641f6e..d72eda7 100644
--- a/runner/examples/mnist/config_party_tmpl.yml
+++ b/experiment_manager/ibmfl_cli_automator/examples/mnist/config_party_tmpl.yml
@@ -22,7 +22,7 @@ model:
name: KerasFLModel
path: ibmfl.model.keras_fl_model
spec:
- model_definition: examples/configs/iter_avg/keras/compiled_keras.h5
+ model_definition: examples/configs/keras_classifier/compiled_keras.h5
model_name: keras-cnn
protocol_handler:
name: PartyProtocolHandler
diff --git a/runner/examples/mnist/config_runner.yml b/experiment_manager/ibmfl_cli_automator/examples/mnist/config_runner.yml
similarity index 100%
rename from runner/examples/mnist/config_runner.yml
rename to experiment_manager/ibmfl_cli_automator/examples/mnist/config_runner.yml
diff --git a/experiment_manager/ibmfl_cli_automator/images/config_runner.png b/experiment_manager/ibmfl_cli_automator/images/config_runner.png
new file mode 100644
index 0000000..81addf8
Binary files /dev/null and b/experiment_manager/ibmfl_cli_automator/images/config_runner.png differ
diff --git a/experiment_manager/ibmfl_cli_automator/images/max_train.png b/experiment_manager/ibmfl_cli_automator/images/max_train.png
new file mode 100644
index 0000000..f98e18e
Binary files /dev/null and b/experiment_manager/ibmfl_cli_automator/images/max_train.png differ
diff --git a/experiment_manager/ibmfl_cli_automator/images/mean_eval.png b/experiment_manager/ibmfl_cli_automator/images/mean_eval.png
new file mode 100644
index 0000000..0ace6eb
Binary files /dev/null and b/experiment_manager/ibmfl_cli_automator/images/mean_eval.png differ
diff --git a/experiment_manager/ibmfl_cli_automator/images/mean_train.png b/experiment_manager/ibmfl_cli_automator/images/mean_train.png
new file mode 100644
index 0000000..6537377
Binary files /dev/null and b/experiment_manager/ibmfl_cli_automator/images/mean_train.png differ
diff --git a/runner/monitor.py b/experiment_manager/ibmfl_cli_automator/monitor.py
similarity index 97%
rename from runner/monitor.py
rename to experiment_manager/ibmfl_cli_automator/monitor.py
index 6fa63ed..a921f87 100755
--- a/runner/monitor.py
+++ b/experiment_manager/ibmfl_cli_automator/monitor.py
@@ -1,3 +1,9 @@
+"""
+Licensed Materials - Property of IBM
+Restricted Materials of IBM
+20190891
+© Copyright IBM Corp. 2021 All Rights Reserved.
+"""
#!/usr/bin/env python3
import argparse
diff --git a/runner/postprocess.py b/experiment_manager/ibmfl_cli_automator/postprocess.py
similarity index 99%
rename from runner/postprocess.py
rename to experiment_manager/ibmfl_cli_automator/postprocess.py
index d9d5397..90db1b3 100755
--- a/runner/postprocess.py
+++ b/experiment_manager/ibmfl_cli_automator/postprocess.py
@@ -1,3 +1,9 @@
+"""
+Licensed Materials - Property of IBM
+Restricted Materials of IBM
+20190891
+© Copyright IBM Corp. 2021 All Rights Reserved.
+"""
#!/usr/bin/env python3
from collections import OrderedDict
diff --git a/runner/run.py b/experiment_manager/ibmfl_cli_automator/run.py
similarity index 92%
rename from runner/run.py
rename to experiment_manager/ibmfl_cli_automator/run.py
index 44c6a89..86922d9 100644
--- a/runner/run.py
+++ b/experiment_manager/ibmfl_cli_automator/run.py
@@ -1,3 +1,9 @@
+"""
+Licensed Materials - Property of IBM
+Restricted Materials of IBM
+20190891
+© Copyright IBM Corp. 2021 All Rights Reserved.
+"""
#!/usr/bin/env python3
from copy import deepcopy
@@ -8,6 +14,7 @@
import pprint as pp
import random
import sys
+sys.path.append('../../')
import time
from string import Template
import subprocess as sp
@@ -19,13 +26,13 @@
fl_path = os.path.abspath('.')
if fl_path not in sys.path:
sys.path.append(fl_path)
-import runner.postprocess as ibmfl_postproc
+import experiment_manager.ibmfl_cli_automator.postprocess as ibmfl_postproc
# USAGE:
-# ./runner/run_paramiko.py
+# ./ibmfl_cli_automator/run_paramiko.py
#
# ASSMUMPTIONS:
-# - runner config is named config_runner.yml
+# - automator config is named config_runner.yml
# - config file templates are named config_agg_tmpl.yml, config_party_tmpl.yml
# TODO:
# X script should have args for the _dir variables, username of ssh, etc (in runner.yaml?)
@@ -37,7 +44,7 @@
class Runner:
'''
- The runner contains all the shared information about the ongoing runs, and has the capabiity of
+ The runner class contains all the shared information about the ongoing runs, and has the capability of
organizing the configuration of experiments where IBMFL runs are triggered in parameterized
ways.
'''
@@ -154,12 +161,38 @@ def __stat_on_server(client, path_remote):
sftp.close()
return ret
+ @staticmethod
+ def copy_to_server_dir(src, dst, client):
+ """
+        Write file to server, in the specific case that it's a directory.
+ :param src: local path to be copied from
+ :type src: `str`
+ :param dst: destination path on the server to copy to
+ :type dst: `str`
+ :param client: file transfer client
+ :type client: `paramiko.client.SSHClient`
+ :return: None
+ """
+ sftp = client.open_sftp()
+ for item in os.listdir(src):
+ if os.path.isfile(os.path.join(src, item)):
+ sftp.put(os.path.join(src, item),
+ os.path.join(dst, item))
+ else:
+ # subdir within src
+ try:
+ sftp.mkdir(os.path.join(dst, item))
+ except IOError:
+ # print('Warn: IOError with mkdir at remote: Perhaps the directory already exists.')
+ # ignore if it exists
+ pass
+ Runner.copy_to_server_dir(os.path.join(src, item),
+ os.path.join(dst, item), client)
@staticmethod
def __copy_to_server(path_local, client, path_remote):
"""
- Write file to server
-
+ Write file to server, invokes copy_to_server_dir if path_local is a dir.
:param path_local: file on the local machine
:type path_local: `str`
:param client: remote client to copy the file to
@@ -175,11 +208,19 @@ def __copy_to_server(path_local, client, path_remote):
(status, outstr) = Runner.__exec_command_sync(client,
'mkdir -p {}'.format(path_remote_dir))
sftp = client.open_sftp()
- sftp.put(path_local,
- path_remote)
+ print('copying {} to {}'.format(path_local, path_remote))
+ if os.path.isdir(path_local):
+ if not os.path.exists(path_remote):
+ try:
+ sftp.mkdir(path_remote)
+ except IOError:
+ # suppress failure if directory exists
+ pass
+ Runner.copy_to_server_dir(path_local, path_remote, client)
+ else:
+ sftp.put(path_local, path_remote)
sftp.close()
-
@staticmethod
def __copy_from_server(path_remote, client, path_local):
"""
@@ -307,7 +348,6 @@ def __get_exec_string(machine_info, cmd, tag, ts, obtain_stdout):
venvde=deactivate_str)
return exec_string
-
def __start_agg_job(self, config_agg_dict, agg_files, trial_info, ti, ts):
"""
Start an aggregator job using the connection info supplied via the agg config
@@ -350,7 +390,7 @@ def __start_agg_job(self, config_agg_dict, agg_files, trial_info, ti, ts):
# start job on server
agg_exec_string = Runner.__get_exec_string(trial_info['agg_machine'],
- 'runner/run_agg.py',
+ 'experiment_manager/ibmfl_cli_automator/run_agg.py',
'agg',
ts,
obtain_stdout=True)
@@ -398,6 +438,7 @@ def __start_party_job(self, config_party_dict, party_files, trial_info, ti, pi,
f'{local_staging_dir}/config_party{pi}.yml',
party_client,
f'{machine_staging_dir}/config_party{pi}.yml')
+
for supp_file in party_files:
Runner.__copy_to_server(
str(supp_file),
@@ -406,7 +447,7 @@ def __start_party_job(self, config_party_dict, party_files, trial_info, ti, pi,
# start job on server
party_exec_string = Runner.__get_exec_string(trial_info['party_machines'][pi],
- 'runner/run_party.py',
+ 'experiment_manager/ibmfl_cli_automator/run_party.py',
'party{}'.format(pi),
ts,
obtain_stdout=False)
@@ -609,13 +650,13 @@ def get_trial_info(self, exp_info, machines, trial_nr):
def __copy_logs_to_local(machine_staging_dir, client, local_staging_dir, tag):
"""
Copy the stdout and stderr files that the aggregator and party jobs produce back to the
- local runner machine. Uses the passed-in paramiko client.
+ local automator machine. Uses the passed-in paramiko client.
:param machine_staging_dir: staging dir on agg/party machine
:type machine_staging_dir: `string`
:param client: the remote machine client to use to obtain the logs
:type client: paramiko.client.SSHClient
- :param local_staging_dir: staging dir on runner machine
+ :param local_staging_dir: staging dir on automator machine
:type local_staging_dir: `string`
:param tag: the agg/party string corresponding to the client process (i.e. 'agg', 'party0')
:type tag: `string`
@@ -633,7 +674,7 @@ def __copy_logs_to_local(machine_staging_dir, client, local_staging_dir, tag):
def get_metrics_filepath(self):
"""
- Extract the templated filepath to the metrics file on the runner's machine;
+ Extract the templated filepath to the metrics file on the automator's machine;
used throughout to ensure a consistent location is used for each trials' metrics files.
Leverages instance variables that store the party configs for the current trial.
@@ -652,7 +693,7 @@ def get_metrics_filepath(self):
def __copy_metrics_to_local(self, party_clients):
"""
- Copy the metrics files that the party jobs produce back to the local runner machine.
+ Copy the metrics files that the party jobs produce back to the local automator machine.
Uses the passed-in paramiko clients.
:param party_clients: the remote machine client to use to obtain the logs
@@ -792,9 +833,19 @@ def dict_set_nested(nested_dict, key_list, value):
continue
l_filepath = local_trial_pobj.joinpath(g_filepath.name)
m_filepath = machine_trial_pobj.joinpath(g_filepath.name)
- # move the file to our local trial dir
+ # copy the file/dir to our local trial dir
if not l_filepath.is_file() and 'output' not in k:
- g_filepath.rename(l_filepath)
+ from shutil import copytree, copyfile
+ if g_filepath.is_dir():
+ try:
+ copytree(g_filepath, l_filepath)
+ except IOError:
+ # print('Warn: IOError when copying {} to {}. Perhaps dir already exists'.format(g_filepath,
+ # l_filepath))
+ # suppress error if directory exists
+ pass
+ else:
+ copyfile(g_filepath, l_filepath)
# don't plan to scp it to the machine if it's an output file
if 'output' not in k:
proc_file_map[proc_label] += [l_filepath]
@@ -806,7 +857,6 @@ def dict_set_nested(nested_dict, key_list, value):
return proc_file_map
-
def run_trial(self, trial_info, config_agg, config_parties, ts, ui_mode='nb'):
"""
Run a trial, the thing we vary are the config dicts
@@ -821,7 +871,7 @@ def run_trial(self, trial_info, config_agg, config_parties, ts, ui_mode='nb'):
:type config_parties: `list[dict]`
:param ts: pseudo-unique generated timestamp for this trial
:type ts: `str`
- :param ui_mode: tells us whether we're triggering the runner from the notebook or CLI
+ :param ui_mode: tells us whether we're triggering the automator from the notebook or CLI
:type ui_mode: `str`
:return: None
"""
@@ -976,7 +1026,7 @@ def run_trial(self, trial_info, config_agg, config_parties, ts, ui_mode='nb'):
def convert_machine_dict_from_nb_to_cli(self, machines):
"""
Helper function for converting the slightly-different format for specifying the machines
- that the notebook uses into the format that the runner expects (same as the CLI mode)
+ that the notebook uses into the format that the automator expects (same as the CLI mode)
:param machines: list of machines that are available for use during the IBMFL runs
:type machines: `dict`
@@ -1013,7 +1063,7 @@ def run_experiment(self, exp_info, machines, config_agg, config_parties, ui_mode
:param config_parties: info for party machines as per either a list of config_partyN.yml files
(already filled via e.g. the user or the notebook UI) or via the `config_party_tmpl.yml` file
:type config_parties: `list[dict]`
- :param ui_mode: tells us whether we're triggering the runner from the notebook or CLI
+ :param ui_mode: tells us whether we're triggering the automator from the notebook or CLI
:type ui_mode: `str`
:param ts: timestamp to use as a label for this experiment
:type ts: `str`
@@ -1081,7 +1131,7 @@ def get_postproc_fn(self):
:return: the postprocessing function (should have a specific signature as per the README)
:rtype: function handle
"""
- return getattr(sys.modules['runner.postprocess'], self.__exp_info['postproc_fn'])
+ return getattr(sys.modules['experiment_manager.ibmfl_cli_automator.postprocess'], self.__exp_info['postproc_fn'])
def call_postproc_fn(self):
@@ -1133,10 +1183,10 @@ def call_postproc_fn(self):
exp_runner.run_experiment(exp_info, machines, config_agg_str, config_party_str, ui_mode='cli')
exp_runner.call_postproc_fn()
- #metrics_dict = exp_runner.get_experiment_output()
+ # metrics_dict = exp_runner.get_experiment_output()
# do experiment-specific postprocessing
- #if metrics_dict:
+ # if metrics_dict:
# pp.pprint(metrics_dict)
# ibmfl_postproc.plot_reward_vs_time(metrics_dict,
# ['post_train:eval:loss',
diff --git a/runner/run_agg.py b/experiment_manager/ibmfl_cli_automator/run_agg.py
similarity index 95%
rename from runner/run_agg.py
rename to experiment_manager/ibmfl_cli_automator/run_agg.py
index 4428645..1140416 100644
--- a/runner/run_agg.py
+++ b/experiment_manager/ibmfl_cli_automator/run_agg.py
@@ -1,3 +1,9 @@
+"""
+Licensed Materials - Property of IBM
+Restricted Materials of IBM
+20190891
+© Copyright IBM Corp. 2021 All Rights Reserved.
+"""
#!/usr/bin/env python3
import re
diff --git a/runner/run_party.py b/experiment_manager/ibmfl_cli_automator/run_party.py
similarity index 90%
rename from runner/run_party.py
rename to experiment_manager/ibmfl_cli_automator/run_party.py
index 25aa42b..38609fc 100644
--- a/runner/run_party.py
+++ b/experiment_manager/ibmfl_cli_automator/run_party.py
@@ -1,3 +1,9 @@
+"""
+Licensed Materials - Property of IBM
+Restricted Materials of IBM
+20190891
+© Copyright IBM Corp. 2021 All Rights Reserved.
+"""
#!/usr/bin/env python3
import re
diff --git a/runner/sync_repo.sh b/experiment_manager/ibmfl_cli_automator/sync_repo.sh
similarity index 100%
rename from runner/sync_repo.sh
rename to experiment_manager/ibmfl_cli_automator/sync_repo.sh
diff --git a/experiment_manager/images/ss_0.png b/experiment_manager/images/ss_0.png
new file mode 100644
index 0000000..703d204
Binary files /dev/null and b/experiment_manager/images/ss_0.png differ
diff --git a/experiment_manager/images/ss_1.gif b/experiment_manager/images/ss_1.gif
new file mode 100644
index 0000000..c2e8ea7
Binary files /dev/null and b/experiment_manager/images/ss_1.gif differ
diff --git a/experiment_manager/images/ss_2.gif b/experiment_manager/images/ss_2.gif
new file mode 100644
index 0000000..196a9c7
Binary files /dev/null and b/experiment_manager/images/ss_2.gif differ
diff --git a/experiment_manager/images/ss_3.png b/experiment_manager/images/ss_3.png
new file mode 100644
index 0000000..2e372dd
Binary files /dev/null and b/experiment_manager/images/ss_3.png differ
diff --git a/experiment_manager/images/ss_4.png b/experiment_manager/images/ss_4.png
new file mode 100644
index 0000000..dc38452
Binary files /dev/null and b/experiment_manager/images/ss_4.png differ
diff --git a/experiment_manager/images/ss_5.png b/experiment_manager/images/ss_5.png
new file mode 100644
index 0000000..486c355
Binary files /dev/null and b/experiment_manager/images/ss_5.png differ
diff --git a/experiment_manager/images/ss_6.gif b/experiment_manager/images/ss_6.gif
new file mode 100644
index 0000000..369e7c0
Binary files /dev/null and b/experiment_manager/images/ss_6.gif differ
diff --git a/experiment_manager/images/ss_custom_1.png b/experiment_manager/images/ss_custom_1.png
new file mode 100644
index 0000000..96bab45
Binary files /dev/null and b/experiment_manager/images/ss_custom_1.png differ
diff --git a/experiment_manager/images/ss_custom_2.png b/experiment_manager/images/ss_custom_2.png
new file mode 100644
index 0000000..468cbf6
Binary files /dev/null and b/experiment_manager/images/ss_custom_2.png differ
diff --git a/experiment_manager/supported_models.csv b/experiment_manager/supported_models.csv
new file mode 100644
index 0000000..6b0ecb7
--- /dev/null
+++ b/experiment_manager/supported_models.csv
@@ -0,0 +1,46 @@
+fusion_identifier, fusion_algo, dataset, model_spec_name, fl_model, model_ui
+coordinate_median, Coordinate-wise Median, mnist, keras-cnn, KerasFLModel, Keras
+coordinate_median, Coordinate-wise Median, mnist, pytorch-nn, PytorchFLModel, PyTorch
+coordinate_median, Coordinate-wise Median, mnist,, SklearnSGDFLModel, Scikit-learn
+coordinate_median, Coordinate-wise Median, mnist,tf-cnn, TensorFlowFLModel, TensorFlow
+coordinate_median_plus, CoordinateMedian+, mnist, tf-cnn, TensorFlowModel, TensorFlow
+differential_privacy_sgd, Differentially Private SGD, mnist, keras-dpcnn, KerasDPModel, Keras
+fedavg, Federated Averaging, mnist, keras-cnn, KerasFLModel, Keras
+fedavg, Federated Averaging, mnist, pytorch-nn, PytorchFLModel, PyTorch
+fedavg, Federated Averaging, mnist,, SklearnSGDFLModel, Scikit-learn
+fedavg, Federated Averaging, mnist, tf-cnn, TensorFlowFLModel, TensorFlow
+fedavgplus, FedAvg+, mnist, tf-cnn, TensorFlowFLModel, TensorFlow
+fedprox, FedProx, mnist, tf-cnn, TensorFlowFLModel, TensorFlow
+geometric_median_plus, GeometricMedian+, mnist, tf-cnn, TensorFlowModel, TensorFlow
+gradient_aggregation, Gradient Averaging, mnist, keras-cnn, KerasFLModel, Keras
+gradient_aggregation, Gradient Averaging, mnist, pytorch-nn, PytorchFLModel, PyTorch
+gradient_aggregation, Gradient Averaging, mnist, tf-cnn, TensorFlowFLModel, TensorFlow
+iter_avg, Iterative Averaging, mnist, keras-cnn, KerasFLModel, Keras
+iter_avg, Iterative Averaging, mnist, pytorch-nn, PytorchFLModel, PyTorch
+iter_avg, Iterative Averaging, mnist, tf-cnn, TensorFlowFLModel, TensorFlow
+iter_avg, Iterative Averaging, mnist, , SklearnSGDFLModel, Scikit-learn
+iter_avg, Iterative Averaging, adult,, SklearnSGDFLModel, Scikit-learn
+iter_avg, Iterative Averaging, cifar10, keras-cnn-cifar10, KerasFLModel, Keras
+iter_avg, Iterative Averaging, femnist, keras-cnn, KerasFLModel, Keras
+krum, Krum, mnist, keras-cnn, KerasFLModel, Keras
+krum, Krum, mnist, pytorch-nn, PytorchFLModel, PyTorch
+krum, Krum, mnist, tf-cnn, TensorFlowFLModel, TensorFlow
+pfnm, Probabilistic Federated Neural Matching, mnist, keras-fc, KerasFLModel, Keras
+pfnm, Probabilistic Federated Neural Matching, mnist, pytorch-nn, PytorchFLModel, PyTorch
+shuffle_iter_avg, Shuffled Iterative Avg, mnist, keras-cnn, KerasFLModel, Keras
+shuffle_iter_avg, Shuffled Iterative Avg, mnist, pytorch-nn, PytorchFLModel, PyTorch
+shuffle_iter_avg, Shuffled Iterative Avg, mnist,, SklearnSGDFLModel, Scikit-learn
+shuffle_iter_avg, Shuffled Iterative Avg, mnist, tf-cnn, TensorFlowFLModel, TensorFlow
+shuffle_iter_avg, Shuffled Iterative Avg, adult,, SklearnSGDFLModel, Scikit-learn
+shuffle_iter_avg, Shuffled Iterative Avg, cifar10, keras-cnn-cifar10, KerasFLModel, Keras
+shuffle_iter_avg, Shuffled Iterative Avg, femnist, keras-cnn, KerasFLModel, Keras
+sklearn_logclassification_globalrw,Logistic Classification with Global Reweighing, adult,, SklearnSGDFLModel, Scikit-learn
+sklearn_logclassification_globalrw,Logistic Classification with Global Reweighing, compas,, SklearnSGDFLModel, Scikit-learn
+sklearn_logclassification_globalrw,Logistic Classification with Global Reweighing, german,, SklearnSGDFLModel, Scikit-learn
+sklearn_logclassification_rw,Logistic Classification with Local Reweighing, adult,, SklearnSGDFLModel, Scikit-learn
+sklearn_logclassification_rw,Logistic Classification with Local Reweighing, compas,, SklearnSGDFLModel, Scikit-learn
+sklearn_logclassification_rw,Logistic Classification with Local Reweighing, german,, SklearnSGDFLModel, Scikit-learn
+spahm,SPAHM, federated-clustering,, SklearnKMeansFLModel, Scikit-learn
+zeno,Zeno, mnist, keras-cnn, KerasFLModel, Keras
+zeno,Zeno, mnist, pytorch-nn, PytorchFLModel, PyTorch
+zeno,Zeno, mnist, tf-cnn,TensorFlowFLModel, TensorFlow
diff --git a/experiment_manager/usage_guide.md b/experiment_manager/usage_guide.md
new file mode 100644
index 0000000..03d5633
--- /dev/null
+++ b/experiment_manager/usage_guide.md
@@ -0,0 +1,122 @@
+# Experiment Manager Dashboard
+Jupyter Notebook frontend for setting up, launching, monitoring, and evaluating the results of Federated Learning experiments.
+
+
+## Usage Instructions:
+This section enumerates the steps to be followed for a typical run of the FL experiment, using the `Keras` model, with the `MNIST` dataset and `Iterative Averaging` as the fusion algorithm.
+
+Please go over the _Using Custom Datasets/Models_ section at the bottom of the page, if you plan on using a custom dataset and/or custom model.
+
+#### Step 1a: Run the notebook
+Follow the instructions in the README.md file to get the notebook up and running in your browser.
+
+The dashboard is split into multiple sections, each of which carries out various steps for running a federated learning experiment, using IBMFL. With the widgets and folding extensions installed, the dashboard should look like the following:
+
+Fig. 1: Experiment Manager Dashboard
+
+#### Step 1b: Setup imports and objects
+Run the cell under section 1.1 in the notebook, so the necessary modules are imported and the `DashboardUI` class object is initialised into the kernel.
+#### Step 2: Choose the model, pre-populated dataset and fusion algorithm
+IBMFL supports a variety of models including _Keras_, _PyTorch_, _Scikit-learn_ and _TensorFlow_. You can choose your preferred model via the dropdown as shown in the figure below. Choosing the model populates the dataset and fusion algorithm dropdowns to display only those that are compatible (as far as the built-in examples go).
+
+Fig. 2: Choosing the model, pre-populated dataset and fusion algorithm
+
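+The built-in model/dataset/fusion combinations ship in `experiment_manager/supported_models.csv`, added alongside the dashboard. If you want to browse those combinations outside the dashboard, a few lines of Python are enough; this is only an illustration for inspecting the CSV, not code the dashboard itself runs:
+
+```
+import csv
+
+# Print the fusion algorithm / dataset pairs available for one model UI, e.g. Keras.
+with open('experiment_manager/supported_models.csv', newline='') as fh:
+    rows = [{k.strip(): (v or '').strip() for k, v in r.items()} for r in csv.DictReader(fh)]
+for r in rows:
+    if r['model_ui'] == 'Keras':
+        print(r['fusion_algo'], '/', r['dataset'])
+```
+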
+You can choose to record metrics by clicking `Yes` for `Record Metrics?`. Note that this isn't supported for all models, so the option is greyed out when not available.
+
+The dashboard also supports custom model files and datasets, via the `Model File` and `Custom Dataset?` widgets shown in Fig. 2. If you plan to use your own dataset files (instead of those listed in the dropdown), and/or your own model files (only `.h5` and `.pt` files are supported for now), please refer to the Using Custom Datasets/Models section below.
+
+#### Step 3: Select number of participating parties and hyperparameters
+Next, choose the number of parties you’d like to have in the experiment, using the slider on the left. Additionally, you could use the slider on the right to choose the number of parties the aggregator will wait on (the *quorum*), when collecting responses. If left untouched, this will be equal to the number of parties chosen in the slider on the left.
+
+Fig. 3: Choosing the number of parties
+
+
+Then you may review and modify (if needed) the hyperparameters corresponding to the choices made so far.
+
+
+Finally, click on `Confirm Hyperparameters` to save the hyperparameters displayed.
+
+The cell under section 1.3.1 only needs to be run if you chose to use your own dataset. Please refer to the Using Custom Datasets/Models section below for more details.
+
+
+#### Step 4: Run locally or on remote machines
+Next, choose whether the experiment should be run *locally*, i.e., on the same machine as the dashboard; or *remotely*, i.e., across one or more remote virtual machines.
+
+Fig. 4: Run the experiment locally or on remote machines
+
+##### Local Run:
+In case you choose the `Run Locally` option...
+##### Remote Run:
+If you choose the `Run on Remote Machines` option, more details are needed compared to the local run.
+
+For the remote run, the dashboard needs the `IP address`, `port number`, `SSH username`, `IBMFL Dir` (IBMFL project root directory) and `Staging Dir` (a staging directory where all configuration files, dataset files and logs should go) for each of the machines. Prior to filling this in, check the appropriate option depending on whether the machines use `conda`, and provide the corresponding virtual environment path.
+
+
+Fig. 5: Run the experiment locally or on remote machines
+
+
+In the dropdown labelled `Pick machine for running Aggregator:`, select the machine where the Aggregator should be run. Finally, in the `Local Directories` section, add in the directories for the local `Staging Dir` and local `IBMFL Dir`. These should exist on the same machine where the dashboard Notebook is being run.
+
+All this information can be keyed in either through the fields on the right or via a JSON as shown on the left.
+
+#### Step 5: View Aggregator and Party configs
+On executing the cell under section 1.5, the `generate_data.py` and `generate_configs.py` scripts are invoked and the resulting aggregator and party0 files are displayed, as shown here.
+
+Fig. 6: Aggregator and Party0 configs
+
+If the configs look alright, execute the next cell under section 1.6 to get things running and monitor progress.
+
+#### Step 6: Monitor experiment and visualise results
+Once the experiment begins running, details about each of the connections are printed, along with progress bars indicating near real-time progress: the number of rounds completed out of the total, and the number of party responses in each round.
+
+Fig. 7: Monitoring progress during the experiment
+
+
+Once the experiment completes successfully, both bars will be green and, if post-processing of metrics is supported, a button labelled `Show Charts` will be visible.
+
+On clicking the button, you'll see loss and accuracy line plots for each party.
+
+**Note:**
+
+The dashboard leverages [`tqdm`](https://github.com/tqdm/tqdm) to display progress bars for indicating real-time progress of FL jobs. While this works fine for most ML models that the dashboard supports, it doesn't work well with `sklearn` models.
+
+If you notice the progress bar(s) stuck on some value or only one of them changing, let the experiment continue and check the logs for errors. For the prepopulated datasets and built-in algorithms, there were no errors in the logs, despite the progress bars being stuck. This could be on account of timing delays between FL processes and progress bar updates and might be addressed in a subsequent release.
+
+
+---
+
+## Using Custom Datasets/Models
+
+The instructions below highlight the key things to note while following the above steps when using your own custom dataset and/or model files.
+
+Note that the supported model file formats are (a short example of producing the first one follows this list):
+- [x] Keras HDF5 format (`.h5` file extension)
+- [x] PyTorch models (`.pt` file extension)
+- [x] Pickled models (`.pickle` file extension) for scikit-learn models
+- [x] TensorFlow SavedModel directory (containing the `assets/` and `variables/` folders and the `saved_model.pb` file)
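+
+For instance, if you do not yet have a model file, a compiled Keras model in the HDF5 format above can be produced with a few lines of TensorFlow. The architecture below is purely illustrative (it is not the CNN used by the built-in MNIST examples):
+
+```
+import tensorflow as tf
+
+# A tiny compiled Keras model for 28x28x1 inputs (e.g. MNIST), saved in HDF5 format.
+model = tf.keras.Sequential([
+    tf.keras.layers.Flatten(input_shape=(28, 28, 1)),
+    tf.keras.layers.Dense(128, activation='relu'),
+    tf.keras.layers.Dense(10, activation='softmax'),
+])
+model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
+model.save('compiled_keras.h5')  # point the Model File widget at this path
+```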
+
+#### Before Step 2:
+When bringing your own custom dataset, aside from the party-specific data files (`.npz` or `.csv`) you also need to provide a datahandler, in the cell under section 1.2.1. To get an idea of what is expected, you may refer to the example `MnistKerasDataHandler` provided in that cell. After you've pasted your datahandler into the cell, the magic command in the top line will save it as `custom_data_handler.py` to the file system when the cell is executed. Please make sure that while pasting the code for your datahandler you do not overwrite the `%%writefile ...` command.
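+
+As a rough sketch only, a minimal datahandler cell might look like the following. It assumes the `DataHandler` base class from `ibmfl.data.data_handler` and a config entry named `npz_file`; treat the class name, key names and array names as illustrative, and follow the `MnistKerasDataHandler` example in the notebook for the authoritative interface:
+
+```
+%%writefile custom_data_handler.py
+import numpy as np
+from ibmfl.data.data_handler import DataHandler
+
+
+class CustomNpzDataHandler(DataHandler):
+    """Loads one party's train/test split from a single .npz file."""
+
+    def __init__(self, data_config=None):
+        super().__init__()
+        data = np.load(data_config['npz_file'])  # 'npz_file' is an assumed key name
+        self.x_train, self.y_train = data['x_train'], data['y_train']
+        self.x_test, self.y_test = data['x_test'], data['y_test']
+
+    def get_data(self):
+        # Return (train, test) tuples of (features, labels) for local training/eval.
+        return (self.x_train, self.y_train), (self.x_test, self.y_test)
+```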
+
+
+Fig. 8: Provide datahandler, model path and choose to use a Custom Dataset
+
+Next, you may follow the instructions for Step 2 as given in the previous section, with the following exceptions:
+- `Yes` for `Custom Dataset?`
+- Optionally, you may want to provide the path to your model file in the `Model File:` text box.
+
+The next step, in which the number of parties is chosen and the hyperparameters are finalised, stays the same.
+
+#### Before Step 4:
+The cell under section 1.3.1 is where you need to provide the absolute paths to the custom dataset files for each of the parties in the federated learning job. On executing the cell, you'll see as many textboxes as the number of parties. Please paste the absolute paths for the custom dataset files (`.npz` or `.csv`) in the respective textboxes. Ensure that the file extensions used comply with what the datahandler (provided in the cell under section 1.2.1) expects.
+
+
+Fig. 9: Provide party-specific custom dataset files
+
+The `uploadedFiles` folder is created by the dashboard for managing the various artifacts provided by the user.
+You can check that the files you provided are indeed showing up in the `uploadedFiles` directory.
+
+Once this is done, you may follow the remaining steps from the previous section: choosing to run locally or on remote machines, generating config files, and running the experiment.
+
+---
+---
diff --git a/federated-learning-lib/federated_learning_lib-1.0.5-py3-none-any.whl b/federated-learning-lib/federated_learning_lib-1.0.5-py3-none-any.whl
deleted file mode 100644
index 6a22638..0000000
Binary files a/federated-learning-lib/federated_learning_lib-1.0.5-py3-none-any.whl and /dev/null differ
diff --git a/federated-learning-lib/federated_learning_lib-1.0.6-py3-none-any.whl b/federated-learning-lib/federated_learning_lib-1.0.6-py3-none-any.whl
new file mode 100644
index 0000000..a334a72
Binary files /dev/null and b/federated-learning-lib/federated_learning_lib-1.0.6-py3-none-any.whl differ
diff --git a/openshift_fl/README.md b/openshift_fl/README.md
new file mode 100644
index 0000000..9ee1041
--- /dev/null
+++ b/openshift_fl/README.md
@@ -0,0 +1,68 @@
+# IBMFL Multi-Cloud OpenShift Orchestrator
+
+IBMFL Multi-Cloud and Hybrid Cloud Orchestrator automates the deployment and monitoring of aggregator and party processes, using the federated learning library Docker image, on OpenShift clusters set up in different cloud data center regions. The IBMFL OpenShift Orchestrator works with either a single cluster or multiple clusters.
+
+## OpenShift Orchestrator Features
+
+- Authentication using kubeconfig file to multiple OpenShift Clusters
+- Creation and deployment of aggregator and party pods to OpenShift Clusters
+- Network communication between aggregator and party processes using OpenShift Routes
+- Runs experiments by coordinating the execution of training commands between the aggregator and party processes
+- Support for multiple trials and parallelization of multiple experiments
+- Logs and metrics capture for experiments
+
+## OpenShift Orchestrator usage
+Please refer to the `examples/iter_avg_openshift` folder for how to use the IBMFL OpenShift Orchestrator to run IBMFL experiments. In that example, follow the instructions in `README.md` and `README_Multiple_Exp.md` to run single and multiple federated experiments respectively.
+To run the orchestrator on your laptop, you can install an OpenShift community edition such as OKD (https://www.okd.io/) or Minishift (https://www.okd.io/minishift/).
+
+## OpenShift Orchestrator Config Description
+Please refer to the `config_openshift_sample.yml` file for the various config keys. The config file consists of two main sections: `cluster` and `experiments`.
+
+- `cluster` contains the openshift cluster configuration for aggregator and party pods.
+
+key | description | mandatory|default_value
+------------ | ----------- |----------|------------
+`agg_pod:cpu` | cpu for aggregator pod | no | 2
+`agg_pod:mem`| memory for aggregator pod | no | 4Gi
+`party_pod:cpu` | cpu for party pod | no | 2
+`party_pod:mem`| memory for party pod | no | 4Gi
+`kube_config_location`| location of kube config file | no | ~/.kube/config
+
+- `experiments` contains the configuration for federated learning experiments.
+
+ `default` contains the default settings for all experiments in the list.
+
+key | description | mandatory|default_value
+------------ | ----------- |----------|------------
+`exec_mode` | run experiments in sequential or parallel mode, options: `[seq,parallel]` | no | seq
+`image_name`| Name of IBMFL image installed in OpenShift image streams | no | ibmfl:latest
+`commands:aggregator` | FL commands to execute from aggregator pod | no | ['START', 'TRAIN', 'EVAL', 'STOP']
+
+Each item in the experiment list consists of the following keys
+
+key | description | mandatory|default_value
+------------ | ----------- |----------|------------
+`staging_dir` | local directory where experiment config and data files are stored | yes |
+`name`| unique name for experiment | no | ibmfl
+`num_trials` | number of trials for experiment | no | 1
+`cluster_list` | List of OpenShift clusters connection details | yes |
+`data:pvc_name` | persistent volume claim name for COS bucket which holds the datasets and model artifacts | no |
+
+For COS bucket access, please set up a Persistent Volume (PV) and Persistent Volume Claim (PVC) in the OpenShift cluster and provide the PVC name in the `data:pvc_name` key.
+
+Each item in the cluster list consists of the following keys
+
+key | description | mandatory|default_value
+------------ | ----------- |----------|------------
+`context_name` | context name of OpenShift Cluster as defined in kube config file | yes |
+`namespace`| namespace of OpenShift Cluster as defined in kube config file | yes |
+
+If multiple OpenShift clusters are configured, the aggregator pod will be deployed in the first cluster in the list and the parties will be split equally among the clusters.
+
+### Debug
+1. Orchestrator logs can be found at `error.log` and `info.log` for debugging.
+2. Orchestrator collects aggregator and party logs inside experiment folder to debug aggregator and party training processes.
+3. In the event the orchestrator process gets stuck at the pod creation step, you can use the following commands to debug:
+ - `oc get pods` to view pod status
+ - `oc logs -f [pod_name]` to view pod logs
+ - `oc delete pod [pod_name]` to delete any hanging pods
diff --git a/openshift_fl/config_openshift_sample.yml b/openshift_fl/config_openshift_sample.yml
new file mode 100644
index 0000000..e11ec82
--- /dev/null
+++ b/openshift_fl/config_openshift_sample.yml
@@ -0,0 +1,25 @@
+cluster:
+ agg_pod:
+ cpu: 2
+ memory: 3Gi
+ party_pod:
+ cpu: 2
+ memory: 4Gi
+ kube_config_location: /demo/kubeconfig
+experiments:
+ default:
+ exec_mode: seq
+ image_name: ibmfl:latest
+ commands:
+ aggregator: ['START', 'TRAIN', 'SAVE','EVAL','STOP']
+ experiment_list:
+ - staging_dir: /demo/testexp1
+ name: testexp1
+ num_trials: 1
+ cluster_list:
+ - context_name: 'demo-cluster1'
+ namespace: 'demo-namespace1'
+ - context_name: 'demo-cluster2'
+ namespace: 'demo-namespace2'
+ data:
+ pvc_name: ibmfl-datasets-pvc
diff --git a/openshift_fl/data_copy_util.py b/openshift_fl/data_copy_util.py
new file mode 100644
index 0000000..7e8ece5
--- /dev/null
+++ b/openshift_fl/data_copy_util.py
@@ -0,0 +1,111 @@
+"""
+Licensed Materials - Property of IBM
+Restricted Materials of IBM
+20190891
+© Copyright IBM Corp. 2021 All Rights Reserved.
+"""
+import yaml
+from pathlib import Path
+from shutil import copyfile
+
+
+def stage_trial_files(generated_files_dir, local_trial_dir, machine_trial_dir,
+ config_agg_dict=None, config_party_dicts=None):
+ """
+ - Copy all files placed into generated_files_dir by the IBMFL generate_* scripts and place \
+ them all flat into local_trial_dir. \
+ - Update the paths inside the configs using machine_trial_dir, assuming that they will be \
+ copied there before the agg and party processes are started. \
+ - Return a dictionary with the keys corresponding to the procs ('agg', 'partyX') whose \
+ values are lists of all the files needed for each of those procs.
+
+ :param generated_files_dir: the directory passed to generate_*.py scripts; where they
+ generated the ./data and ./configs folders to place their output
+    :type generated_files_dir: `str`
+    :param local_trial_dir: where you want the files to be copied to
+    :type local_trial_dir: `str`
+    :param machine_trial_dir: where you'll place the files before the run, to update the
+    configs; just specify "local_trial_dir" if you don't plan to move them again
+    :type machine_trial_dir: `str`
+    :param config_agg_dict: if you want to edit the agg config before calling this function,
+    parse it into a dictionary and pass it here
+    :type config_agg_dict: `dict`
+ :param config_party_dicts: if you want to edit the party configs before calling this
+ function, parse them into dictionaries and pass them here in a list (ordered by party id)
+ :type config_party_dicts: `list[dict]`
+ :return: a dictionary with key for each process, listing the paths to the files it needs
+ :rtype: `dict{str,list}`
+ """
+ from collections import MutableMapping
+ from functools import reduce
+ from operator import getitem
+ import re
+
+ # flattens a dictionary
+ def flatten(d, parent_key='', sep='.'):
+ items = []
+ for k, v in d.items():
+ new_key = parent_key + sep + k if parent_key else k
+ if isinstance(v, MutableMapping):
+ items.extend(flatten(v, new_key, sep=sep).items())
+ else:
+ items.append((new_key, v))
+ return dict(items)
+
+ # accesses a value in a nested dictionary with a list of keys
+ def dict_set_nested(nested_dict, key_list, value):
+ reduce(getitem, key_list[:-1], nested_dict)[key_list[-1]] = value
+
+ # convert all our input strings into path objects
+ generated_files_pobj = Path(generated_files_dir)
+ local_trial_pobj = Path(local_trial_dir)
+ local_trial_pobj.mkdir(parents=True, exist_ok=True)
+ machine_trial_pobj = Path(machine_trial_dir)
+
+ # get all the files in our relevant folders
+ generated_files = tuple() \
+ + tuple(Path(f'{generated_files_dir}/configs').rglob('*.*')) \
+ + tuple(Path(f"{generated_files_dir}/data").rglob('*.*'))
+
+ proc_file_map = {}
+
+ # if the file is a config, open it and handle the filepaths in it
+ for file in generated_files:
+ if 'config_' in str(file):
+ proc_label = re.search('config_(.*).yml', str(file)).group(1)
+ if config_agg_dict is not None and 'agg' in proc_label:
+ orig_config = config_agg_dict
+ elif config_party_dicts is not None and 'party' in proc_label:
+ orig_config = config_party_dicts[int(proc_label[-1])]
+ else:
+ with open(file, 'r') as stream:
+ orig_config = yaml.load(stream.read(), Loader=yaml.Loader)
+ flat_config = flatten(orig_config)
+ proc_file_map[proc_label] = []
+ else:
+ continue
+ for k, v in flat_config.items():
+ # determine if this entry contains a filepath we need to handle
+ if isinstance(v, str):
+ v_pobj = Path(v)
+ if str(generated_files_pobj) in str(v_pobj):
+ g_filepath = v_pobj
+ else:
+ continue
+ else:
+ continue
+ l_filepath = local_trial_pobj.joinpath(g_filepath.name)
+ m_filepath = machine_trial_pobj.joinpath(g_filepath.name)
+ # move the file to our local trial dir
+ if not l_filepath.is_file() and 'output' not in k:
+ copyfile(g_filepath, l_filepath)
+ # don't plan to scp it to the machine if it's an output file
+ if 'output' not in k:
+ proc_file_map[proc_label] += [l_filepath]
+ # set the path in the config to the machine trial dir, where the run happens
+ dict_set_nested(orig_config, k.split('.'), str(m_filepath))
+
+ with open(f'{local_trial_dir}/{file.name}', "w") as local_trial_config_file:
+ yaml.dump(orig_config, local_trial_config_file)
+ proc_file_map[proc_label] += ['{}/{}'.format(local_trial_dir, file.name)]
+
+ return proc_file_map
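+
+
+# Illustrative usage sketch (the paths below are hypothetical and this call is not part
+# of the module): after the IBMFL generate_* scripts have populated a staging directory,
+#
+#     proc_file_map = stage_trial_files(
+#         generated_files_dir='/demo/testexp1',
+#         local_trial_dir='/demo/testexp1/trial1_local',
+#         machine_trial_dir='/FL/staging/trial1')
+#
+# proc_file_map['agg'] / proc_file_map['party0'] then list the files each process needs,
+# and the configs written into local_trial_dir point at machine_trial_dir paths.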
diff --git a/openshift_fl/examples/iter_avg_openshift/README.md b/openshift_fl/examples/iter_avg_openshift/README.md
new file mode 100644
index 0000000..37e9504
--- /dev/null
+++ b/openshift_fl/examples/iter_avg_openshift/README.md
@@ -0,0 +1,136 @@
+
+# Running Keras CNN in FL using OpenShift Cluster
+
+This example explains how to run federated learning on CNNs implemented with Keras, training on
+[MNIST](http://yann.lecun.com/exdb/mnist/) data, using an OpenShift cluster. Data in this example is preprocessed by scaling pixel values down from the range `[0, 255]` to `[0, 1]`.
+No other preprocessing is performed.
+
+### Setting up artifacts for the experiment
+
+- Ensure you are in the root folder of IBMFL project.
+
+- Set up the correct FL environment following our tutorial [here](https://github.com/IBM/federated-learning-lib/blob/main/quickstart.md#1-set-up-a-running-environment-for-ibm-federated-learning).
+
+- Activate a new FL environment by running:
+
+ ```
+  conda activate <env_name>  # activate environment
+ ```
+
+### Setting up Keras MNIST experiment
+
+- Split data by running:
+
+ ```
+  python examples/generate_data.py -n <num_parties> -d mnist -pp <points_per_party> -p <staging_dir_path>
+ ```
+- Generate config files by running:
+ ```
+  python examples/generate_configs.py -f iter_avg_openshift -m keras -n <num_parties> -d mnist -p <staging_dir_path> -conf_path <staging_dir_path> -context openshift
+ ```
+  Ensure that the path provided for `-p` matches the one where the data files were written to in the data splitting step.
+
+- The `staging_dir_path` passed to both the data splitting and config generation steps should be the same and should be an **absolute path**. After completion of the data splitting and config generation steps, `staging_dir_path` should contain the following folders:
+
+ `data` - contains party data files for train and test
+
+ `configs` - contains aggregator config file, party config files and model file
+
+ `datasets` - source dataset
+
+ `context` - project context set to openshift , so the examples are run from the openshift folders
+
+ with the following structure (for a two party run, using MNIST dataset):
+
+
+ ```
+  <staging_dir_path>/
+  ├── configs
+  │   └── iter_avg_openshift
+  │       └── keras
+  │           ├── compiled_keras.h5
+  │           ├── config_agg.yml
+  │           ├── config_party0.yml
+  │           └── config_party1.yml
+  ├── data
+  │   └── mnist
+  │       └── random
+  │           ├── data_party0.npz
+  │           └── data_party1.npz
+  └── datasets
+      └── mnist.npz
+
+ ```
+ This folder structure is required by openshift runner to parse and copy files to aggregator and party pods respectively.
+
+  The default behaviour of the orchestrator is to copy dataset and model artifacts to the pods. If COS (Cloud Object Storage) is used to store datasets and model artifacts instead, then upload the data files (`mnist.npz`, `data_party0.npz`, `data_party1.npz`) and the model file (`compiled_keras.h5`) to the COS bucket. The last section describes how to set the PVC name in the orchestrator config for COS bucket access.
+
+### Build the IBMFL DockerFile Image
+
+You will need access to a Docker repository such as Docker Hub before you can execute the commands below. The commands assume that you are logged into the Docker repository (Docker Hub) using the Docker CLI.
+
+- Build the IBMFL docker image
+ ```
+ docker build -t ffl-base .
+ ```
+- Tag the docker image
+ ```
+ docker tag ffl-base:[tag] [docker repo URL]/ffl-base:[tag]
+ ```
+ Please replace `[docker repo URL]` with docker repository URL and `[tag]` with image version number.
+
+- Push image to docker repo
+ ```
+ docker push [docker repo URL]/ffl-base:[tag]
+ ```
+- Edit the `openshift_fl/ibmfl-base.json` file and change the `DockerImage` `name` value to point to the pushed Docker image `[docker repo URL]/ffl-base:[tag]`, as shown below
+ ```
+ "from": {
+ "kind": "DockerImage",
+ "name": "[docker repo URL]/ffl-base:[tag]"
+ }
+ ```
+
+### Install the IBMFL DockerFile Image in OpenShift Clusters
+
+Refer to the instructions on the IBM Cloud page to install and set up the OpenShift CLI. Install the IBMFL image in each of the OpenShift clusters by following the commands below.
+- Once you have a cluster set up and listed in the Clusters tab on cloud.ibm.com, navigate to the cluster and click the `Actions` dropdown at the top right of your screen. Click `Connect via CLI` and follow the instructions in the pop-up that shows.
+
+ ```
+ oc login --token=[token key] --server=[cluster url]
+ ```
+
+- Verify your OpenShift credentials are set up correctly to access the cluster, using a command like `oc get pods`.
+
+- Install the IBMFL image to OpenShift Image Streams
+
+ ```
+ oc apply -f openshift_fl/ibmfl-base.json
+ ```
+  Use the command `oc get imagestreams` to view the installed IBMFL images, which will start with the prefix `ibmfl`.
+
+- In case you want to re-install the IBMFL image due to a version change, please delete the old images that start with the prefix `ibmfl` using `oc delete imagestream [image_name]`, then run `oc apply -f openshift_fl/ibmfl-base.json` again.
+
+### Run the IBMFL OpenShift Orchestrator
+- Edit the orchestrator config file `openshift_fl/config_openshift.yml` keys as follows:
+
+key | description
+------------ | -----------
+`kube_config_location`| path to kube config file. Remove key `kube_config_location` from orchestrator config if you want orchestrator to use the default kube config file (~/.kube/config)
+`staging_dir`| absolute staging directory path where the keras experiment config and mnist data files are stored i.e.`staging_dir_path`
+`context_name` | context name of Openshift Cluster defined in the kube config file
+`namespace` | namespace of Openshift Cluster defined in the kube config file
+`data:pvc_name`| Persistent Volume Claim (PVC) name that points to the COS bucket which holds the datasets and model artifacts. Remove the `data:pvc_name` key from the orchestrator config if you want the datasets and model artifacts to be copied to the pods rather than using a COS bucket.
+
+For multiple OpenShift clusters, add a new entry to `cluster_list` with the `context_name` and `namespace` of each cluster.
+
+If you plan to use a COS bucket for storage, please set up a Persistent Volume (PV) and Persistent Volume Claim (PVC) in the OpenShift cluster and provide the PVC name in the `data:pvc_name` key.
+
+Other config keys are set with default values; if you want to modify them, please refer to the `openshift_fl/README.md` documentation.
+
+- Next, in a terminal running an activated FL environment, start the openshift runner by executing:
+ ```
+ python openshift_fl/orchestrator.py openshift_fl/examples/iter_avg_openshift/config_openshift.yml
+ ```
+
+- Aggregator and party log files for experiment trial runs are stored in the `staging_dir/[experiment_id]/[trial_num]/logs` folder.
diff --git a/openshift_fl/examples/iter_avg_openshift/README_Multiple_Exp.md b/openshift_fl/examples/iter_avg_openshift/README_Multiple_Exp.md
new file mode 100644
index 0000000..4191a5e
--- /dev/null
+++ b/openshift_fl/examples/iter_avg_openshift/README_Multiple_Exp.md
@@ -0,0 +1,183 @@
+
+# Running Multiple Experiments - Keras and PyTorch CNN in FL using OpenShift Cluster
+
+This example explains how to run federated learning on CNNs implemented with Keras and PyTorch, training on
+[MNIST](http://yann.lecun.com/exdb/mnist/) data, using an OpenShift cluster. Data in this example is preprocessed by scaling pixel values down from the range `[0, 255]` to `[0, 1]`.
+No other preprocessing is performed.
+
+### Setting up artifacts for the experiment
+- Set up the correct FL environment following our tutorial [here](https://github.com/IBM/federated-learning-lib/blob/main/quickstart.md#1-set-up-a-running-environment-for-ibm-federated-learning).
+
+- Activate a new FL environment by running:
+
+ ```
+  conda activate <env_name>  # activate environment
+ ```
+
+Next we set up the Keras and PyTorch experiments. Please ensure that `staging_dir_path_1` and `staging_dir_path_2` are separate directories.
+
+### Setting up Keras MNIST experiment
+
+- Split data by running:
+
+ ```
+  python examples/generate_data.py -n <num_parties> -d mnist -pp <points_per_party> -p <staging_dir_path_1>
+ ```
+- Generate config files by running:
+ ```
+  python examples/generate_configs.py -f iter_avg_openshift -m keras -n <num_parties> -d mnist -p <staging_dir_path_1> -conf_path <staging_dir_path_1> -context openshift
+ ```
+  Ensure that the path provided for `-p` matches the one where the data files were written to in the data splitting step.
+
+- The `staging_dir_path_1` passed to both the data splitting and config generation steps should be the same and should be an **absolute path**. After completion of the data splitting and config generation steps, `staging_dir_path_1` should contain the following folders:
+
+ `data` - contains party data files for train and test
+
+ `configs` - contains aggregator config file, party config files and model file
+
+ `datasets` - source dataset
+
+ `context` - project context set to openshift , so the examples are run from the openshift folders
+
+ with the following structure (for a two party run, using MNIST dataset):
+
+
+ ```
+  <staging_dir_path_1>/
+  ├── configs
+  │   └── iter_avg_openshift
+  │       └── keras
+  │           ├── compiled_keras.h5
+  │           ├── config_agg.yml
+  │           ├── config_party0.yml
+  │           └── config_party1.yml
+  ├── data
+  │   └── mnist
+  │       └── random
+  │           ├── data_party0.npz
+  │           └── data_party1.npz
+  └── datasets
+      └── mnist.npz
+
+ ```
+ This folder structure is required by openshift runner to parse and copy files to aggregator and party pods respectively.
+
+The default behaviour of the orchestrator is to copy dataset and model artifacts to the pods. If COS (Cloud Object Storage) is used to store datasets and model artifacts instead, then upload the data files (`mnist.npz`, `data_party0.npz`, `data_party1.npz`) and the model file (`compiled_keras.h5`) to the COS bucket. The last section will describe how to set the PVC name in the orchestrator config for COS bucket access.
+### Setting up PyTorch MNIST experiment
+
+- Split data by running:
+
+ ```
+  python examples/generate_data.py -n <num_parties> -d mnist -pp <points_per_party> -p <staging_dir_path_2>
+ ```
+  This step can be skipped if you want to use the data files generated for the Keras experiment; in that case, copy the data files and datasets to `staging_dir_path_2`.
+
+- Generate config files by running:
+ ```
+  python examples/generate_configs.py -f iter_avg_openshift -m pytorch -n <num_parties> -d mnist -p <staging_dir_path_2> -conf_path <staging_dir_path_2> -context openshift
+ ```
+  Ensure that the path provided for `-p` matches the one where the data files were written to in the data splitting step.
+
+- The `staging_dir_path_2` passed to both the data splitting and config generation steps should be the same and should be an **absolute path**. After completion of the data splitting and config generation steps, `staging_dir_path_2` should contain the following folders:
+
+ `data` - contains party data files for train and test
+
+ `configs` - contains aggregator config file, party config files and model file
+
+ `datasets` - source dataset
+
+ with the following structure (for a two party run, using MNIST dataset):
+
+
+ ```
+  <staging_dir_path_2>/
+  ├── configs
+  │   └── iter_avg_openshift
+  │       └── pytorch
+  │           ├── pytorch_sequence.pt
+  │           ├── config_agg.yml
+  │           ├── config_party0.yml
+  │           └── config_party1.yml
+  ├── data
+  │   └── mnist
+  │       └── random
+  │           ├── data_party0.npz
+  │           └── data_party1.npz
+  └── datasets
+      └── mnist.npz
+
+ ```
+ This folder structure is required by openshift runner to parse and copy files to aggregator and party pods respectively.
+
+  Default behaviour of the orchestrator is to copy dataset and model artifacts to the pods. If COS (Cloud Object Storage) is used to store datasets and model artifacts instead, upload the data files (`mnist.npz`, `data_party0.npz`, `data_party1.npz`) and the model file (`pytorch_sequence.pt`) to a COS bucket. The last section describes how to set the PVC name in the orchestrator config for COS bucket access.
+
+### Build the IBMFL Docker Image
+
+You will need access to a Docker registry such as Docker Hub before you can execute the commands below. The commands assume that you are already logged in to the registry using the Docker CLI.
+
+- Build the IBMFL docker image
+ ```
+ docker build -t ffl-base .
+ ```
+- Tag the docker image
+ ```
+  docker tag ffl-base:latest [docker repo URL]/ffl-base:[tag]
+ ```
+  Please replace `[docker repo URL]` with your Docker repository URL and `[tag]` with the image version number.
+
+- Push image to docker repo
+ ```
+ docker push [docker repo URL]/ffl-base:[tag]
+ ```
+- Edit the `openshift_fl/ibmfl-base.json` file and change the `DockerImage` `name` field to point to the pushed Docker image `[docker repo URL]/ffl-base:[tag]`, as shown below:
+ ```
+ "from": {
+ "kind": "DockerImage",
+ "name": "[docker repo URL]/ffl-base:[tag]"
+ }
+ ```
+
+### Install the IBMFL Docker Image in OpenShift Clusters
+
+Refer to the instructions on the IBM Cloud page to install and set up the OpenShift CLI. Install the IBMFL image in each of the OpenShift clusters by following the commands below.
+- Once you have a cluster set up and listed in the Clusters tab on cloud.ibm.com, navigate to the cluster and click the `Actions` dropdown at the top right of your screen. Click `Connect via CLI` and follow the instructions in the pop-up that appears.
+
+ ```
+ oc login --token=[token key] --server=[cluster url]
+ ```
+
+- Verify your OpenShift credentials are set up correctly to access the cluster, using a command like `oc get pods`.
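+
+  For example, switch to the project (namespace) you intend to use and list its pods; the namespace below is a placeholder:
+
+  ```
+  oc project [namespace]
+  oc get pods
+  ```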
+
+- Install the IBMFL image to OpenShift Image Streams
+
+ ```
+ oc apply -f openshift_fl/ibmfl-base.json
+ ```
+  Use the command `oc get imagestreams` to view the installed IBMFL images, which will start with the prefix `ibmfl`.
+
+- In case you want to re-install the IBMFL image due to a version change, delete the old images which start with the prefix `ibmfl` using `oc delete imagestream [image_name]` and run `oc apply -f openshift_fl/ibmfl-base.json` again.
+
+### Run the IBMFL OpenShift Orchestrator
+
+- Edit the `openshift_fl/examples/iter_avg_openshift/config_openshift_multiple_exp.yml` file keys as follows:
+
+key | description
+--------------| -----------
+`kube_config_location`| path to the kube config file. Remove the `kube_config_location` key from the orchestrator config if you want the orchestrator to use the default kube config file (`~/.kube/config`)
+`staging_dir` | absolute staging directory path where the experiment config and MNIST data files are stored, i.e. `staging_dir_path_1` and `staging_dir_path_2` for `kerasexp` and `pytorchexp` respectively
+`context_name` | context name of the OpenShift cluster defined in the kube config file
+`namespace` | namespace of the OpenShift cluster defined in the kube config file
+`data:pvc_name`| Persistent Volume Claim (PVC) name that points to the COS bucket which holds the datasets and model artifacts. Remove the `data:pvc_name` key from the orchestrator config if you want the datasets and model artifacts to be copied to the pods rather than read from a COS bucket
+
+For multiple OpenShift clusters, add a new entry to `cluster_list` with the `context_name` and `namespace` of each cluster.
+
+If you plan to use a COS bucket for storage, set up a Persistent Volume (PV) and a Persistent Volume Claim (PVC) in the OpenShift cluster and provide the PVC name in the `data:pvc_name` key.
+
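+A minimal PVC manifest might look like the sketch below. This is an illustrative example only: the PVC name, namespace, storage size, and storage class are placeholders, and the correct storage class and any required annotations depend on the COS plugin or CSI driver installed in your cluster.
+
+```
+apiVersion: v1
+kind: PersistentVolumeClaim
+metadata:
+  name: ibmfl-cos-pvc               # use this value for data:pvc_name
+  namespace: [namespace]
+spec:
+  accessModes:
+    - ReadWriteMany
+  resources:
+    requests:
+      storage: 10Gi
+  storageClassName: [cos storage class]
+```
+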
+Other config keys are set with default values; if you want to modify them, refer to the `openshift_fl/README.md` documentation.
+
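+For reference, a filled-in experiment entry might look like the sketch below; the context name, namespace, staging path, and PVC name are illustrative placeholders that must match your own kube config, cluster, and staging directories:
+
+```
+experiments:
+  default:
+    exec_mode: parallel
+    image_name: ibmfl:latest
+    commands:
+      aggregator: ['START', 'TRAIN', 'EVAL', 'STOP']
+  experiment_list:
+    - staging_dir: /absolute/path/to/staging_dir_path_1
+      name: kerasexp
+      num_trials: 1
+      cluster_list:
+        - context_name: my-openshift-context
+          namespace: ibmfl-project
+      data:
+        pvc_name: ibmfl-cos-pvc     # omit the data key to copy files to the pods instead
+```
+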
+- Next, in a terminal running an activated FL environment, start the OpenShift runner by executing:
+ ```
+ python openshift_fl/orchestrator.py openshift_fl/examples/iter_avg_openshift/config_openshift_multiple_exp.yml
+ ```
+
+- Aggregator and party log files for experiment trial runs are stored in the `staging_dir_path_1/[experiment_id]/[trial_num]/logs` and `staging_dir_path_2/[experiment_id]/[trial_num]/logs` folders respectively, as sketched below.
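+
+  As implemented in `openshift_fl/experiment_runner.py`, the per-trial layout looks roughly like the sketch below for a two-party Keras run; the timestamp in the experiment id is illustrative:
+
+  ```
+  staging_dir_path_1/
+  └── kerasexp-20210402103000/      # [experiment_id] = <name>-<UTC timestamp>
+      └── trial_0/                  # [trial_num]
+          ├── logs/
+          │   ├── agg.out
+          │   ├── party-0.out
+          │   └── party-1.out
+          └── staging/              # config files staged for this trial
+  ```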
diff --git a/openshift_fl/examples/iter_avg_openshift/config_openshift.yml b/openshift_fl/examples/iter_avg_openshift/config_openshift.yml
new file mode 100644
index 0000000..d7c3407
--- /dev/null
+++ b/openshift_fl/examples/iter_avg_openshift/config_openshift.yml
@@ -0,0 +1,23 @@
+cluster:
+ agg_pod:
+ cpu: 2
+ memory: 3Gi
+ party_pod:
+ cpu: 2
+ memory: 4Gi
+ kube_config_location: [Path to Kube Config File]
+experiments:
+ default:
+ exec_mode: seq
+ image_name: ibmfl:latest
+ commands:
+ aggregator: ['START', 'TRAIN', 'EVAL', 'STOP']
+ experiment_list:
+    - staging_dir: <absolute staging dir path where keras experiment config and mnist data files are stored>
+ name: kerasexp
+ num_trials: 1
+ cluster_list:
+ - context_name:
+ namespace:
+ data:
+ pvc_name:
\ No newline at end of file
diff --git a/openshift_fl/examples/iter_avg_openshift/config_openshift_multiple_exp.yml b/openshift_fl/examples/iter_avg_openshift/config_openshift_multiple_exp.yml
new file mode 100644
index 0000000..ad93b85
--- /dev/null
+++ b/openshift_fl/examples/iter_avg_openshift/config_openshift_multiple_exp.yml
@@ -0,0 +1,32 @@
+cluster:
+ agg_pod:
+ cpu: 2
+ memory: 3Gi
+ party_pod:
+ cpu: 2
+ memory: 4Gi
+ kube_config_location:
+experiments:
+ default:
+ exec_mode: parallel
+ image_name: ibmfl:latest
+ commands:
+ aggregator: ['START', 'TRAIN', 'EVAL', 'STOP']
+ experiment_list:
+ - staging_dir:
+ name: kerasexp
+ num_trials: 1
+ cluster_list:
+ - context_name:
+ namespace:
+ data:
+ pvc_name:
+ - staging_dir:
+ name: pytorchexp
+ num_trials: 1
+ cluster_list:
+ - context_name:
+ namespace:
+ data:
+ pvc_name:
+
diff --git a/openshift_fl/examples/iter_avg_openshift/generate_configs.py b/openshift_fl/examples/iter_avg_openshift/generate_configs.py
new file mode 100644
index 0000000..d5044b1
--- /dev/null
+++ b/openshift_fl/examples/iter_avg_openshift/generate_configs.py
@@ -0,0 +1,76 @@
+"""
+Licensed Materials - Property of IBM
+Restricted Materials of IBM
+20190891
+© Copyright IBM Corp. 2021 All Rights Reserved.
+"""
+import os
+import numpy as np
+from importlib import import_module
+
+import examples.datahandlers as datahandlers
+
+
+def get_fusion_config():
+ fusion = {
+ 'name': 'IterAvgFusionHandler',
+ 'path': 'ibmfl.aggregator.fusion.iter_avg_fusion_handler'
+ }
+ return fusion
+
+
+def get_local_training_config(configs_folder=None):
+ local_training_handler = {
+ 'name': 'LocalTrainingHandler',
+ 'path': 'ibmfl.party.training.local_training_handler'
+ }
+ return local_training_handler
+
+
+def get_hyperparams(model):
+ hyperparams = {
+ 'global': {
+ 'rounds': 3,
+ 'termination_accuracy': 0.9,
+ 'max_timeout': 60
+ }
+ }
+ current_module = globals().get('__package__')
+
+ model_module = import_module('{}.model_{}'.format(current_module, model))
+ local_params_method = getattr(model_module, 'get_hyperparams')
+
+ local_params = local_params_method()
+ hyperparams['local'] = local_params
+
+ return hyperparams
+
+
+def get_data_handler_config(party_id, dataset, folder_data, is_agg=False, model='keras'):
+
+ SUPPORTED_DATASETS = ['mnist', 'adult', 'cifar10', 'femnist', 'custom_dataset']
+ if dataset in SUPPORTED_DATASETS:
+        if model != 'keras':
+ dataset = dataset + "_" + model
+
+ data = datahandlers.get_datahandler_config(
+ dataset, folder_data, party_id, is_agg)
+ else:
+ raise Exception(
+ "The dataset {} is a wrong combination for fusion/model".format(dataset))
+ return data
+
+
+def get_model_config(folder_configs, dataset, is_agg=False, party_id=0, model='keras'):
+ SUPPORTED_MODELS = ['keras', 'pytorch', 'tf', 'sklearn']
+
+ if model not in SUPPORTED_MODELS:
+ raise Exception("Invalid model config for this fusion algorithm")
+
+ current_module = globals().get('__package__')
+
+ model_module = import_module('{}.model_{}'.format(current_module, model))
+ method = getattr(model_module, 'get_model_config')
+
+ return method(folder_configs, dataset, is_agg=is_agg, party_id=0)
+
diff --git a/openshift_fl/examples/iter_avg_openshift/model_keras.py b/openshift_fl/examples/iter_avg_openshift/model_keras.py
new file mode 100644
index 0000000..0f78477
--- /dev/null
+++ b/openshift_fl/examples/iter_avg_openshift/model_keras.py
@@ -0,0 +1,189 @@
+"""
+Licensed Materials - Property of IBM
+Restricted Materials of IBM
+20190891
+© Copyright IBM Corp. 2021 All Rights Reserved.
+"""
+import os
+import keras
+from keras import backend as K
+from keras.layers import Conv2D, MaxPooling2D
+from keras.layers import Dense, Dropout, Flatten
+from keras.models import Sequential
+
+
+def get_hyperparams():
+ local_params = {
+ 'training': {
+ 'epochs': 3
+ },
+ 'optimizer': {
+ 'lr': 0.01
+ }
+ }
+
+ return local_params
+
+def get_model_config(folder_configs, dataset, is_agg=False, party_id=0):
+
+ if is_agg:
+ return None
+
+ if dataset == 'mnist':
+ return get_mnist_model_config(folder_configs)
+
+ elif dataset == 'cifar10':
+ return get_cifar10_model_config(folder_configs)
+
+ elif dataset == 'femnist':
+ return get_femnist_model_config(folder_configs)
+ else:
+ raise Exception(
+ "The dataset {} is a wrong combination for fusion/model".format(dataset))
+
+
+def get_mnist_model_config(folder_configs):
+
+
+ num_classes = 10
+ img_rows, img_cols = 28, 28
+ if K.image_data_format() == 'channels_first':
+ input_shape = (1, img_rows, img_cols)
+ else:
+ input_shape = (img_rows, img_cols, 1)
+
+ model = Sequential()
+ model.add(Conv2D(32, kernel_size=(3, 3),
+ activation='relu',
+ input_shape=input_shape))
+ model.add(Conv2D(64, (3, 3), activation='relu'))
+ model.add(MaxPooling2D(pool_size=(2, 2)))
+ model.add(Dropout(0.25))
+ model.add(Flatten())
+ model.add(Dense(128, activation='relu'))
+ model.add(Dropout(0.5))
+ model.add(Dense(num_classes, activation='softmax'))
+
+ model.compile(loss=keras.losses.categorical_crossentropy,
+ optimizer=keras.optimizers.Adadelta(),
+ metrics=['accuracy'])
+
+ if not os.path.exists(folder_configs):
+ os.makedirs(folder_configs)
+
+ # Save model
+ fname = os.path.join(folder_configs, 'compiled_keras.h5')
+ model.save(fname)
+
+ K.clear_session()
+ # Generate model spec:
+ spec = {
+ 'model_name': 'keras-cnn',
+ 'model_definition': fname
+ }
+
+ model = {
+ 'name': 'KerasFLModel',
+ 'path': 'ibmfl.model.keras_fl_model',
+ 'spec': spec
+ }
+
+ return model
+
+
+def get_cifar10_model_config(folder_configs):
+
+    # The aggregator case (is_agg) is handled by get_model_config() above, so no check is needed here.
+
+ num_classes = 10
+ img_rows, img_cols = 32, 32
+ if K.image_data_format() == 'channels_first':
+ input_shape = (3, img_rows, img_cols)
+ else:
+ input_shape = (img_rows, img_cols, 3)
+
+ model = Sequential()
+ model.add(Conv2D(32, kernel_size=(3, 3), activation='relu', input_shape=input_shape))
+ model.add(MaxPooling2D(pool_size=(2, 2)))
+ model.add(Conv2D(64, kernel_size=(3, 3), activation='relu'))
+ model.add(MaxPooling2D(pool_size=(2, 2)))
+ model.add(Conv2D(128, kernel_size=(3, 3), activation='relu'))
+ model.add(MaxPooling2D(pool_size=(2, 2)))
+ model.add(Flatten())
+ model.add(Dense(256, activation='relu'))
+ model.add(Dense(128, activation='relu'))
+ model.add(Dense(num_classes, activation='softmax'))
+
+ model.compile(loss=keras.losses.categorical_crossentropy,
+ optimizer=keras.optimizers.Adam(),
+ metrics=['accuracy'])
+ if not os.path.exists(folder_configs):
+ os.makedirs(folder_configs)
+
+ # Save model
+ fname = os.path.join(folder_configs, 'compiled_cifar10_keras.h5')
+ model.save(fname)
+
+ K.clear_session()
+ # Generate model spec:
+ spec = {
+ 'model_name': 'keras-cnn-cifar10',
+ 'model_definition': fname
+ }
+
+ model = {
+ 'name': 'KerasFLModel',
+ 'path': 'ibmfl.model.keras_fl_model',
+ 'spec': spec
+ }
+
+ return model
+
+def get_femnist_model_config(folder_configs):
+    # The aggregator case (is_agg) is handled by get_model_config() above, so no check is needed here.
+
+ num_classes = 62
+ img_rows, img_cols = 28, 28
+ if K.image_data_format() == 'channels_first':
+ input_shape = (1, img_rows, img_cols)
+ else:
+ input_shape = (img_rows, img_cols, 1)
+
+ model = Sequential()
+ model.add(Conv2D(32, kernel_size=(5, 5),
+ activation='relu', padding='same',
+ input_shape=input_shape))
+ model.add(MaxPooling2D(pool_size=(2, 2), strides=2))
+
+ model.add(Conv2D(64, (5, 5), activation='relu', padding='same',))
+ model.add(MaxPooling2D(pool_size=(2, 2), strides=2))
+ model.add(Flatten())
+ model.add(Dense(2048, activation='relu'))
+ model.add(Dense(num_classes, activation='softmax'))
+
+ model.compile(loss=keras.losses.categorical_crossentropy,
+ optimizer=keras.optimizers.SGD(),
+ metrics=['accuracy'])
+ if not os.path.exists(folder_configs):
+ os.makedirs(folder_configs)
+
+ # Save model
+ fname = os.path.join(folder_configs, 'compiled_femnist_keras.h5')
+ model.save(fname)
+
+ K.clear_session()
+ # Generate model spec:
+ spec = {
+ 'model_name': 'keras-cnn',
+ 'model_definition': fname
+ }
+
+ model = {
+ 'name': 'KerasFLModel',
+ 'path': 'ibmfl.model.keras_fl_model',
+ 'spec': spec
+ }
+
+ return model
diff --git a/openshift_fl/examples/iter_avg_openshift/model_pytorch.py b/openshift_fl/examples/iter_avg_openshift/model_pytorch.py
new file mode 100644
index 0000000..35882c2
--- /dev/null
+++ b/openshift_fl/examples/iter_avg_openshift/model_pytorch.py
@@ -0,0 +1,56 @@
+"""
+Licensed Materials - Property of IBM
+Restricted Materials of IBM
+20190891
+© Copyright IBM Corp. 2021 All Rights Reserved.
+"""
+import os
+import torch
+from torch import nn
+
+
+def get_hyperparams():
+ local_params = {
+ 'training': {
+ 'epochs': 3,
+ 'lr': 1
+ },
+ 'optimizer': 'optim.Adadelta'
+ }
+
+ return local_params
+
+
+def get_model_config(folder_configs, dataset, is_agg=False, party_id=0):
+
+ if is_agg:
+ return None
+ model = nn.Sequential(nn.Conv2d(1, 32, 3, 1),
+ nn.ReLU(),
+ nn.Conv2d(32, 64, 3, 1),
+ nn.ReLU(),
+ nn.MaxPool2d(2, 2),
+ nn.Dropout2d(p=0.25),
+ nn.Flatten(),
+ nn.Linear(9216, 128),
+ nn.ReLU(),
+ nn.Dropout2d(p=0.5),
+ nn.Linear(128, 10),
+ nn.LogSoftmax(dim=1)
+ )
+ if not os.path.exists(folder_configs):
+ os.makedirs(folder_configs)
+
+ # Save model
+ fname = os.path.join(folder_configs, 'pytorch_sequence.pt')
+ torch.save(model, fname)
+ spec = {
+ 'model_name': 'pytorch-nn',
+ 'model_definition': fname
+ }
+ model = {
+ 'name': 'PytorchFLModel',
+ 'path': 'ibmfl.model.pytorch_fl_model',
+ 'spec': spec,
+ }
+ return model
diff --git a/openshift_fl/experiment_runner.py b/openshift_fl/experiment_runner.py
new file mode 100644
index 0000000..9137438
--- /dev/null
+++ b/openshift_fl/experiment_runner.py
@@ -0,0 +1,343 @@
+"""
+Licensed Materials - Property of IBM
+Restricted Materials of IBM
+20190891
+© Copyright IBM Corp. 2021 All Rights Reserved.
+"""
+import concurrent.futures
+import logging
+import os
+
+import yaml
+from data_copy_util import stage_trial_files
+from datetime import datetime, timezone
+from kubernetes import client, watch
+from kubernetes.client.rest import ApiException
+from pathlib import Path
+from time import sleep
+import numpy as np
+import threading
+import asyncio
+
+logger = logging.getLogger(__name__)
+
+
+class ExperimentRunner:
+ """
+    ExperimentRunner runs the federated training experiments: \
+    - Coordinates execution of FL commands between aggregator and party pods \
+    - Executes multiple trials of an experiment \
+    - Captures the experiment trace from aggregator and party pods \
+ """
+
+ def __init__(self, default, experiment, fl_spawner_dict):
+ self.default = default
+ self.experiment = experiment
+ self.fl_spawner_dict = fl_spawner_dict
+ self.aggregator_commands = None
+ self.image_name = self.get_image_name(default, experiment)
+ self.aggregator_commands = self.get_aggregator_commands(default)
+ ts = datetime.now(timezone.utc).strftime("%Y%m%d%H%M%S")
+ name = self.experiment.get('name') or 'ibmfl'
+ self.exp_id = '{}-{}'.format(name, ts)
+
+ def get_image_name(self, default, experiment):
+ """
+ Get image name to spawn the FL aggregator and party pods from config
+ :param default: dictionary contains the default configuration for all the experiments
+ :param experiment: dictionary contains info to run the experiment
+ :return:
+ """
+ image_name = 'ibmfl:latest'
+ if default is not None:
+ if default.get('image_name') is not None:
+ image_name = default.get('image_name')
+ image_name = experiment.get('image_name') or image_name
+ return image_name
+
+ def get_aggregator_commands(self, default):
+ """
+ Get aggregator commands from config
+ :param default: default configuration for all experiments
+ :return: aggregator commands eg ['START', 'TRAIN', 'SAVE','EVAL','STOP']
+ """
+ aggregator_commands = None
+ if default is not None:
+ commands = default.get('commands') or None
+ if commands is not None:
+ aggregator_commands = commands.get('aggregator') or None
+ return aggregator_commands
+
+ def get_aggregator_pod_name(self, trial_id):
+ """
+        Returns the aggregator pod name; it is a combination of the experiment id, the string 'agg', and the trial id
+ :param trial_id: integer represent trial id
+ :return: string
+ """
+ return "{}-agg-{}".format(self.exp_id, trial_id)
+
+ def get_party_pod_name(self, party_index, trial_id):
+ """
+        Returns the party pod name; it is a combination of the experiment id, the string 'party', the party index, and the trial id
+ :param party_index: integer represent party number
+ :param trial_id: integer represent trial id
+ :return: string
+ """
+ return "{}-party{}-{}".format(self.exp_id, party_index, trial_id)
+
+ def run_experiment(self):
+ """
+ Launches the experiment and runs multiple trials.
+ """
+ logger.info("Launching experiment with ID {}".format(self.exp_id))
+ trials_tot = self.experiment.get('num_trials') or 1
+ for trial in range(trials_tot):
+ self.run_trial(trial)
+ logger.info("Experiment {} completed ...............................................".format(self.exp_id))
+
+ def run_trial(self, trial_id):
+ """
+ Run the trial of the experiment
+ :param trial_id: integer represent trial id
+ """
+ logger.info("Trial {} run of {} started".format(trial_id, self.exp_id))
+ local_staging_dir = self.experiment['staging_dir']
+ exp_root_dir = '{}/{}'.format(local_staging_dir, self.exp_id)
+ exp_trial_dir = '{}/{}'.format(exp_root_dir, 'trial_{}'.format(trial_id))
+ exp_log_dir = '{}/{}'.format(exp_trial_dir, 'logs')
+ Path(exp_trial_dir).mkdir(parents=True, exist_ok=True)
+ os.mkdir(exp_log_dir)
+ exp_artifacts_dir = '{}/{}'.format(exp_trial_dir, 'staging')
+ pod_staging_dir = '/FL/staging_dir'
+ cos_mount_path = '/FL/datasets'
+
+ # TODO exp_artifacts_dir is used to refer the agg file, not a clean design
+ if self.experiment.get('data') is None:
+ proc_file_map = stage_trial_files(local_staging_dir, exp_artifacts_dir, pod_staging_dir)
+ else:
+ proc_file_map = stage_trial_files(local_staging_dir, exp_artifacts_dir, cos_mount_path)
+
+ agg_pod = self.get_aggregator_pod_name(trial_id)
+ # create agg pod
+ agg_cluster_context = list(self.fl_spawner_dict.keys())[0]
+ config_agg_dict = self.init_aggregator(agg_cluster_context, agg_pod, self.aggregator_commands,
+ pod_staging_dir,
+ exp_artifacts_dir, cos_mount_path,
+ self.image_name, proc_file_map, exp_log_dir)
+
+ # create agg service
+ # todo agg port from config
+
+ self.fl_spawner_dict[agg_cluster_context].create_service(agg_pod)
+
+ logger.info("Aggregator service created")
+
+ # create agg route
+ self.fl_spawner_dict[agg_cluster_context].create_route(agg_pod)
+ logger.info("Aggregator route created")
+
+ # get agg url
+ # todo rename variable
+ agg_ip = self.fl_spawner_dict[agg_cluster_context].get_route_url(agg_pod)
+
+ # create party pods
+ n_parties = config_agg_dict.get('hyperparams').get('global').get('num_parties')
+
+ # Distributed parties to cluster
+
+ fl_party_scheduler = self.schedule_parties_cluster(n_parties, self.fl_spawner_dict)
+
+ aggregator_started = False
+ pods_started = False
+ pods_stopped = False
+ w = None
+ try:
+ core_v1 = client.CoreV1Api(self.fl_spawner_dict[agg_cluster_context].k8s_client)
+ w = watch.Watch()
+ for agg_outline in w.stream(core_v1.read_namespaced_pod_log, name=agg_pod,
+ namespace=self.fl_spawner_dict[agg_cluster_context].namespace):
+ if "Aggregator start successful" in agg_outline:
+ aggregator_started = True
+ logger.info('Agg pod - {}-agg running successfully'.format(agg_pod))
+ if aggregator_started and not pods_started:
+ pods_started = self.init_parties(trial_id, agg_ip, pod_staging_dir, exp_artifacts_dir,
+ cos_mount_path,
+ self.image_name,
+ proc_file_map, exp_log_dir, fl_party_scheduler)
+ if "Aggregator stop successful" in agg_outline:
+ logger.info("Aggregator stop successful")
+ for party_index in fl_party_scheduler:
+ party_pod_name = self.get_party_pod_name(party_index, trial_id)
+ fl_party_scheduler[party_index].delete_pod(party_pod_name)
+ logger.info("Party {} pod deleted".format(party_pod_name))
+ fl_party_scheduler[party_index].delete_service(party_pod_name)
+ logger.info("Party {} service deleted".format(party_pod_name))
+ fl_party_scheduler[party_index].delete_routes(party_pod_name)
+ logger.info("Party {} route deleted".format(party_pod_name))
+
+ self.fl_spawner_dict[agg_cluster_context].delete_pod(agg_pod)
+ logger.info("Aggregator pod deleted")
+ self.fl_spawner_dict[agg_cluster_context].delete_service(agg_pod)
+ logger.info("Aggregator service deleted")
+ self.fl_spawner_dict[agg_cluster_context].delete_routes(agg_pod)
+ logger.info("Aggregator route deleted")
+ pods_stopped = True
+
+ break
+ except ApiException as e:
+ logger.error(e)
+ if not pods_stopped:
+ logger.error(e)
+
+ except ValueError as e:
+ logger.error(e)
+
+ except Exception as e:
+ logger.error(e)
+
+ finally:
+ w.stop()
+
+ logger.info('Trial {} run of {} completed ..................................'.format(trial_id, self.exp_id))
+
+ def schedule_parties_cluster(self, n_parties, fl_spawner_dict):
+ '''
+ In a multi cluster mode, the parties are split \
+ equally among clusters
+ :param n_parties: number of parties to split
+ :param fl_spawner_dict: dictionary of fl spawner
+ :return: return party scheduler with party to fl_spawner mapping
+ '''
+ fl_party_scheduler = {}
+ cluster_size = len(self.fl_spawner_dict)
+ party_split = np.array_split(range(n_parties), cluster_size)
+        index = -1
+        for key in self.fl_spawner_dict:
+            index += 1
+ for party_index in party_split[index]:
+ fl_party_scheduler[party_index] = fl_spawner_dict[key]
+ return fl_party_scheduler
+
+ def init_parties(self, trial_id, agg_ip, pod_staging_dir, exp_artifacts_dir, cos_mount_path, ibmfl_image,
+ proc_file_map, exp_log_dir, fl_party_scheduler):
+ """
+ Initialize the FL parties configured in the experiment, all parties are created and managed concurrently.
+ :param trial_id: integer represent the trial id
+ :param agg_ip: ip address of the aggregator
+ :param pod_staging_dir: staging directory of the pod to copy the configs and datasets
+ :param exp_artifacts_dir: local path where the config files and datasets are stored
+ :param cos_mount_path: data mount path in pods
+ :param ibmfl_image: docker image name to create the pod
+        :param fl_party_scheduler: mapping from party index to the FLSpawner that hosts that party
+ :param proc_file_map: map contains the config files and datasets path
+ :param exp_log_dir: log directory path to copy the pod logs
+ :return: status of the party pod created
+ """
+ # TODO : Spawn parties in parallel by using multi threading
+ for key in fl_party_scheduler:
+ self.init_party(trial_id=trial_id, agg_ip=agg_ip,
+ pod_staging_dir=pod_staging_dir,
+ exp_artifacts_dir=exp_artifacts_dir, cos_mount_path=cos_mount_path, ibmfl_image=ibmfl_image,
+ party_index=key,
+ proc_file_map=proc_file_map, exp_log_dir=exp_log_dir, fl_spawner=fl_party_scheduler[key])
+
+ return True
+
+ def init_party(self, trial_id, agg_ip, pod_staging_dir, exp_artifacts_dir, cos_mount_path, ibmfl_image, party_index,
+ proc_file_map,
+ exp_log_dir, fl_spawner):
+ """
+ creates the FL party pods , copy config and datasets to the pod and \
+ configure to stream the pod logs
+ to the local log directory
+ :param trial_id: integer represent the trial id
+ :param agg_ip: ip address of the aggregator
+ :param pod_staging_dir: staging directory of the pod to copy the configs and datasets
+ :param exp_artifacts_dir: local path of the config files and datasets are stored
+ :param cos_mount_path: data mount path in pods
+ :param ibmfl_image: docker image name to create the pod
+ :param party_index: integer to identify the party number
+ :param proc_file_map: map contains the config files and datasets path
+ :param exp_log_dir: log directory path to copy the pod logs
+ """
+ party_pod_name = self.get_party_pod_name(party_index, trial_id)
+ fl_spawner.spawn_party(party_pod_name, party_index, pod_staging_dir, cos_mount_path, ibmfl_image)
+ logger.info("Party pod {} created successfully".format(party_pod_name))
+ party_status = fl_spawner.get_pod_status(party_pod_name)
+ while party_status != 'Running':
+ sleep(10)
+ party_status = fl_spawner.get_pod_status(party_pod_name)
+ logger.info("Party pod {} running successfully".format(party_pod_name))
+ # create service
+ fl_spawner.create_service(party_pod_name)
+
+ # create route
+ fl_spawner.create_route(party_pod_name)
+ party_ip = fl_spawner.get_route_url(party_pod_name)
+ logger.info("Party {} route - {}".format(party_pod_name, party_ip))
+ config_party_path = '{}/config_party{}.yml'.format(
+ exp_artifacts_dir, party_index)
+ with open(config_party_path) as config_party_file:
+ config_party_str = config_party_file.read()
+ config_party_dict = yaml.load(config_party_str, Loader=yaml.Loader)
+ del config_party_dict['aggregator']['ip']
+ del config_party_dict['aggregator']['port']
+ config_party_dict['connection']['info']['ip'] = '0.0.0.0'
+ config_party_dict['connection']['info']['port'] = 5000
+ config_party_dict['aggregator']['url'] = agg_ip
+ config_party_dict['connection']['info']['url'] = party_ip
+ with open(config_party_path, 'w') as config_file:
+ yaml.dump(config_party_dict, config_file)
+ fl_spawner.copy_dataset_configs_to_pods(party_pod_name,
+ proc_file_map['party{}'.format(party_index)],
+ pod_staging_dir)
+ logger.info("Copy dataset and configs to party pod {} completed".format(party_pod_name))
+ # call to capture logs
+ party_log_executor = concurrent.futures.ThreadPoolExecutor(max_workers=1)
+ party_log_executor.submit(fl_spawner.get_logs_from_pod, party_pod_name,
+ '{}/party-{}.out'.format(exp_log_dir, party_index))
+
+ def init_aggregator(self, agg_cluster_context, agg_pod, agg_commands, pod_staging_dir, exp_artifacts_dir,
+ cos_mount_path, ibmfl_image,
+ proc_file_map, exp_log_dir):
+ """
+ Initialize the FL aggregator includes aggregator pod creation, copy config files and datasets to \
+ aggregator pod, run the experiment, configure streaming of the aggregator pod logs
+ :param agg_pod: name of the aggregator pod to be created
+ :param agg_commands: commands to run the training configured by the user
+ :param pod_staging_dir: staging directory of the pod to copy the configs and datasets
+ :param exp_artifacts_dir: local path of the config files and datasets are stored
+ :param ibmfl_image: docker image name to create the pod
+ :param proc_file_map: map contains the config files and datasets path
+ :param exp_log_dir: log directory path to copy the pod logs
+ :param cos_mount_path: data mount path in pods
+ :return: aggreagator config file
+        :return: aggregator config dictionary
+
+ self.fl_spawner_dict[agg_cluster_context].spawn_aggregator(agg_pod, pod_staging_dir, cos_mount_path,
+ ibmfl_image)
+ logger.info("Agg pod - {}-agg created successfully".format(self.exp_id))
+
+ # check agg status
+ pod_status = self.fl_spawner_dict[agg_cluster_context].get_pod_status(agg_pod)
+ while pod_status != 'Running':
+ sleep(10)
+ pod_status = self.fl_spawner_dict[agg_cluster_context].get_pod_status(agg_pod)
+ # update agg ip to 0.0.0.0
+ config_agg_path = '{}/config_agg.yml'.format(
+ exp_artifacts_dir)
+ with open(config_agg_path) as config_agg_file:
+ config_agg_str = config_agg_file.read()
+ config_agg_dict = yaml.load(config_agg_str, Loader=yaml.Loader)
+ config_agg_dict['connection']['info']['ip'] = '0.0.0.0'
+ with open(config_agg_path, 'w') as config_file:
+ yaml.dump(config_agg_dict, config_file)
+ # copy staging dir
+ self.fl_spawner_dict[agg_cluster_context].copy_dataset_configs_to_pods(agg_pod, proc_file_map['agg'],
+ pod_staging_dir, agg_commands)
+ logger.info("Copy datasets and configs completed for aggregator pod - {} completed".format(agg_pod))
+ agg_log_executor = concurrent.futures.ThreadPoolExecutor(max_workers=1)
+ agg_log_executor.submit(self.fl_spawner_dict[agg_cluster_context].get_logs_from_pod, pod_name=agg_pod,
+ log_path='{}/agg.out'.format(exp_log_dir))
+
+ return config_agg_dict
diff --git a/openshift_fl/fl_spawner.py b/openshift_fl/fl_spawner.py
new file mode 100644
index 0000000..2eead4b
--- /dev/null
+++ b/openshift_fl/fl_spawner.py
@@ -0,0 +1,349 @@
+"""
+Licensed Materials - Property of IBM
+Restricted Materials of IBM
+20190891
+© Copyright IBM Corp. 2021 All Rights Reserved.
+"""
+import logging
+import os
+
+import yaml
+from kubernetes import client, config, watch
+from kubernetes.client import ApiException
+from kubernetes.stream import stream
+from openshift.dynamic import DynamicClient
+
+logger = logging.getLogger(__name__)
+import tarfile
+from tempfile import TemporaryFile
+
+
+class FLSpawner:
+ """
+    FLSpawner creates and manages the FL aggregator and party pods in a Kubernetes cluster \
+    using the Kubernetes client APIs.
+ """
+
+ def __init__(self, cluster, namespace, config_file=None, context=None, data=None):
+ """
+        Instantiate the FLSpawner based on the cluster info and kube config file; \
+        the kube config file is generated by the Kubernetes client when you set up the \
+        credentials to access the cluster
+ :param cluster: cluster dictionary contains information about namespace, \
+ cpu and memory requirements
+ :param namespace: namespace of the cluster
+ :param config_file: user provided config file, if not provided \
+ use the default kube config file
+ :param context: context of the cluster
+        :param data: data config containing the PVC (persistent volume claim) name for the COS bucket
+ """
+
+ self.k8s_client = config.new_client_from_config(config_file, context, True)
+ self.dynamic_client = DynamicClient(self.k8s_client)
+ self.cluster = cluster
+ self.namespace = namespace
+ self.config_file = config_file
+ self.context = context
+ self.data = data
+ __location__ = os.path.realpath(
+ os.path.join(os.getcwd(), os.path.dirname(__file__)))
+ if data is None:
+ with open(os.path.join(__location__, 'pod_template.yml')) as pod_tmpl_file:
+ self.pod_tmpl = pod_tmpl_file.read()
+ else:
+ with open(os.path.join(__location__, 'pod_persistence_template.yml')) as pod_tmpl_file:
+ self.pod_tmpl = pod_tmpl_file.read()
+ with open(os.path.join(__location__, 'service_template.yml')) as service_tmpl_file:
+ self.service_tmpl = service_tmpl_file.read()
+ with open(os.path.join(__location__, 'route_template.yml')) as route_tmpl_file:
+ self.route_tmpl = route_tmpl_file.read()
+
+ def create_pod(self, pod_name, image_name, role, command_list, cos_mount_path, cpu="2", memory="4Gi"):
+ """
+ Create pod in a kubernetes cluster based on the pod_template file
+ :param pod_name: string to specify the name of the pod
+ :param image_name: name of the docker image to create the pod
+ :param role: role to group the pods
+ :param command_list: pod start commands
+ :param cpu: cpu required to run the pod
+ :param memory: memory required to run the pod
+ :param cos_mount_path: data mount path in the pod to mount the pvc
+ """
+ if self.data is None:
+ pod = self.pod_tmpl.format(pod_name, pod_name, self.namespace, image_name, cpu, memory, command_list)
+ else:
+ pod = self.pod_tmpl.format(pod_name, pod_name, self.namespace, image_name, cos_mount_path, cpu, memory,
+ command_list, self.data.get('pvc_name'))
+ v1_pod = self.dynamic_client.resources.get(api_version='v1', kind='Pod')
+ pod_data = yaml.safe_load(pod)
+ resp = v1_pod.create(body=pod_data, namespace=self.namespace)
+
+ def get_pod_ip(self, pod_name):
+ """
+ Get pod ip address by pod name
+ :param pod_name: name of pod to fetch the ip address
+ :return: pod ip
+ """
+ v1_pods = self.dynamic_client.resources.get(api_version='v1', kind='Pod')
+
+ res = v1_pods.get(name=pod_name, namespace=self.namespace)
+ return res.status.podIP
+
+ def get_route_url(self, pod_name):
+ """
+        Get the route URL by pod name
+        :param pod_name: name of pod to fetch the route URL for
+ :return: route URL
+ """
+ v1_route = self.dynamic_client.resources.get(api_version='v1', kind='Route')
+
+ res = v1_route.get(name=pod_name, namespace=self.namespace)
+        return "https://{}".format(res.spec.host)
+
+ def get_pod_status(self, pod_name):
+ """
+ Get pod status by pod name
+ :param pod_name:
+ :return: return the pod states, pod states can be [Waiting, Running , Terminated]
+ """
+ v1_pods = self.dynamic_client.resources.get(api_version='v1', kind='Pod')
+ res = v1_pods.get(name=pod_name, namespace=self.namespace)
+ return res.status.phase
+
+ def create_service(self, pod_name):
+ """
+ Create service for pods
+ :param pod_name: pod name
+ :return: status of service
+ """
+ service = self.service_tmpl.format(pod_name, pod_name)
+ v1_services = self.dynamic_client.resources.get(api_version='v1', kind='Service')
+ service_data = yaml.safe_load(service)
+ resp = v1_services.create(body=service_data, namespace=self.namespace)
+
+ def create_route(self, pod_name):
+ """
+ Create routes for the pods
+ :param pod_name: name of the pod
+ :return: status of the routes
+ """
+ route = self.route_tmpl.format(pod_name, pod_name, pod_name)
+ v1_routes = self.dynamic_client.resources.get(api_version='route.openshift.io/v1', kind='Route')
+ route_data = yaml.safe_load(route)
+ resp = v1_routes.create(body=route_data, namespace=self.namespace)
+
+ def execute_copy_commands(self, name, source_file, destination_file):
+ """
+        NOTE: this method is currently not used since it cannot handle \
+        copying of large data files.
+        Copy files from the local machine to pods; deprecated because there is an \
+        issue when copying training files
+ :param name: name of the pod
+ :param source_file: source file
+ :param destination_file: destination file
+ """
+ core_v1 = client.CoreV1Api(self.k8s_client)
+ try:
+ exec_command = ['tar', 'xvf', '-', '-C', '/']
+ api_response = stream(core_v1.connect_get_namespaced_pod_exec, name, self.namespace,
+ command=exec_command,
+ stderr=True, stdin=True,
+ stdout=True, tty=False,
+ _preload_content=False)
+
+ with TemporaryFile() as tar_buffer:
+ with tarfile.open(fileobj=tar_buffer, mode='w') as tar:
+ tar.add(source_file, destination_file)
+
+ tar_buffer.seek(0)
+ commands = []
+ commands.append(tar_buffer.read())
+
+ while api_response.is_open():
+ api_response.update(timeout=1)
+ if api_response.peek_stdout():
+ print("STDOUT: %s" % api_response.read_stdout())
+ if api_response.peek_stderr():
+ print("STDERR: %s" % api_response.read_stderr())
+ if commands:
+ c = commands.pop(0)
+ api_response.write_stdin(c.decode())
+ else:
+ break
+ api_response.close()
+ except ApiException as e:
+ print("Exception when copying file to the pod%s \n" % e)
+
+ def execute_shell_commands(self, name, sh_cmd_list):
+ """
+ Execute the shell commands passed as parameter inside a running pod
+ :param name: name of pod to run the shell command
+ :param sh_cmd_list: sh command list
+ """
+ exec_command = ['/bin/sh']
+ # copy from local to remote
+ core_v1 = client.CoreV1Api(self.k8s_client)
+ resp = stream(core_v1.connect_get_namespaced_pod_exec,
+ name,
+ self.namespace,
+ command=exec_command,
+ stderr=True, stdin=True,
+ stdout=True, tty=False,
+ _preload_content=False)
+ commands = sh_cmd_list
+
+ while resp.is_open():
+ resp.update(timeout=1)
+ if resp.peek_stdout():
+ logger.info("STDOUT: %s" % resp.read_stdout())
+ if resp.peek_stderr():
+ logger.info("STDERR: %s" % resp.read_stderr())
+ if commands:
+ c = commands.pop(0)
+ resp.write_stdin(c + "\n")
+ else:
+ break
+
+ resp.close()
+
+ def copy_files(self, pod_name, src_file, dest_file):
+ """
+ Copy files from local machine to running pod using kubectl cp command
+ :param pod_name: name of the pod where the files to be copied
+ :param src_file: absolute path of source file
+ :param dest_file: absolute path of destination file
+ """
+ if self.config_file is None:
+ os.system(
+ "kubectl cp {} {}/{}:{}".format(src_file, self.namespace, pod_name, dest_file))
+ else:
+ os.system(
+ "kubectl config --kubeconfig={} use-context {} && kubectl --kubeconfig={} cp {} {}/{}:{}".format(
+ self.config_file, self.context, self.config_file, src_file, self.namespace, pod_name, dest_file))
+
+ def delete_pod(self, pod_name):
+ """
+ Delete a running pod by pod name
+ :param pod_name: name of the pod to be deleted
+ """
+ try:
+ core_v1 = client.CoreV1Api(self.k8s_client)
+ core_v1.delete_namespaced_pod(name=pod_name, namespace=self.namespace,
+ body=client.V1DeleteOptions())
+ except ApiException as e:
+ print('Error deleting pod {}'.format(e))
+
+ def delete_service(self, service_name):
+ """
+        Delete a service by service name
+ :param service_name: name of the service to be deleted
+ """
+ try:
+ core_v1 = client.CoreV1Api(self.k8s_client)
+ core_v1.delete_namespaced_service(name=service_name, namespace=self.namespace)
+ except Exception as e:
+ logger.debug('Error deleting service {}'.format(e))
+
+ def delete_routes(self, route_name):
+ """
+ Delete routes by route name
+ :param route_name: name of the route to be deleted
+ """
+ try:
+ dynamic_client = DynamicClient(self.k8s_client)
+ v1_services = dynamic_client.resources.get(api_version='route.openshift.io/v1', kind='Route')
+ v1_services.delete(name=route_name, namespace=self.namespace)
+ except Exception as e:
+ logger.debug('Error deleting route {}'.format(e))
+
+ def get_logs_from_pod(self, pod_name, log_path):
+ """
+ Configure kubernetes watcher for the pod so that the pod logs \
+ will be streamed to the log file
+ :param pod_name: name of the pod to get the logs
+ :param log_path: path of the log file
+ """
+ with open(log_path, 'a+') as config_file:
+ try:
+ core_v1 = client.CoreV1Api(self.k8s_client)
+ w = watch.Watch()
+ for e in w.stream(core_v1.read_namespaced_pod_log, name=pod_name, namespace=self.namespace, follow=True,
+ _preload_content=False):
+ config_file.write('{}\n'.format(e))
+ except Exception as e:
+ print(e)
+ finally:
+ w.stop()
+
+ def watch_pods(self, pod_name, log_path):
+ v1_pod = self.dynamic_client.resources.get(api_version='v1', kind='Pod')
+
+ # Prints the resource that triggered each event related to Services in the 'test' namespace
+ with open(log_path, 'a+') as config_file:
+ for event in v1_pod.watch(namespace=self.namespace, name=pod_name):
+ config_file.write('{}\n'.format(event))
+
+ def spawn_aggregator(self, pod_name, pod_staging_dir, cos_mount_path, image_name):
+ """
+ Spawn a FL aggregator as a pod and start the aggregator
+ :param pod_name: name of the aggregator pod
+ :param pod_staging_dir: pod staging directory to load the configs and datasets for training
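+        :param cos_mount_path: mount path inside the pod where the COS persistent volume is mounted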
+ :param image_name: FL docker image name to create the aggregator pod
+ """
+ cpu = self.cluster['agg_pod']['cpu'] or "2"
+ memory = self.cluster['agg_pod']['memory'] or "4Gi"
+ image_name = image_name or "ibmfl:latest"
+ label_role = "ibmfl"
+ command_list = ["python3", "/FL/openshift_fl/run_agg.py", "{}/config_agg.yml".format(pod_staging_dir)]
+ self.create_pod(pod_name, image_name, label_role, command_list, cos_mount_path, cpu, memory)
+
+ def spawn_party(self, pod_name, party_index, pod_staging_dir, cos_mount_path, image_name):
+ """
+ Spawn a FL party as a pod and start the party
+ :param pod_name: name of the party pod
+        :param party_index: index to identify the party when running multiple parties
+ :param pod_staging_dir: pod staging directory to load the configs and datasets for training
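+        :param cos_mount_path: mount path inside the pod where the COS persistent volume is mounted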
+ :param image_name: FL docker image name to create party pod
+ """
+ cpu = self.cluster['party_pod']['cpu'] or "2"
+ memory = self.cluster['party_pod']['memory'] or "4Gi"
+ image_name = image_name or "ibmfl:latest"
+ label_role = "ibmfl"
+ command_list = ["python3", "/FL/openshift_fl/run_party.py",
+ "{}/config_party{}.yml".format(pod_staging_dir, party_index)]
+ self.create_pod(pod_name, image_name, label_role, command_list, cos_mount_path, cpu, memory)
+
+ def copy_dataset_configs_to_pods(self, pod_name, file_list, pod_staging_dir, commands=None):
+ """
+ copy datasets and config files to pods to run the experiments
+ :param pod_name: name of the pod to copy the files
+ :param file_list: source file list to be copied to the pod
+ :param pod_staging_dir: pod staging directory to copy the files and datasets
+ :param commands: commands configured by user to run as part of training, commands \
+ can be ['START', 'TRAIN', 'SAVE','EVAL','STOP']
+ """
+ end_of_file_marker = [
+ "echo copied >> /tmp/end_of_file_marker.txt"
+ ]
+
+ create_trial_dir_command = [
+ "mkdir {}".format(pod_staging_dir)
+ ]
+ self.execute_shell_commands(pod_name, create_trial_dir_command[:])
+ logger.info("Creating trial directory in pod - {} completed".format(pod_name))
+ for file in file_list:
+ file_base_name = os.path.basename(file)
+ if self.data is None:
+ self.copy_files(pod_name, file, '{}/{}'.format(pod_staging_dir, file_base_name))
+ logger.info("Copying file {} to pod - {} completed".format(file_base_name, pod_name))
+ else:
+ if file_base_name.endswith('.yml'):
+ self.copy_files(pod_name, file, '{}/{}'.format(pod_staging_dir, file_base_name))
+ logger.info("Copying file {} to pod - {} completed".format(file_base_name, pod_name))
+
+ if commands is not None:
+ commands_str = [
+ "echo {} >> /tmp/commands.txt".format(commands)
+ ]
+ self.execute_shell_commands(pod_name, commands_str[:])
+ self.execute_shell_commands(pod_name, end_of_file_marker[:])
diff --git a/openshift_fl/ibmfl-base.json b/openshift_fl/ibmfl-base.json
new file mode 100644
index 0000000..b77c840
--- /dev/null
+++ b/openshift_fl/ibmfl-base.json
@@ -0,0 +1,30 @@
+{
+ "kind": "List",
+ "apiVersion": "v1",
+ "items": [
+ {
+ "kind": "ImageStream",
+ "apiVersion": "image.openshift.io/v1",
+ "metadata": {
+ "name": "ibmfl"
+ },
+ "spec": {
+ "lookupPolicy": {
+ "local": true
+ },
+ "tags": [
+ {
+ "name": "latest",
+ "from": {
+ "kind": "DockerImage",
+ "name": "us.icr.io/ibmfl-openshift/ffl-base:latest"
+ },
+ "referencePolicy": {
+ "type": "Local"
+ }
+ }
+ ]
+ }
+ }
+ ]
+}
\ No newline at end of file
diff --git a/openshift_fl/orchestrator.py b/openshift_fl/orchestrator.py
new file mode 100644
index 0000000..eb2db9e
--- /dev/null
+++ b/openshift_fl/orchestrator.py
@@ -0,0 +1,149 @@
+"""
+Licensed Materials - Property of IBM
+Restricted Materials of IBM
+20190891
+© Copyright IBM Corp. 2021 All Rights Reserved.
+"""
+import concurrent.futures
+import logging
+import os
+import sys
+
+import re
+import yaml
+from experiment_runner import ExperimentRunner
+from fl_spawner import FLSpawner
+
+fl_path = os.path.abspath('.')
+if fl_path not in sys.path:
+ sys.path.append(fl_path)
+
+from ibmfl.util.config import configure_logging_from_file
+
+logger = logging.getLogger(__name__)
+
+
+class Orchestrator:
+ """
+ Orchestrator runs FL experiments in OpenShift clusters
+ """
+
+ def __init__(self, config_global):
+ """
+ Instantiate the orchestrator based on the config file provided \
+        :param config_global: dictionary loaded from the yml file that contains the cluster \
+ and experiment info
+ """
+ configure_logging_from_file()
+ self.cluster = config_global.get('cluster') or None
+ self.experiments = config_global.get('experiments') or None
+ self.validate_config()
+
+ def start(self):
+ """
+ Launches the experiments configured in the yml file either in \
+ sequential or parallel execution mode
+ """
+ exec_mode = None
+ self.config_file = self.cluster.get('kube_config_location')
+ experiment_default = self.experiments.get('default') or None
+ if experiment_default is not None:
+ exec_mode = experiment_default.get('exec_mode') or None
+ experiment_list = self.experiments.get('experiment_list') or []
+ if exec_mode is not None and exec_mode == 'parallel':
+ with concurrent.futures.ThreadPoolExecutor(max_workers=5) as executor:
+ for experiment in experiment_list:
+ fl_spawner_dict = self.get_cluster_configs(experiment)
+ experiment_runner = ExperimentRunner(experiment_default, experiment, fl_spawner_dict)
+ executor.submit(experiment_runner.run_experiment)
+ else:
+ for experiment in experiment_list:
+ fl_spawner_dict = self.get_cluster_configs(experiment)
+ experiment_runner = ExperimentRunner(experiment_default, experiment, fl_spawner_dict)
+ experiment_runner.run_experiment()
+
+ def get_cluster_configs(self, experiment):
+ """
+ Retrieve list of cluster connect details from orchestrator config
+ :param experiment: experiment info
+ :return: dictionary (key - context_name of cluster, \
+ value - cluster info)
+ """
+ data=experiment.get('data') or None
+ cluster_list = experiment.get('cluster_list') or None
+ fl_spawner_dict = {}
+ for cluster in cluster_list:
+ context_name = cluster.get('context_name')
+ namespace = cluster.get('namespace')
+ fl_spawner_dict[context_name] = FLSpawner(self.cluster, namespace, self.config_file, context_name, data)
+ return fl_spawner_dict
+
+ def validate_config(self):
+ """
+ Validate the orchestrator config file
+ """
+ if self.cluster is None:
+ raise ValueError('Cluster cannot be empty')
+ if self.experiments is None:
+ raise ValueError('Experiments cannot be empty')
+ if self.experiments.get('experiment_list') is None:
+ raise ValueError('Experiment list cannot be empty')
+
+ default = self.experiments.get('default')
+ if default is not None:
+ commands = default.get('commands')
+ if commands is not None:
+ agg_commands = commands.get('aggregator')
+ if agg_commands is not None:
+ commands_list = ['START', 'TRAIN', 'EVAL', 'SAVE', 'STOP']
+ for agg_command in agg_commands:
+                        if agg_command.strip().upper() not in commands_list:
+ raise ValueError('{} not a valid aggregator command'.format(agg_command))
+ exec_mode = default.get('exec_mode')
+ if exec_mode is not None:
+ exec_mode_list = ['seq', 'parallel']
+ if exec_mode not in exec_mode_list:
+ raise ValueError('{} not a valid exec mode'.format(exec_mode))
+
+        regex = '[a-z0-9]([-a-z0-9]*[a-z0-9])?(\\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*'
+ experiment_list = self.experiments.get('experiment_list')
+ for experiment in experiment_list:
+ exp_name = experiment.get('name')
+ staging_dir = experiment.get('staging_dir')
+ if exp_name is not None:
+ if not re.match(regex, exp_name):
+ raise ValueError(
+ 'Name must consist of lower case alphanumeric characters, '
+ '\'-\' or \'.\', and must start and end with an alphanumeric character')
+ if staging_dir is None:
+ raise ValueError('Staging dir cannot be empty for exp : ', exp_name)
+
+ cluster_list = experiment.get('cluster_list') or None
+ if cluster_list is None:
+                raise ValueError('At least one cluster configuration is required')
+ else:
+ for cluster in cluster_list:
+ context_name = cluster.get('context_name')
+ if context_name is None:
+ raise ValueError('Context name for cluster cannot be empty')
+ namespace = cluster.get('namespace')
+ if namespace is None:
+ raise ValueError('Namespace for cluster cannot be empty')
+ data = experiment.get('data') or None
+ if data is not None:
+ pvc_name = data.get('pvc_name')
+ if pvc_name is None:
+ raise ValueError('pvc_name for data cannot be empty')
+
+if __name__ == '__main__':
+ """
+ Main function to create orchestrator instance \
+ using yaml configuration file
+ """
+ if len(sys.argv) < 2 or len(sys.argv) > 2:
+ raise ValueError('Please provide yaml configuration')
+
+ with open(sys.argv[1]) as config_global_file:
+ config_global = yaml.load(config_global_file.read(), Loader=yaml.Loader)
+ orchestrator = Orchestrator(config_global)
+ orchestrator.start()
diff --git a/openshift_fl/pod_persistence_template.yml b/openshift_fl/pod_persistence_template.yml
new file mode 100644
index 0000000..10cafb8
--- /dev/null
+++ b/openshift_fl/pod_persistence_template.yml
@@ -0,0 +1,31 @@
+kind: Pod
+apiVersion: v1
+metadata:
+ name: {}
+ labels:
+ app: {}
+spec:
+ containers:
+ - name: ibmfl
+ image: image-registry.openshift-image-registry.svc:5000/{}/{}
+ volumeMounts:
+ - name: datasets-storage
+ mountPath: {}
+ resources:
+ limits:
+ cpu: "{}"
+ memory: "{}"
+ requests:
+ cpu: "0.5"
+ memory: "2Gi"
+ ports:
+ - name: ibmfl
+ containerPort: 80
+ protocol: TCP
+ command: {}
+ volumes:
+ - name: datasets-storage
+ persistentVolumeClaim:
+ claimName: {}
+
+
diff --git a/openshift_fl/pod_template.yml b/openshift_fl/pod_template.yml
new file mode 100644
index 0000000..eb9429e
--- /dev/null
+++ b/openshift_fl/pod_template.yml
@@ -0,0 +1,22 @@
+kind: Pod
+apiVersion: v1
+metadata:
+ name: {}
+ labels:
+ app: {}
+spec:
+ containers:
+ - name: ibmfl
+ image: image-registry.openshift-image-registry.svc:5000/{}/{}
+ resources:
+ limits:
+ cpu: "{}"
+ memory: "{}"
+ requests:
+ cpu: "0.5"
+ memory: "2Gi"
+ ports:
+ - name: ibmfl
+ containerPort: 80
+ protocol: TCP
+ command: {}
diff --git a/openshift_fl/route_template.yml b/openshift_fl/route_template.yml
new file mode 100644
index 0000000..05bd98c
--- /dev/null
+++ b/openshift_fl/route_template.yml
@@ -0,0 +1,11 @@
+apiVersion: route.openshift.io/v1
+kind: Route
+metadata:
+ name: {}
+spec:
+ to:
+ kind: Service
+ name: {}
+ tls:
+ termination: edge
+ insecureEdgeTerminationPolicy: Redirect
diff --git a/openshift_fl/run_agg.py b/openshift_fl/run_agg.py
new file mode 100644
index 0000000..a2cdaf5
--- /dev/null
+++ b/openshift_fl/run_agg.py
@@ -0,0 +1,86 @@
+"""
+Licensed Materials - Property of IBM
+Restricted Materials of IBM
+20190891
+© Copyright IBM Corp. 2021 All Rights Reserved.
+"""
+#!/usr/bin/env python3
+
+import logging
+import os
+import sys
+
+from time import sleep
+
+from ibmfl.aggregator.aggregator import Aggregator
+from ibmfl.aggregator.states import States
+from ibmfl.util.config import get_config_from_file
+
+fl_path = os.path.abspath('.')
+if fl_path not in sys.path:
+ sys.path.append(fl_path)
+
+logger = logging.getLogger(__name__)
+
+if __name__ == '__main__':
+ """
+ Main function can be used to create an application out \
+ of our Aggregator class which could be interactive
+ """
+ if len(sys.argv) < 2 or len(sys.argv) > 2:
+ logger.error('Please provide yaml configuration')
+
+ server_process = None
+ config_file = sys.argv[1]
+ if not os.path.isfile(config_file):
+ logger.debug("config file '{}' does not exist".format(config_file))
+
+ """
+ Check for end of file marker which indicates copy of the config \
+ and data files are completed
+ """
+ file_copy_marker = '/tmp/end_of_file_marker.txt'
+ while not os.path.isfile(file_copy_marker):
+        logger.info("Waiting for config files and datasets")
+ sleep(10)
+
+ # Read commands from config file
+ commands = ['START', 'TRAIN', 'EVAL', 'STOP']
+ if os.path.isfile('/tmp/commands.txt'):
+ with open('/tmp/commands.txt') as cmds:
+ cmd_str = cmds.read()
+ commands = cmd_str.split('[', 1)[1].split(']')[0].split(',')
+
+ config_dict = get_config_from_file(config_file)
+ n_parties = config_dict['hyperparams']['global']['num_parties']
+ logger.info("Going to wait for {} parties to register.".format(n_parties))
+ agg = Aggregator(config_file=config_file)
+ for command in commands:
+ if command.strip().lower() == ('START').lower():
+ agg.proto_handler.state = States.CLI_WAIT
+ logger.info("State: " + str(agg.proto_handler.state))
+ # Start server
+ agg.start()
+ while agg.proto_handler.get_n_parties() < n_parties:
+ sleep(1)
+ logger.info("All parties registered!")
+ sleep(10)
+ elif command.strip().lower() == ('STOP').lower():
+ logger.info("Aggregator stop successful")
+ while True:
+ logger.info("Waiting : Aggregator stop successful")
+ sleep(10)
+ #agg.stop()
+ break
+ elif command.strip().lower() == ('TRAIN').lower():
+ logger.info("State: " + str(agg.proto_handler.state))
+ agg.start_training()
+ elif command.strip().lower() == ('SAVE').lower():
+ logger.info("State: " + str(agg.proto_handler.state))
+ agg.save_model()
+ elif command.strip().lower() == ('EVAL').lower():
+ logger.info("State: " + str(agg.proto_handler.state))
+ agg.eval_model()
+ elif command.strip().lower() == ('SYNC').lower():
+ logger.info("State: " + str(agg.proto_handler.state))
+ agg.model_synch()
diff --git a/openshift_fl/run_party.py b/openshift_fl/run_party.py
new file mode 100644
index 0000000..c8a446a
--- /dev/null
+++ b/openshift_fl/run_party.py
@@ -0,0 +1,62 @@
+"""
+Licensed Materials - Property of IBM
+Restricted Materials of IBM
+20190891
+© Copyright IBM Corp. 2021 All Rights Reserved.
+"""
+#!/usr/bin/env python3
+
+import logging
+import os
+import sys
+
+from time import sleep
+
+from ibmfl.party.party import Party
+from ibmfl.party.status_type import StatusType
+
+fl_path = os.path.abspath('.')
+if fl_path not in sys.path:
+ sys.path.append(fl_path)
+
+logger = logging.getLogger(__name__)
+
+if __name__ == '__main__':
+ """
+    Main function can be used to create an application out \
+    of our Party class which could be interactive
+ """
+ if len(sys.argv) < 2 or len(sys.argv) > 2:
+ logger.error('Please provide yaml configuration')
+ config_file = sys.argv[1]
+ if not os.path.isfile(config_file):
+ logger.debug("config file '{}' does not exist".format(config_file))
+
+ """
+ Check for end of file marker which indicates copy of the config \
+ and data files are completed
+ """
+ file_copy_completes = '/tmp/end_of_file_marker.txt'
+ while not os.path.isfile(file_copy_completes):
+ print("Waiting for config files and datasets")
+ sleep(10)
+
+ p = Party(config_file=config_file)
+ commands = ['START', 'REGISTER']
+ # Loop over commands passed by runner
+ for command in commands:
+ if command.lower() == ('START').lower():
+ # Start server
+ p.start()
+ if command.lower() == ('STOP').lower():
+ p.connection.stop()
+ break
+ if command.lower() == ('REGISTER').lower():
+ p.register_party()
+
+ # Stop only when aggregator tells us;
+ # in the future, dynamically deciding commands can be supported.
+ while p.proto_handler.status != StatusType.STOPPING:
+ sleep(1)
+
+ p.stop()
diff --git a/openshift_fl/service_template.yml b/openshift_fl/service_template.yml
new file mode 100644
index 0000000..27cf92d
--- /dev/null
+++ b/openshift_fl/service_template.yml
@@ -0,0 +1,11 @@
+kind: Service
+apiVersion: v1
+metadata:
+ name: {}
+spec:
+ selector:
+ app: {}
+ ports:
+ - protocol: TCP
+ port: 5000
+ targetPort: 5000
diff --git a/requirements.txt b/requirements.txt
index a756354..29b57ce 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -32,3 +32,6 @@ dm-tree
GPUtil
paramiko
http://github.com/IBM/pycloudmessenger/archive/v0.7.1.tar.gz
+jupyter
+kubernetes
+openshift
\ No newline at end of file
diff --git a/runner/README.md b/runner/README.md
deleted file mode 100644
index 4c92ab3..0000000
--- a/runner/README.md
+++ /dev/null
@@ -1,7 +0,0 @@
-# IBMFL Testing Framework
-
-Here you can find scripts which allow us to easily set up and run tests for IBMFL in an automated manner. It is possible to run a series of identical trials which vary in a parameterized way without any manual command entry or otherwise.
-
-###
-* [Experiment Manager](exp_manager/usage_guide.md)
-* [Command Line runner](runner_cli.md)
diff --git a/runner/examples/pendulum/config_agg_tmpl.yml b/runner/examples/pendulum/config_agg_tmpl.yml
deleted file mode 100644
index 7d167b6..0000000
--- a/runner/examples/pendulum/config_agg_tmpl.yml
+++ /dev/null
@@ -1,17 +0,0 @@
-connection:
- info:
- ip: ${agg_ip}
- port: ${agg_port}
- name: FlaskConnection
- path: ibmfl.connection.flask_connection
- sync: false
-fusion:
- name: RLFusionHandler
- path: ibmfl.aggregator.fusion.rl_avg_fusion_handler
-hyperparams:
- global:
- num_parties: ${n_parties}
- rounds: ${n_rounds}
-protocol_handler:
- name: ProtoHandler
- path: ibmfl.aggregator.protohandler.proto_handler
diff --git a/runner/examples/pendulum/config_party_tmpl.yml b/runner/examples/pendulum/config_party_tmpl.yml
deleted file mode 100644
index dfa7fd4..0000000
--- a/runner/examples/pendulum/config_party_tmpl.yml
+++ /dev/null
@@ -1,48 +0,0 @@
-aggregator:
- ip: ${agg_ip}
- port: ${agg_port}
-connection:
- info:
- ip: ${party_ip}
- port: ${party_port}
- name: FlaskConnection
- path: ibmfl.connection.flask_connection
- sync: false
-data:
- info:
- env_spec:
- env_definition: ibmfl.util.data_handlers.pendulum_env
- env_name: PendulumEnv
- name: PendulumEnvDataHandler
- path: ibmfl.util.data_handlers.pendulum_env_data_handler
-local_training:
- name: RLLocalTrainingHandler
- path: ibmfl.party.training.rl_local_training_handler
-model:
- name: RLlibFLModel
- path: ibmfl.model.rllib_fl_model
- spec:
- policy_definition: DDPG
- policy_name: pendulum-ddpg
- params:
- evaluation:
- run_config:
- steps: 10000
- training:
- model_config:
- eager: false
- num_gpus: 0
- num_workers: 4
- run_config:
- checkpoint_frequency: 1
- iterations: 1
-protocol_handler:
- name: PartyProtocolHandler
- path: ibmfl.party.party_protocol_handler
-metrics_recorder:
- name: MetricsRecorder
- path: ibmfl.party.metrics.metrics_recorder
- output_file: ${staging_dir}/${ts}/metrics_party${id}
- output_type: json
- compute_pre_train_eval: False
- compute_post_train_eval: True
diff --git a/runner/examples/pendulum/config_runner.yml b/runner/examples/pendulum/config_runner.yml
deleted file mode 100644
index 94b852e..0000000
--- a/runner/examples/pendulum/config_runner.yml
+++ /dev/null
@@ -1,38 +0,0 @@
-machines:
- default:
- ssh_username: slaguna
- port_number: 8086
- staging_dir: /data/slaguna/sandbox/IBMFL/test_pendulum
- ibmfl_dir: /data/slaguna/repos/IBMFL
- venv_dir: .venv
- venv_uses_conda: False
- syl2:
- ip_address: 9.192.210.210
- port_number: 5000
- syl15:
- ip_address: 9.192.210.24
- syl23:
- ip_address: 9.192.244.87
- syl24:
- ip_address: 9.192.244.33
- syl25:
- ip_address: 9.192.244.60
- syl7:
- ip_address: 9.192.211.139
-
-experiments:
- - local_staging_dir: /home/sean/sandbox/IBMFL/test_pendulum
- local_ibmfl_dir: /home/sean/repos/IBMFL_new/IBMFL
- agg_machine: syl23
- n_parties: 2
- party_machines: [ syl24, syl25 ]
- shuffle_party_machines: False
- n_trials: 2
- n_rounds: 5
- postproc_fn: gen_reward_vs_time_plots
- postproc_x_key: 'post_train:ts'
- postproc_y_keys: [
- 'post_train:train:episode_reward_mean',
- 'post_train:train:episode_reward_max',
- 'post_train:eval:episode_reward_mean'
- ]
diff --git a/runner/exp_manager/.ipynb_checkpoints/Experiment_Manager_dashboard-checkpoint.ipynb b/runner/exp_manager/.ipynb_checkpoints/Experiment_Manager_dashboard-checkpoint.ipynb
deleted file mode 100644
index 0cdd03b..0000000
--- a/runner/exp_manager/.ipynb_checkpoints/Experiment_Manager_dashboard-checkpoint.ipynb
+++ /dev/null
@@ -1,1067 +0,0 @@
-{
- "cells": [
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "## Dashboard\n",
- "\n",
- "In this Notebook, we interact with the Experiment Manager to configure, setup and run experiments."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 1,
- "metadata": {
- "code_folding": [
- 0
- ]
- },
- "outputs": [
- {
- "data": {
- "application/vnd.jupyter.widget-view+json": {
- "model_id": "34fb8ef55be644dfaa0868775be83f31",
- "version_major": 2,
- "version_minor": 0
- },
- "text/plain": [
- "Output()"
- ]
- },
- "metadata": {},
- "output_type": "display_data"
- }
- ],
- "source": [
- "# Source Code\n",
- "from __future__ import print_function\n",
- "from ipywidgets import Layout, Box, VBox, HTML, HBox, GridBox\n",
- "import ipywidgets as widgets\n",
- "\n",
- "\n",
- "import io\n",
- "import os\n",
- "import subprocess\n",
- "import yaml\n",
- "import sys\n",
- "\n",
- "sys.path.append('../..')\n",
- "import runner.run as ibmfl_runner\n",
- "import runner.postprocess as ibmfl_postproc\n",
- "import pprint as pp\n",
- "from string import Template\n",
- "\n",
- "import json\n",
- "from json import JSONDecodeError\n",
- "import pandas as pd\n",
- "\n",
- "from IPython.display import clear_output\n",
- "from ipywidgets import GridspecLayout\n",
- "\n",
- "%config Completer.use_jedi = False # to avoid autocompletion errors\n",
- "\n",
- "## Store all supported datasets, models and algorithms in a pandas dataframe \n",
- "df = pd.read_csv(filepath_or_buffer='supported_models.csv', header=0, names=['fusion_identifier', 'fusion_algo', 'dataset', 'model_spec_name', 'fl_model', 'model_ui'], skipinitialspace=True)\n",
- "# df.head()\n",
- "\n",
- "df_hyperparams = pd.read_json(path_or_buf='hyperparams_to_models_map.json')\n",
- "# df_hyperparams.head()\n",
- "\n",
- "ui_model_choices = df.model_ui.unique()\n",
- "uimodel_modelid_dict = {\n",
- " 'Keras': 'keras',\n",
- " 'PyTorch': 'pytorch',\n",
- " 'TensorFlow': 'tf',\n",
- " 'Scikit-learn': 'sklearn',\n",
- " 'None': 'none'\n",
- "}\n",
- "\n",
- "## dictionary object to store config parameters for run; TODO: replace with class later\n",
- "nb_config = {}\n",
- "### set defaults\n",
- "nb_config['split'] = {}\n",
- "nb_config['split']['ppp'] = 100\n",
- "nb_config['split']['method'] = 'Uniform Random Sampling'\n",
- "nb_config['parties'] = 5\n",
- "nb_config['quorum'] = 1\n",
- "\n",
- "exp_runner = ibmfl_runner.Runner()\n",
- "\n",
- "model_header = HTML(\n",
- " value='<{size}>Model details'.format(size='h4'),\n",
- " layout=Layout(width='auto', grid_area='model_header'))\n",
- "\n",
- "# Model Selection:\n",
- "model_dropdown = widgets.Dropdown(\n",
- " options=['Choose your model'] + list(ui_model_choices),\n",
- " description='Model:',\n",
- " disabled=False,\n",
- " layout=Layout(width='40%', grid_area='model_dr')\n",
- ")\n",
- "\n",
- "\n",
- "def model_dropdown_eventhandler(change):\n",
- " model_chosen = change.new\n",
- " global nb_config\n",
- " nb_config['model'] = model_chosen\n",
- "\n",
- "\n",
- "model_dropdown.observe(model_dropdown_eventhandler, names='value')\n",
- "\n",
- "\n",
- "dataset_header = HTML(value='<{size}>Dataset details'.format(size='h4'),\n",
- " layout=Layout(width='auto', grid_area='dataset_header'))\n",
- "\n",
- "\n",
- "dataset_dropdown = widgets.Dropdown(\n",
- " options=['Choose your dataset'],# + determine_allowed_datasets(),\n",
- " description='Dataset:',\n",
- " disabled=False,\n",
- " layout=Layout(width='80%',grid_area='dataset')\n",
- ")\n",
- "\n",
- "\n",
- "def update_supported_datasets(change):\n",
- " model_chosen = change.new\n",
- " rows_for_model = df[df.model_ui==model_chosen]\n",
- " dataset_dropdown.options = list(rows_for_model['dataset'].unique())\n",
- "\n",
- "\n",
- "model_dropdown.observe(update_supported_datasets, 'value')\n",
- "\n",
- "\n",
- "def dataset_dropdown_eventhandler(change):\n",
- " dataset_chosen = change.new\n",
- " global nb_config\n",
- " nb_config['dataset'] = dataset_chosen\n",
- "\n",
- "dataset_dropdown.observe(dataset_dropdown_eventhandler, names='value')\n",
- "\n",
- "\n",
- "# Data Splitting Strategy:\n",
- "splitting_dropdown = widgets.Box([\n",
- " widgets.Label(\n",
- " value='Data Split:',\n",
- " layout=Layout(width='auto')\n",
- " ),\n",
- " widgets.Dropdown(\n",
- " options=['Uniform Random Sampling', 'Stratified Sampling (per source class)'],\n",
- " disabled=False,\n",
- " layout=Layout(width = 'auto'),\n",
- " value='Uniform Random Sampling'\n",
- " )\n",
- "], grid_area='dataset_spl')\n",
- "\n",
- "\n",
- "def splitting_dropdown_eventhandler(change):\n",
- " split_chosen = change.new\n",
- " global nb_config\n",
- " stderr.clear_output()\n",
- " if split_chosen == '':\n",
- " with stderr:\n",
- " display('Please choose a how you\\'d like to split the data from the dropdown')\n",
- " nb_config.pop('split', None)\n",
- " else:\n",
- " nb_config['split']['method'] = split_chosen\n",
- "\n",
- "splitting_dropdown.children[1].observe(splitting_dropdown_eventhandler, names='value')\n",
- "\n",
- "\n",
- "# Points per party when splitting data:\n",
- "points_slider = widgets.Box([\n",
- " widgets.Label(\n",
- " value='Points from each party:',\n",
- " layout=Layout(width = 'auto')\n",
- " ),\n",
- " widgets.IntSlider(\n",
- " min=100,\n",
- " max=1000,\n",
- " layout=Layout(width='50%'),\n",
- " value=100\n",
- " )\n",
- "], grid_area='ppp')\n",
- "\n",
- "\n",
- "def points_slider_eventhandler(change):\n",
- " # print(change)\n",
- " ppp = change.new\n",
- " global nb_config\n",
- " stderr.clear_output()\n",
- " nb_config['split']['ppp'] = ppp\n",
- " \n",
- "\n",
- "points_slider.children[1].observe(points_slider_eventhandler, names='value')\n",
- "\n",
- "\n",
- "fusion_dropdown = widgets.Box([\n",
- " HTML(value='<{size}>Fusion Algorithm'.format(size='h4'),\n",
- " layout=Layout(width='auto')),\n",
- " widgets.Dropdown(\n",
- " options=['Choose your Fusion Algorithm'], disabled=False,\n",
- " layout=Layout(width='auto'))\n",
- "], grid_area='fusion_dr')\n",
- "\n",
- "\n",
- "def update_potential_fusion_algorithm(change):\n",
- " model_chosen = nb_config['model']\n",
- " dataset_chosen = nb_config['dataset']\n",
- " potential_algo = list(df[(df.model_ui == model_chosen) & (df.dataset == dataset_chosen)]['fusion_algo'].unique())\n",
- " fusion_dropdown.children[1].options = potential_algo\n",
- "\n",
- "\n",
- "model_dropdown.observe(update_potential_fusion_algorithm, 'value')\n",
- "dataset_dropdown.observe(update_potential_fusion_algorithm, 'value')\n",
- "\n",
- "\n",
- "def fusion_dropdown_eventhandler(change):\n",
- " fusion_algo_chosen = change.new\n",
- " global nb_config\n",
- " nb_config['fusion'] = fusion_algo_chosen\n",
- "\n",
- "\n",
- "fusion_dropdown.children[1].observe(fusion_dropdown_eventhandler, names='value')\n",
- "\n",
- "\n",
- "header_parties = HTML(value='<{size}>Participants'.format(size='h4'), layout=Layout(width='auto', grid_area='header_parties'))\n",
- "\n",
- "\n",
- "num_parties = widgets.Box([\n",
- " widgets.Label(\n",
- " value='Number of parties:',\n",
- " layout=Layout(width='auto')\n",
- " ),\n",
- " widgets.IntSlider(\n",
- " min=2,\n",
- " max=100,\n",
- " value=5,\n",
- " layout=Layout(width='50%')\n",
- " )\n",
- "], grid_area='parties')\n",
- "\n",
- "\n",
- "def num_parties_eventhandler(change):\n",
- " # print(change)\n",
- " parties = change.new\n",
- " global nb_config\n",
- " nb_config['parties'] = parties\n",
- "\n",
- "\n",
- "num_parties.children[1].observe(num_parties_eventhandler, names='value')\n",
- "\n",
- "\n",
- "parties_in_quorum = widgets.Box([\n",
- " widgets.Label(\n",
- " value='Number of parties in quorum',\n",
- " layout=Layout(width = 'auto')\n",
- " ),\n",
- " widgets.IntSlider(\n",
- " min=2,\n",
- " max=100,\n",
- " value=5,\n",
- " layout=Layout(width='50%')\n",
- " )\n",
- "], grid_area='parties')\n",
- "\n",
- "\n",
- "# quorum can have atmost all parties\n",
- "def update_quorum_range(*args):\n",
- " parties_in_quorum.children[1].max = num_parties.children[1].value\n",
- " parties_in_quorum.children[1].value = num_parties.children[1].value\n",
- "\n",
- "\n",
- "num_parties.children[1].observe(update_quorum_range, 'value')\n",
- "\n",
- "\n",
- "def parties_in_quorum_eventhandler(change):\n",
- " # print(change)\n",
- " quorum = change.new\n",
- " global nb_config\n",
- " nb_config['quorum'] = round(quorum/float(nb_config['parties']),2)\n",
- " \n",
- "\n",
- "parties_in_quorum.children[1].observe(parties_in_quorum_eventhandler, names='value')\n",
- "\n",
- "\n",
- "header_postproc = HTML(value='<{size}>Postprocessing Details'.format(size='h4'), \n",
- " layout=Layout(width='auto', grid_area='header_postproc'))\n",
- "\n",
- "\n",
- "hyperparams_dict = {}\n",
- "\n",
- "\n",
- "params_widgets = []\n",
- "\n",
- "\n",
- "gen_hyperparams = widgets.Box([\n",
- " HTML(value='<{size}>Hyperparameters'.format(size='h4'), layout=Layout(width='auto')),\n",
- " widgets.Button(\n",
- " description='Get Hyperparameters',\n",
- " disabled=False,\n",
- " button_style='warning',\n",
- " tooltip='Show available hyperparameters for the choices made',\n",
- " layout=Layout(width='auto', height='40px')\n",
- " )\n",
- "], grid_area='gen_hyper')\n",
- "\n",
- "\n",
- "confirmation_box = widgets.Box()\n",
- "\n",
- "\n",
- "hyperparams_text = widgets.Box()\n",
- "\n",
- "def populate_hyperparams(b):\n",
- " confirm_butn=widgets.Button(\n",
- " description='Confirm Hyperparameters',\n",
- " disabled=False,\n",
- " button_style='warning',\n",
- " tooltip='Saves the hyperparameter changes',\n",
- " layout=Layout(width='auto', height='40px'))\n",
- " determine_hyperparams()\n",
- " global params_widgets\n",
- " params_widgets.clear()\n",
- " generate_hyperparam_UI(hyperparams_dict)\n",
- " global hyperparams_text\n",
- " hyperparams_text.children = params_widgets\n",
- " confirmation_box.children = (confirm_butn,)\n",
- " [confirmation_box.children[i].on_click(confirmation_button_handler) for i in range(len(confirmation_box.children))]\n",
- "\n",
- "\n",
- "gen_hyperparams.children[1].on_click(populate_hyperparams)\n",
- "\n",
- "\n",
- "def determine_hyperparams():\n",
- " exp_df = df[(df.model_ui == nb_config['model']) & (df.dataset == nb_config['dataset']) & (df.fusion_algo == nb_config['fusion'])]\n",
- " if len(exp_df) != 1:\n",
- " # pick the first matching fusion algorithm\n",
- " # print('Found multiple matches, picking the first one')\n",
- " firstMatch = exp_df.iloc[0]\n",
- " # print(firstMatch)\n",
- " nb_config['fusion_identifier'] = firstMatch[0]\n",
- " else:\n",
- " # print(exp_df)\n",
- " nb_config['fusion_identifier'] = list(exp_df.fusion_identifier)[0]\n",
- " # print('fusion_id:', nb_config['fusion_identifier'])\n",
- " global hyperparams_dict\n",
- " model_hyperparams_key = nb_config['fusion_identifier'] + '_' + uimodel_modelid_dict[nb_config['model']] # to get hyperparams from df\n",
- " hyperparams_dict = df_hyperparams[df_hyperparams['model_identifier'] == model_hyperparams_key].hyperparams.values[0]\n",
- "\n",
- "\n",
- "# every model has at most two keys: global and local:\n",
- "def generate_hyperparam_UI(parameter_dict):\n",
- " # print(parameter_dict)\n",
- " for key in parameter_dict:\n",
- " if type(parameter_dict[key]) == 'dict':\n",
- " generate_hyperparam_UI(parameter_dict[key])\n",
- " else:\n",
- " global params_widgets\n",
- " params_widgets.append(widgets.Textarea(description=key, value=str(parameter_dict[key]), layout=Layout(width='400px', height='100px'), grid_area='hyperparams'))\n",
- "\n",
- "\n",
- "# Add fields for IP addresses\n",
- "local_or_remote = widgets.Box([\n",
- " HTML(value = '<{size}>Run this experiment locally or on remote machines?'.format(size='h4'),\n",
- " layout = Layout(width='auto')),\n",
- " widgets.Dropdown(\n",
- " options=['Choose your option','Run Locally', 'Run on Remote Machines'],\n",
- " description='',\n",
- " disabled=False,\n",
- " layout=Layout(width='200px')\n",
- " )\n",
- "])\n",
- "\n",
- "\n",
- "# dictionary for details of the run, which will get populated as fields get filled\n",
- "run_details = {}\n",
- "\n",
- "\n",
- "def network_details_tracker(change): \n",
- " value = change.new\n",
- " subkey = change.owner.description.split(':')[0].replace(' ', '_').lower()\n",
- " machine_key = change.owner.placeholder.split(' ')[-1]\n",
- " # update the run_details dict, depending on whether it already has some details:\n",
- " if len(run_details['machines'][machine_key].keys()) == 0:\n",
- " temp_dict = {}\n",
- " temp_dict[subkey] = value\n",
- " run_details['machines'][machine_key] = temp_dict\n",
- " else:\n",
- " temp_dict = run_details['machines'][machine_key]\n",
- " temp_dict[subkey] = value\n",
- " run_details['machines'][machine_key] = temp_dict\n",
- "\n",
- "\n",
- "def get_IPaddr_port(party_index=None):\n",
- " placeholder_suffix = ' for machine' + str(party_index)\n",
- "\n",
- " ip_addr = widgets.Text(value='', placeholder='IP Address' + placeholder_suffix, description='IP Address:')\n",
- " port_num = widgets.Text(value='', placeholder='Port Number' + placeholder_suffix, description='Port Number:')\n",
- " ssh_user = widgets.Text(value='', placeholder='ssh username' + placeholder_suffix, description='SSH user:')\n",
- " \n",
- " machine_detail_vbox = widgets.VBox(children=[ip_addr, port_num, ssh_user])\n",
- " [machine_detail_vbox.children[i].observe(network_details_tracker, 'value') for i in range(len(machine_detail_vbox.children))]\n",
- " return machine_detail_vbox\n",
- " \n",
- "\n",
- "def path_details_tracker(change):\n",
- " value = change.new\n",
- " subkey = change.owner.description.split(':')[0].replace(' ', '_').lower()\n",
- " if 'local' in change.owner.placeholder:\n",
- " # this is a local path, put within `experiments` key\n",
- " local_subkey = 'local_' + subkey\n",
- " run_details['experiments'][0][local_subkey] = value # there's only one trial for now\n",
- " else:\n",
- " # this is a machine path\n",
- " # update the run_details dict, depending on whether it already has some details:\n",
- " machine_key = change.owner.placeholder.split(' ')[-1] # to figure which machine is this for\n",
- " if len(run_details['machines'][machine_key].keys())==0:\n",
- " temp_dict = {}\n",
- " temp_dict[subkey] = value\n",
- " run_details['machines'][machine_key] = temp_dict\n",
- " else:\n",
- " temp_dict = run_details['machines'][machine_key]\n",
- " temp_dict[subkey] = value\n",
- " run_details['machines'][machine_key] = temp_dict\n",
- "\n",
- "\n",
- "def get_paths(party_index=None):\n",
- " if party_index is None:\n",
- " placeholder_suffix = ' for local machine'\n",
- " else:\n",
- " placeholder_suffix = ' for machine' + str(party_index)\n",
- "\n",
- " config_path = widgets.Text(value='', placeholder='Staging Dir' + placeholder_suffix, description='Staging Dir:')\n",
- " code_path = widgets.Text(value='', placeholder='IBMFL Dir' + placeholder_suffix, description='IBMFL Dir:')\n",
- " \n",
- " machine_detail_vbox = widgets.VBox(children=[config_path, code_path])\n",
- " [machine_detail_vbox.children[i].observe(path_details_tracker, 'value') for i in range(len(machine_detail_vbox.children))]\n",
- " return machine_detail_vbox\n",
- "\n",
- "\n",
- "networking_deets_box = widgets.VBox()\n",
- "\n",
- "\n",
- "def venv_box_isConda_handler(change):\n",
- " if change.new == 'Yes':\n",
- " run_details['machines']['venv_uses_conda'] = True\n",
- " else:\n",
- " run_details['machines']['venv_uses_conda'] = False\n",
- "\n",
- "\n",
- "def venv_box_venvPath_handler(change):\n",
- " run_details['machines']['venv_dir'] = change.new\n",
- "\n",
- "\n",
- "def display_conda_venv_fields():\n",
- " venv_box = widgets.HBox([\n",
- " widgets.RadioButtons(\n",
- " options=['Yes', 'No'],\n",
- " description='Use conda?'\n",
- " ),\n",
- " widgets.Text(\n",
- " value='',\n",
- " placeholder='.venv or conda env name',\n",
- " description='virtual env:',\n",
- " layout=Layout(width='300px', height='auto')\n",
- " )\n",
- " ])\n",
- " venv_box.children[0].observe(venv_box_isConda_handler, 'value')\n",
- " venv_box.children[1].observe(venv_box_venvPath_handler, 'value')\n",
- " return venv_box\n",
- "\n",
- "\n",
- "def run_details_text_handler(change):\n",
- " # print(change.new)\n",
- " global run_details\n",
- " try:\n",
- " run_details = json.loads(change.new)\n",
- " except JSONDecodeError:\n",
- " if change.new == '':\n",
- " pass\n",
- " else:\n",
- " display('Incorrect JSON passed for remote details, check and retry!')\n",
- " ## Todo: use an Output widget here so the message goes away once the input JSON is changed\n",
- "\n",
- "\n",
- "def machines_dropdown_eventhandler(change):\n",
- " # print(change.new)\n",
- " agg_machine = change.new.lower()\n",
- " run_details['experiments'][0]['agg_machine'] = agg_machine # there is only one trial for now\n",
- " party_machines = []\n",
- " for machine in run_details['machines']:\n",
- " party_machines.append(machine)\n",
- " \n",
- " # now remove the agg machine from the dict\n",
- " party_machines.remove(agg_machine)\n",
- " run_details['experiments'][0]['party_machines'] = party_machines # there is only one trial for now\n",
- "\n",
- "\n",
- "def display_run_details(change):\n",
- " change.owner.disabled = True\n",
- " run_details['machines'] = {}\n",
- " run_details['machines']['venv_uses_conda'] = True\n",
- " run_details['machines']['venv_dir'] = '.venv'\n",
- " run_details['experiments'] = []\n",
- " \n",
- " temp_exp_dict = {}\n",
- " temp_exp_dict['local_staging_dir'] = ''\n",
- " temp_exp_dict['local_ibmfl_dir'] = ''\n",
- " conda_fields = display_conda_venv_fields()\n",
- " \n",
- " if 'Remote' in change.new:\n",
- " ## remote execution\n",
- " ## initialise the run_details dictionary\n",
- " run_details['isLocalRun'] = False\n",
- "\n",
- " temp_exp_dict['agg_machine'] = ''\n",
- " temp_exp_dict['party_machines'] = []\n",
- "\n",
- " for eachMachine in range(nb_config['parties'] + 1):\n",
- " run_details['machines']['machine' + str(eachMachine + 1)] = {}\n",
- " run_details['machines']['machine' + str(eachMachine + 1)]['ip_address'] = ''\n",
- " run_details['machines']['machine' + str(eachMachine + 1)]['port_number'] = ''\n",
- " run_details['machines']['machine' + str(eachMachine + 1)]['ssh_username'] = ''\n",
- " run_details['machines']['machine' + str(eachMachine + 1)]['staging_dir'] = ''\n",
- " run_details['machines']['machine' + str(eachMachine + 1)]['ibmfl_dir'] = ''\n",
- "\n",
- " networking_header_1 = HTML(value='<{size}>Details for remote execution: Fill details into the textbox on the left or in individual fields on the right'.format(size='h4'), layout=Layout(width='auto'))\n",
- "\n",
- " run_details_box = widgets.VBox([\n",
- " widgets.Label(value='Machine details:', layout=Layout(width='auto')),\n",
- " widgets.Textarea(value=json.dumps(run_details, indent=4), layout=Layout(width='300px', height='700px'))\n",
- " ])\n",
- " run_details_box.children[1].observe(run_details_text_handler, 'value')\n",
- "\n",
- " networking_header_2=HTML(value = '<{size}>OR'.format(size='h3'), layout=Layout(width='auto', margin='5px 15px 5px 15px'))\n",
- "\n",
- " all_machines_tuple = ()\n",
- " for eachMachine in range(nb_config['parties'] + 1):\n",
- " machine_header = HTML(value='<{size}>Machine{id}'.format(size='h4', id=str(eachMachine+1)))\n",
- " temp_machine_box = widgets.VBox()\n",
- " machine_IP = get_IPaddr_port(eachMachine+1)\n",
- " machine_paths = get_paths(eachMachine+1)\n",
- " temp_machine_box.children = (machine_header, widgets.HBox(children=[machine_IP, machine_paths]))\n",
- " all_machines_tuple = all_machines_tuple + (temp_machine_box,)\n",
- " \n",
- " machines_dropdown = widgets.Box([\n",
- " widgets.Label(\n",
- " value='Pick machine for running Aggregator:',\n",
- " layout=Layout(width='auto')\n",
- " ),\n",
- " widgets.Dropdown(\n",
- " options=[''] + ['Machine{id}'.format(id=i+1) for i in range(nb_config['parties']+1)],\n",
- " layout=Layout(width='auto')\n",
- " )])\n",
- " \n",
- " machines_dropdown.children[1].observe(machines_dropdown_eventhandler, 'value')\n",
- "\n",
- " temp_local_vbox = widgets.VBox()\n",
- " local_header = HTML(value='<{size}>Local Directories'.format(size='h4'))\n",
- " local_path_fields = get_paths()\n",
- " temp_local_vbox.children = (local_header, local_path_fields)\n",
- "\n",
- " networking_fields_vbox = widgets.VBox(layout=Layout(width='auto', border='0.5px solid black'))\n",
- " networking_fields_vbox.children=(conda_fields,) + all_machines_tuple + (machines_dropdown, temp_local_vbox,)\n",
- " networking_deets_hbox = widgets.HBox(children=[run_details_box, networking_header_2, networking_fields_vbox])\n",
- " save_generate_butn.layout = Layout(width='185px', height='40px', margin='5px 50px 5px 400px')\n",
- " networking_deets_box.children=(networking_header_1, networking_deets_hbox, save_generate_butn,)\n",
- " run_details['experiments'].append(temp_exp_dict)\n",
- " \n",
- " else:\n",
- " ## local execution\n",
- " run_details['isLocalRun'] = True\n",
- " temp_exp_dict['agg_machine'] = 'local0'\n",
- " temp_exp_dict['party_machines'] = ['local{id}'.format(id=i+1) for i in range(nb_config['parties'])]\n",
- " \n",
- " ## setup dicts to populate IP addr and port number from generated configs later\n",
- " run_details['machines']['local0']={}\n",
- " for party in temp_exp_dict['party_machines']:\n",
- " run_details['machines'][party] = {}\n",
- " \n",
- " networking_header = HTML(value = '<{size}>Details for local execution'.format(size='h4'), layout=Layout(width='auto'))\n",
- "\n",
- " local_paths = get_paths()\n",
- " save_generate_butn.layout = Layout(width='185px', height='40px', margin='5px 50px 5px 50px')\n",
- " networking_deets_box.children=(networking_header, conda_fields, local_paths, save_generate_butn)\n",
- " \n",
- " run_details['experiments'].append(temp_exp_dict)\n",
- "\n",
- " \n",
- " with input_ui:\n",
- " display(partyDetails_grid)\n",
- " \n",
- "\n",
- "local_or_remote.children[1].observe(display_run_details, 'value')\n",
- "\n",
- "\n",
- "def display_configs_before_run(b):\n",
- " input_ui.clear_output()\n",
- " agg_conf_path, party_conf_path = generate_update_configs()\n",
- " if agg_conf_path is None or party_conf_path is None:\n",
- " with input_ui:\n",
- " display('Error generating configs. Exiting...')\n",
- " else:\n",
- " display_configs(agg_conf_path, party_conf_path)\n",
- " with input_ui:\n",
- " display(display_grid_1)\n",
- "\n",
- "\n",
- "save_generate_butn = widgets.Button(\n",
- " description='Proceed to generate configs',\n",
- " disabled=False,\n",
- " button_style='warning', # 'success', 'info', 'warning', 'danger' or ''\n",
- " tooltip='Generates config files from the above details',\n",
- " layout=Layout(width='185px', height='40px', margin='10px')\n",
- " )\n",
- "\n",
- "\n",
- "save_generate_butn.on_click(display_configs_before_run)\n",
- "\n",
- "\n",
- "def confirmation_button_handler(b):\n",
- " b.disabled = True\n",
- " b.description = 'Confirming hyperparams...'\n",
- " global params_widgets, nb_config\n",
- " for widget in params_widgets:\n",
- " nb_config[widget.description] = widget.value\n",
- " \n",
- " input_ui.clear_output()\n",
- "\n",
- " with input_ui:\n",
- " display(local_or_remote)\n",
- "\n",
- "\n",
- "def generate_update_configs():\n",
- " # Get timestamp and add it to the given local staging directory:\n",
- " nb_config['timestamp_str'] = ibmfl_runner.Runner().generate_timestamp()\n",
- " trial_dir = run_details['experiments'][0]['local_staging_dir'] + '/' + nb_config['timestamp_str']\n",
- " \n",
- " # Create the staging_directory:\n",
- " mkdir_cmd = 'mkdir -p ' + trial_dir\n",
- " process = subprocess.run(mkdir_cmd, shell=True,\n",
- " stdout=subprocess.PIPE,\n",
- " stderr=subprocess.PIPE)\n",
- " if process.returncode!=0:\n",
- " with input_ui:\n",
- " display('Erred: ', process.stderr)\n",
- " return None, None\n",
- " \n",
- " # Generate Data\n",
- " with input_ui:\n",
- " display('Generating Data...')\n",
- "\n",
- " cmd_to_run = 'cd ../../; python3 examples/generate_data.py --num_parties ' + str(nb_config['parties']) + ' -d ' + nb_config['dataset'] + ' -pp ' + str(nb_config['split']['ppp']) + ' -p ' + trial_dir # there's only one trial for now\n",
- " if 'Stratified' in nb_config['split']['method']:\n",
- " cmd_to_run = cmd_to_run + ' --stratify'\n",
- "\n",
- " # with input_ui:\n",
- " # display('Executing {}'.format(cmd_to_run))\n",
- " process = subprocess.run(cmd_to_run, shell=True,\n",
- " stdout=subprocess.PIPE, \n",
- " stderr=subprocess.PIPE)\n",
- " if process.returncode!=0:\n",
- " with input_ui:\n",
- " display('Erred: ', process.stderr)\n",
- " return None, None\n",
- "\n",
- " # path to get datasets from\n",
- " data_path = str(process.stdout).split('Data saved in')[-1].strip().replace('\\\\n\\'', '')\n",
- " with input_ui:\n",
- " display('Datasets saved to: {}'.format(data_path))\n",
- "\n",
- " # Generate Configs:\n",
- " with input_ui:\n",
- " display('Generating Configs...')\n",
- " if 'crypto' in nb_config['fusion_identifier']:\n",
- " # it is either of crypto keras or crypto_multiclass_keras, so need -crypto flags:\n",
- " # Todo: Need to let user pick one of {Paillier, ThresholdPaillier}\n",
- " cmd_to_run = 'cd ../../; python3 examples/generate_configs.py --num_parties ' + str(nb_config['parties']) + ' -f ' + nb_config['fusion_identifier'] + ' -m ' + uimodel_modelid_dict[nb_config['model']] + ' -crypto Paillier -d ' + nb_config['dataset'] + ' -p ' + data_path + ' --config_path ' + trial_dir # there's only one trial for now\n",
- " else:\n",
- " cmd_to_run = 'cd ../../; python3 examples/generate_configs.py --num_parties ' + str(nb_config['parties']) + ' -f ' + nb_config['fusion_identifier'] + ' -m ' + uimodel_modelid_dict[nb_config['model']] + ' -d ' + nb_config['dataset'] + ' -p ' + data_path + ' --config_path ' + trial_dir # there's only one trial for now\n",
- "\n",
- " # print('Executing {}'.format(cmd_to_run))\n",
- " process = subprocess.run(cmd_to_run, shell=True,\n",
- " stdout=subprocess.PIPE, \n",
- " stderr=subprocess.PIPE,\n",
- " universal_newlines=True)\n",
- " if process.returncode==0:\n",
- " # save agg and party configs path\n",
- " configs_path = os.path.dirname(process.stdout.split('\\n')[0].split(':')[1].strip())\n",
- " path_to_save_agg_configs = configs_path + '/config_agg.yml'\n",
- " print('Aggregator configs saved to: {}'.format(path_to_save_agg_configs))\n",
- " path_to_save_party_configs = configs_path + '/config_party*.yml'\n",
- " print('Party configs saved to: {}'.format(path_to_save_party_configs))\n",
- " else:\n",
- " with input_ui:\n",
- " display('Erred: ', process.stderr)\n",
- " return None, None\n",
- "\n",
- " # modify hyperparameter text to fix quotes\n",
- " hyp_text = nb_config['global']\n",
- " hyp_text = json.loads(hyp_text.replace('\\'', '\"'))\n",
- " nb_config['global'] = hyp_text\n",
- " hyp_text = nb_config['local']\n",
- " hyp_text = json.loads(hyp_text.replace('\\'', '\"'))\n",
- " if nb_config['fusion_identifier'] == 'fedplus':\n",
- " alpha = hyp_text['training'].pop('alpha')\n",
- " nb_config['local'] = hyp_text\n",
- " \n",
- " # add num_parties as a key under global, to match the structure in the agg yaml configs\n",
- " val = nb_config.pop('parties')\n",
- " nb_config['global']['num_parties'] = val\n",
- " val = nb_config.pop('quorum')\n",
- " nb_config['global']['perc_quorum'] = val\n",
- " \n",
- " # Load Aggregator Config\n",
- " with open(path_to_save_agg_configs, 'r') as stream:\n",
- " try:\n",
- " agg_config = yaml.safe_load(stream)\n",
- " except yaml.YAMLError as e:\n",
- " print(e)\n",
- " return None, None\n",
- "\n",
- " # for local runs, update the dirs to all the \"machines\" (they're all local)\n",
- " if run_details['isLocalRun']:\n",
- " run_details['machines']['ibmfl_dir'] = run_details['experiments'][0]['local_ibmfl_dir']\n",
- " run_details['machines']['staging_dir'] = run_details['experiments'][0]['local_staging_dir']\n",
- "\n",
- " # Modify aggregator config with values captured from the UI:\n",
- " # - update the hyperparameters object with newer global and local objects as updated above\n",
- " # - update ip and port from the run_details object\n",
- " # - TODO: Update model spec when uploading model file is supported\n",
- " agg_config['hyperparams']['global'] = nb_config['global']\n",
- " agg_config['hyperparams']['local'] = nb_config['local'] \n",
- " agg_machine = run_details['experiments'][0]['agg_machine'] # there's only one trial for now\n",
- "\n",
- " if not run_details['isLocalRun']:\n",
- " agg_config['connection']['info']['ip'] = run_details['machines'][agg_machine]['ip_address']\n",
- " agg_config['connection']['info']['port'] = int(run_details['machines'][agg_machine]['port_number'])\n",
- " else:\n",
- " run_details['machines'][agg_machine]['ip_address'] = agg_config['connection']['info']['ip']\n",
- " run_details['machines'][agg_machine]['port_number'] = agg_config['connection']['info']['port']\n",
- " run_details['machines'][agg_machine]['ssh_username'] = os.getenv('USER')\n",
- "\n",
- " \n",
- " # Write this updated yaml to file\n",
- " with open(path_to_save_agg_configs, 'w') as out:\n",
- " yaml.safe_dump(agg_config, out, default_flow_style=False)\n",
- " with input_ui:\n",
- " display('Updated Aggregator config at {}'.format(path_to_save_agg_configs))\n",
- "\n",
- "\n",
- " # Modify party config with values accepted from the UI\n",
- " # - update IP address, port for agg and party as received from the UI (only remote runs)\n",
- " # - add metrics section (both remote and local run)\n",
- " # - add alpha, if model chosen is Fed+\n",
- " if not run_details['isLocalRun']:\n",
- " currParty = 0\n",
- " for eachMachine in run_details['experiments'][0]['party_machines']: # there's only one trial for now\n",
- " # Load\n",
- " with open(path_to_save_party_configs.replace('*', str(currParty))) as stream:\n",
- " try:\n",
- " party_config = yaml.safe_load(stream)\n",
- " except yaml.YAMLError as e:\n",
- " print(e)\n",
- " return None, None\n",
- "\n",
- " agg_machine = run_details['experiments'][0]['agg_machine'] # there's only one trial for now\n",
- " # Modify\n",
- " party_config['aggregator']['ip'] = run_details['machines'][agg_machine]['ip_address']\n",
- " party_config['aggregator']['port'] = run_details['machines'][agg_machine]['port_number']\n",
- " \n",
- " party_config['connection']['info']['ip'] = run_details['machines'][eachMachine]['ip_address']\n",
- " party_config['connection']['info']['port'] = int(run_details['machines'][eachMachine]['port_number'])\n",
- " party_config['connection']['info']['port'] = int(run_details['machines'][eachMachine]['port_number'])\n",
- "\n",
- " # Metrics section to add to each party config\n",
- " party_config['metrics_recorder']={}\n",
- " party_config['metrics_recorder']['name'] = 'MetricsRecorder'\n",
- " party_config['metrics_recorder']['path'] = 'ibmfl.party.metrics.metrics_recorder'\n",
- " party_config['metrics_recorder']['output_file'] = '${config_dir}/metrics_party${id}'.replace('${config_dir}', run_details['machines'][eachMachine]['staging_dir']).replace('${id}', str(currParty))\n",
- " party_config['metrics_recorder']['output_type'] = 'json'\n",
- " party_config['metrics_recorder']['compute_pre_train_eval'] = False\n",
- " party_config['metrics_recorder']['compute_post_train_eval'] = True\n",
- "\n",
- " if nb_config['fusion_identifier'] == 'fedplus':\n",
- " party_config['local_training']['info']['alpha'] = alpha\n",
- "\n",
- " # Finally, write updated agg config to file\n",
- " with open(path_to_save_party_configs.replace('*', str(currParty)), 'w') as out:\n",
- " yaml.safe_dump(party_config, out, default_flow_style=False)\n",
- " currParty += 1\n",
- " else:\n",
- " currParty = 0\n",
- " for eachMachine in run_details['experiments'][0]['party_machines']: # there's only one trial for now\n",
- " # Load\n",
- " with open(path_to_save_party_configs.replace('*', str(currParty))) as stream:\n",
- " try:\n",
- " party_config = yaml.safe_load(stream)\n",
- " except yaml.YAMLError as e:\n",
- " print(e)\n",
- " return None, None\n",
- "\n",
- "\n",
- " # save IP addr and port number from the party config, into `run_details` dict, for runner's use\n",
- " run_details['machines'][eachMachine]['ip_address'] = party_config['connection']['info']['ip']\n",
- " run_details['machines'][eachMachine]['port_number'] = party_config['connection']['info']['port']\n",
- " run_details['machines'][eachMachine]['ssh_username'] = os.getenv('USER')\n",
- "\n",
- " # Metrics section to add to each party config\n",
- " party_config['metrics_recorder']={}\n",
- " party_config['metrics_recorder']['name'] = 'MetricsRecorder'\n",
- " party_config['metrics_recorder']['path'] = 'ibmfl.party.metrics.metrics_recorder'\n",
- " party_config['metrics_recorder']['output_file'] = '${config_dir}/metrics_party${id}'.replace('${config_dir}', trial_dir).replace('${id}', str(currParty))\n",
- " party_config['metrics_recorder']['output_type'] = 'json'\n",
- " party_config['metrics_recorder']['compute_pre_train_eval'] = False\n",
- " party_config['metrics_recorder']['compute_post_train_eval'] = True\n",
- "\n",
- " if nb_config['fusion_identifier'] == 'fedplus':\n",
- " party_config['local_training']['info']['alpha'] = alpha\n",
- " \n",
- " # Finally, write updated party config to file\n",
- " with open(path_to_save_party_configs.replace('*', str(currParty)), 'w') as out:\n",
- " yaml.safe_dump(party_config, out, default_flow_style=False)\n",
- "\n",
- " currParty += 1\n",
- "\n",
- " with input_ui:\n",
- " display('Updated Party configs at {}'.format(path_to_save_party_configs))\n",
- "\n",
- " nb_config['local_conf_dir'] = str(os.path.dirname(path_to_save_agg_configs))\n",
- "\n",
- " return path_to_save_agg_configs, path_to_save_party_configs\n",
- "\n",
- "config_box = widgets.VBox(layout=Layout(width='auto'))\n",
- "\n",
- "\n",
- "def display_configs(agg_conf_path, party_conf_path):\n",
- " # Display aggregator and party* configs\n",
- " display_header = HTML(value='<{size}>Configs Generated:'.format(size='h4'), layout=Layout(width='auto'))\n",
- "\n",
- " agg_conf_header = HTML(value='<{size}>Aggregator Config'.format(size='h4'), layout=Layout(width='auto'))\n",
- " agg_conf = widgets.Output(layout={'border': '0.5px solid black'})\n",
- "\n",
- " # read agg config from filesystem:\n",
- " with open(agg_conf_path) as stream:\n",
- " try:\n",
- " agg_config = yaml.safe_load(stream)\n",
- " except yaml.YAMLError as e:\n",
- " print(e)\n",
- "\n",
- " with agg_conf:\n",
- " display(agg_config)\n",
- "\n",
- " party_conf_header = HTML(value='<{size}>Party0 Config'.format(size='h4'), layout=Layout(width='auto'))\n",
- " party_conf = widgets.Output(layout={'border': '0.5px solid black'}) \n",
- "\n",
- " # read party0 from filesystem:\n",
- " with open(party_conf_path.replace('*', '0')) as stream:\n",
- " try:\n",
- " party_config = yaml.safe_load(stream)\n",
- " except yaml.YAMLError as e:\n",
- " print(e)\n",
- "\n",
- " ## display\n",
- " with party_conf:\n",
- " display(party_config)\n",
- "\n",
- " agg_box = widgets.HBox(children=[agg_conf_header, agg_conf], layout=Layout(width='auto', padding='20px'))\n",
- " party_box = widgets.HBox(children=[party_conf_header, party_conf], layout=Layout(width='auto', padding='10px'))\n",
- " party_disclmr_1 = HTML(value='Other parties follow config similar to Party0, except connection.info.[ip,port] and paths', \n",
- " layout=Layout(width='auto'))\n",
- " party_disclmr_2 = HTML(value='Also, each party gets a separate dataset file, split from the chosen dataset', \n",
- " layout=Layout(width='auto'))\n",
- " config_box.children=[display_header, agg_box, party_box, party_disclmr_1, party_disclmr_2, run_butn]\n",
- "\n",
- "\n",
- "run_butn = widgets.Button(\n",
- " description='Run Experiment',\n",
- " disabled=False,\n",
- " button_style='warning',\n",
- " tooltip='Runs the experiment with above config',\n",
- " layout=Layout(width='125px', height='40px', margin='5px 50px 5px 400px') # margin to position button centrally\n",
- " )\n",
- "\n",
- "\n",
- "monitoring_box = widgets.VBox()\n",
- "\n",
- "plot_button = widgets.Button(\n",
- " description='Show Charts',\n",
- " disabled=False,\n",
- " button_style='warning', # 'success', 'info', 'warning', 'danger' or ''\n",
- " tooltip='Displays the various plots for the experiment that ran',\n",
- " layout = Layout(width='120px', height='40px', margin='5px 50px 5px 400px') ## margin to position button centrally\n",
- " )\n",
- "\n",
- "\n",
- "def invoke_runner(b):\n",
- " b.disabled = True\n",
- " input_ui.clear_output()\n",
- " monitoring_out = widgets.Output(layout={'border': '0.5px solid black'})\n",
- " monitoring_box.children = [monitoring_out]\n",
- " display(display_grid_2)\n",
- "\n",
- " # some values needed by the Runner; there's only one trial for now\n",
- " run_details['experiments'][0]['shuffle_party_machines'] = False\n",
- " run_details['experiments'][0]['n_trials'] = 1\n",
- " run_details['experiments'][0]['n_parties'] = nb_config['global']['num_parties']\n",
- " run_details['experiments'][0]['n_rounds'] = nb_config['global']['rounds']\n",
- "\n",
- " # values for postprocessing and showing default metrics\n",
- " run_details['experiments'][0]['postproc_fn'] = {}\n",
- " run_details['experiments'][0]['postproc_fn'] = 'gen_reward_vs_time_plots'\n",
- " run_details['experiments'][0]['postproc_x_key'] = 'post_train:ts'\n",
- " run_details['experiments'][0]['postproc_y_keys'] = ['post_train:eval:loss', 'post_train:eval:acc']#, 'post_train:eval:precision weighted', 'post_train:eval:recall weighted']\n",
- "\n",
- " exp_machines = exp_runner.convert_machine_dict_from_nb_to_cli(run_details['machines'])\n",
- "\n",
- " for exp_info in run_details['experiments']:\n",
- " with open('{}/config_agg.yml'.format(nb_config['local_conf_dir']), 'r') as config_agg_file:\n",
- " config_agg = config_agg_file.read()\n",
- " config_parties = []\n",
- " for pi in range(exp_info['n_parties']):\n",
- " with open('{}/config_party{}.yml'.format(nb_config['local_conf_dir'], pi), 'r') as config_party_file:\n",
- " config_parties += [config_party_file.read()]\n",
- " with monitoring_out:\n",
- " display(exp_runner.run_experiment(exp_info, run_details['machines'],\n",
- " config_agg, config_parties, ui_mode='nb', ts=nb_config['timestamp_str']) \\\n",
- " or 'Finished!')\n",
- "\n",
- " if 'Keras' in nb_config['model']:\n",
- " monitoring_box.children = monitoring_box.children + (plot_button,)\n",
- " else:\n",
- " with monitoring_out:\n",
- " display('No plots to show for the chosen model')\n",
- "\n",
- "\n",
- "plots_box = widgets.VBox()\n",
- "\n",
- "\n",
- "def get_plots(b):\n",
- " b.disabled = True\n",
- " no_plots_for_these = ['Federated Averaging', 'Gradient Averaging', 'Probabilistic Federated Neural Matching']\n",
- " plots_out = widgets.Output(layout={'border': '0.5px solid black'})\n",
- " plots_box.children = [plots_out]\n",
- " display(display_grid_3)\n",
- " if nb_config['fusion'] in no_plots_for_these:\n",
- " with plots_out:\n",
- " display('Plots for chosen fusion algorithm are not supported yet') # metrics processing not in place\n",
- " else:\n",
- " # generate the plot\n",
- " with plots_out:\n",
- " display(exp_info = exp_runner.call_postproc_fn())\n",
- "\n",
- "\n",
- "run_butn.on_click(invoke_runner)\n",
- "\n",
- "\n",
- "plot_button.on_click(get_plots)\n",
- "\n",
- "\n",
- "# GridBox layout for UI\n",
- "grid = GridspecLayout(2,3)\n",
- "\n",
- "grid[0,:] = GridBox(children=[model_header, model_dropdown, #upload_model_file, \n",
- " dataset_header, dataset_dropdown, splitting_dropdown, points_slider,\n",
- " fusion_dropdown,\n",
- " header_parties, num_parties, parties_in_quorum,\n",
- "# header_postproc, postproc_func, postproc_xkey, postproc_ykeys,\n",
- " gen_hyperparams\n",
- " ],\n",
- " layout = Layout(\n",
- " width='100%',\n",
- " grid_template_rows='auto auto',\n",
- " grid_template_columns='48% 48%',\n",
- " grid_template_areas='''\n",
- " \"model_header model_header\"\n",
- " \"model_dr model_dr\"\n",
- " \"dataset_header dataset_header\"\n",
- " \"dataset dataset_spl\"\n",
- " \"fusion_dr fusion_dr\"\n",
- " \"header_parties header_parties\"\n",
- " \"parties parties\"\n",
- " \"gen_hyper gen_hyper\"\n",
- " ''')\n",
- " )\n",
- "# Nested grid to vary spacing across various widgets\n",
- "sub_grid_hyperparams = GridspecLayout(2,3)\n",
- "sub_grid_hyperparams[0,:] = hyperparams_text\n",
- "sub_grid_hyperparams[1,1] = confirmation_box\n",
- "\n",
- "grid[1, :] = sub_grid_hyperparams\n",
- "\n",
- "input_ui = widgets.Output()\n",
- "\n",
- "with input_ui:\n",
- " display(grid)\n",
- "\n",
- "# grid for displaying networking fields -- IP addr, port, ssh user, paths\n",
- "partyDetails_grid = GridspecLayout(1,3)\n",
- "partyDetails_grid[0, :] = networking_deets_box\n",
- "\n",
- "# grid for displaying generated configuration\n",
- "display_grid_1 = GridspecLayout(1,3)\n",
- "display_grid_1[0, :] = config_box\n",
- "\n",
- "# grid for displaying progress of running experiment\n",
- "display_grid_2 = GridspecLayout(1,1)\n",
- "display_grid_2[0, :] = monitoring_box\n",
- "\n",
- "# grid for displaying charts from collected metrics\n",
- "display_grid_3 = GridspecLayout(1,1)\n",
- "display_grid_3[0, :] = plots_box\n",
- "\n",
- "input_ui"
- ]
- }
- ],
- "metadata": {
- "kernelspec": {
- "display_name": "Python 3",
- "language": "python",
- "name": "python3"
- },
- "language_info": {
- "codemirror_mode": {
- "name": "ipython",
- "version": 3
- },
- "file_extension": ".py",
- "mimetype": "text/x-python",
- "name": "python",
- "nbconvert_exporter": "python",
- "pygments_lexer": "ipython3",
- "version": "3.6.8"
- },
- "varInspector": {
- "cols": {
- "lenName": 16,
- "lenType": 16,
- "lenVar": 40
- },
- "kernels_config": {
- "python": {
- "delete_cmd_postfix": "",
- "delete_cmd_prefix": "del ",
- "library": "var_list.py",
- "varRefreshCmd": "print(var_dic_list())"
- },
- "r": {
- "delete_cmd_postfix": ") ",
- "delete_cmd_prefix": "rm(",
- "library": "var_list.r",
- "varRefreshCmd": "cat(var_dic_list()) "
- }
- },
- "types_to_exclude": [
- "module",
- "function",
- "builtin_function_or_method",
- "instance",
- "_Feature"
- ],
- "window_display": false
- }
- },
- "nbformat": 4,
- "nbformat_minor": 4
-}
diff --git a/runner/exp_manager/Experiment_Manager_dashboard.ipynb b/runner/exp_manager/Experiment_Manager_dashboard.ipynb
deleted file mode 100644
index 7bfb187..0000000
--- a/runner/exp_manager/Experiment_Manager_dashboard.ipynb
+++ /dev/null
@@ -1,1069 +0,0 @@
-{
- "cells": [
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "## Dashboard\n",
- "\n",
- "In this Notebook, we interact with the Experiment Manager to configure, setup and run experiments."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "code_folding": [
- 0,
- 47,
- 52,
- 73,
- 81,
- 90,
- 99,
- 113,
- 128,
- 142,
- 162,
- 173,
- 185,
- 199,
- 209,
- 224,
- 232,
- 432
- ]
- },
- "outputs": [],
- "source": [
- "# Source Code\n",
- "from __future__ import print_function\n",
- "from ipywidgets import Layout, Box, VBox, HTML, HBox, GridBox\n",
- "import ipywidgets as widgets\n",
- "\n",
- "\n",
- "import io\n",
- "import os\n",
- "import subprocess\n",
- "import yaml\n",
- "import sys\n",
- "\n",
- "sys.path.append('../..')\n",
- "import runner.run as ibmfl_runner\n",
- "import runner.postprocess as ibmfl_postproc\n",
- "import pprint as pp\n",
- "from string import Template\n",
- "\n",
- "import json\n",
- "from json import JSONDecodeError\n",
- "import pandas as pd\n",
- "\n",
- "from IPython.display import clear_output\n",
- "from ipywidgets import GridspecLayout\n",
- "\n",
- "%config Completer.use_jedi = False # to avoid autocompletion errors\n",
- "\n",
- "## Store all supported datasets, models and algorithms in a pandas dataframe \n",
- "df = pd.read_csv(filepath_or_buffer='supported_models.csv', header=0, names=['fusion_identifier', 'fusion_algo', 'dataset', 'model_spec_name', 'fl_model', 'model_ui'], skipinitialspace=True)\n",
- "# df.head()\n",
- "\n",
- "df_hyperparams = pd.read_json(path_or_buf='hyperparams_to_models_map.json')\n",
- "# df_hyperparams.head()\n",
- "\n",
- "ui_model_choices = df.model_ui.unique()\n",
- "uimodel_modelid_dict = {\n",
- " 'Keras': 'keras',\n",
- " 'PyTorch': 'pytorch',\n",
- " 'TensorFlow': 'tf',\n",
- " 'Scikit-learn': 'sklearn',\n",
- " 'None': 'none'\n",
- "}\n",
- "\n",
- "## dictionary object to store config parameters for run; TODO: replace with class later\n",
- "nb_config = {}\n",
- "### set defaults\n",
- "nb_config['split'] = {}\n",
- "nb_config['split']['ppp'] = 100\n",
- "nb_config['split']['method'] = 'Uniform Random Sampling'\n",
- "nb_config['parties'] = 5\n",
- "nb_config['quorum'] = 1\n",
- "\n",
- "exp_runner = ibmfl_runner.Runner()\n",
- "\n",
- "model_header = HTML(\n",
- " value='<{size}>Model details'.format(size='h4'),\n",
- " layout=Layout(width='auto', grid_area='model_header'))\n",
- "\n",
- "# Model Selection:\n",
- "model_dropdown = widgets.Dropdown(\n",
- " options=['Choose your model'] + list(ui_model_choices),\n",
- " description='Model:',\n",
- " disabled=False,\n",
- " layout=Layout(width='40%', grid_area='model_dr')\n",
- ")\n",
- "\n",
- "\n",
- "def model_dropdown_eventhandler(change):\n",
- " model_chosen = change.new\n",
- " global nb_config\n",
- " nb_config['model'] = model_chosen\n",
- "\n",
- "\n",
- "model_dropdown.observe(model_dropdown_eventhandler, names='value')\n",
- "\n",
- "\n",
- "dataset_header = HTML(value='<{size}>Dataset details'.format(size='h4'),\n",
- " layout=Layout(width='auto', grid_area='dataset_header'))\n",
- "\n",
- "\n",
- "dataset_dropdown = widgets.Dropdown(\n",
- " options=['Choose your dataset'],# + determine_allowed_datasets(),\n",
- " description='Dataset:',\n",
- " disabled=False,\n",
- " layout=Layout(width='80%',grid_area='dataset')\n",
- ")\n",
- "\n",
- "\n",
- "def update_supported_datasets(change):\n",
- " model_chosen = change.new\n",
- " rows_for_model = df[df.model_ui==model_chosen]\n",
- " dataset_dropdown.options = list(rows_for_model['dataset'].unique())\n",
- "\n",
- "\n",
- "model_dropdown.observe(update_supported_datasets, 'value')\n",
- "\n",
- "\n",
- "def dataset_dropdown_eventhandler(change):\n",
- " dataset_chosen = change.new\n",
- " global nb_config\n",
- " nb_config['dataset'] = dataset_chosen\n",
- "\n",
- "dataset_dropdown.observe(dataset_dropdown_eventhandler, names='value')\n",
- "\n",
- "\n",
- "# Data Splitting Strategy:\n",
- "splitting_dropdown = widgets.Box([\n",
- " widgets.Label(\n",
- " value='Data Split:',\n",
- " layout=Layout(width='auto')\n",
- " ),\n",
- " widgets.Dropdown(\n",
- " options=['Uniform Random Sampling', 'Stratified Sampling (per source class)'],\n",
- " disabled=False,\n",
- " layout=Layout(width = 'auto'),\n",
- " value='Uniform Random Sampling'\n",
- " )\n",
- "], grid_area='dataset_spl')\n",
- "\n",
- "\n",
- "def splitting_dropdown_eventhandler(change):\n",
- " split_chosen = change.new\n",
- " global nb_config\n",
- " stderr.clear_output()\n",
- " if split_chosen == '':\n",
- " with stderr:\n",
- " display('Please choose a how you\\'d like to split the data from the dropdown')\n",
- " nb_config.pop('split', None)\n",
- " else:\n",
- " nb_config['split']['method'] = split_chosen\n",
- "\n",
- "splitting_dropdown.children[1].observe(splitting_dropdown_eventhandler, names='value')\n",
- "\n",
- "\n",
- "# Points per party when splitting data:\n",
- "points_slider = widgets.Box([\n",
- " widgets.Label(\n",
- " value='Points from each party:',\n",
- " layout=Layout(width = 'auto')\n",
- " ),\n",
- " widgets.IntSlider(\n",
- " min=100,\n",
- " max=1000,\n",
- " layout=Layout(width='50%'),\n",
- " value=100\n",
- " )\n",
- "], grid_area='ppp')\n",
- "\n",
- "\n",
- "def points_slider_eventhandler(change):\n",
- " # print(change)\n",
- " ppp = change.new\n",
- " global nb_config\n",
- " stderr.clear_output()\n",
- " nb_config['split']['ppp'] = ppp\n",
- " \n",
- "\n",
- "points_slider.children[1].observe(points_slider_eventhandler, names='value')\n",
- "\n",
- "\n",
- "fusion_dropdown = widgets.Box([\n",
- " HTML(value='<{size}>Fusion Algorithm'.format(size='h4'),\n",
- " layout=Layout(width='auto')),\n",
- " widgets.Dropdown(\n",
- " options=['Choose your Fusion Algorithm'], disabled=False,\n",
- " layout=Layout(width='auto'))\n",
- "], grid_area='fusion_dr')\n",
- "\n",
- "\n",
- "def update_potential_fusion_algorithm(change):\n",
- " model_chosen = nb_config['model']\n",
- " dataset_chosen = nb_config['dataset']\n",
- " potential_algo = list(df[(df.model_ui == model_chosen) & (df.dataset == dataset_chosen)]['fusion_algo'].unique())\n",
- " fusion_dropdown.children[1].options = potential_algo\n",
- "\n",
- "\n",
- "model_dropdown.observe(update_potential_fusion_algorithm, 'value')\n",
- "dataset_dropdown.observe(update_potential_fusion_algorithm, 'value')\n",
- "\n",
- "\n",
- "def fusion_dropdown_eventhandler(change):\n",
- " fusion_algo_chosen = change.new\n",
- " global nb_config\n",
- " nb_config['fusion'] = fusion_algo_chosen\n",
- "\n",
- "\n",
- "fusion_dropdown.children[1].observe(fusion_dropdown_eventhandler, names='value')\n",
- "\n",
- "\n",
- "header_parties = HTML(value='<{size}>Participants'.format(size='h4'), layout=Layout(width='auto', grid_area='header_parties'))\n",
- "\n",
- "\n",
- "num_parties = widgets.Box([\n",
- " widgets.Label(\n",
- " value='Number of parties:',\n",
- " layout=Layout(width='auto')\n",
- " ),\n",
- " widgets.IntSlider(\n",
- " min=2,\n",
- " max=100,\n",
- " value=5,\n",
- " layout=Layout(width='50%')\n",
- " )\n",
- "], grid_area='parties')\n",
- "\n",
- "\n",
- "def num_parties_eventhandler(change):\n",
- " # print(change)\n",
- " parties = change.new\n",
- " global nb_config\n",
- " nb_config['parties'] = parties\n",
- "\n",
- "\n",
- "num_parties.children[1].observe(num_parties_eventhandler, names='value')\n",
- "\n",
- "\n",
- "parties_in_quorum = widgets.Box([\n",
- " widgets.Label(\n",
- " value='Number of parties in quorum',\n",
- " layout=Layout(width = 'auto')\n",
- " ),\n",
- " widgets.IntSlider(\n",
- " min=2,\n",
- " max=100,\n",
- " value=5,\n",
- " layout=Layout(width='50%')\n",
- " )\n",
- "], grid_area='parties')\n",
- "\n",
- "\n",
- "# quorum can have atmost all parties\n",
- "def update_quorum_range(*args):\n",
- " parties_in_quorum.children[1].max = num_parties.children[1].value\n",
- " parties_in_quorum.children[1].value = num_parties.children[1].value\n",
- "\n",
- "\n",
- "num_parties.children[1].observe(update_quorum_range, 'value')\n",
- "\n",
- "\n",
- "def parties_in_quorum_eventhandler(change):\n",
- " # print(change)\n",
- " quorum = change.new\n",
- " global nb_config\n",
- " nb_config['quorum'] = round(quorum/float(nb_config['parties']),2)\n",
- " \n",
- "\n",
- "parties_in_quorum.children[1].observe(parties_in_quorum_eventhandler, names='value')\n",
- "\n",
- "\n",
- "header_postproc = HTML(value='<{size}>Postprocessing Details'.format(size='h4'), \n",
- " layout=Layout(width='auto', grid_area='header_postproc'))\n",
- "\n",
- "\n",
- "hyperparams_dict = {}\n",
- "\n",
- "\n",
- "params_widgets = []\n",
- "\n",
- "\n",
- "gen_hyperparams = widgets.Box([\n",
- " HTML(value='<{size}>Hyperparameters'.format(size='h4'), layout=Layout(width='auto')),\n",
- " widgets.Button(\n",
- " description='Get Hyperparameters',\n",
- " disabled=False,\n",
- " button_style='warning',\n",
- " tooltip='Show available hyperparameters for the choices made',\n",
- " layout=Layout(width='auto', height='40px')\n",
- " )\n",
- "], grid_area='gen_hyper')\n",
- "\n",
- "\n",
- "confirmation_box = widgets.Box()\n",
- "\n",
- "\n",
- "hyperparams_text = widgets.Box()\n",
- "\n",
- "def populate_hyperparams(b):\n",
- " confirm_butn=widgets.Button(\n",
- " description='Confirm Hyperparameters',\n",
- " disabled=False,\n",
- " button_style='warning',\n",
- " tooltip='Saves the hyperparameter changes',\n",
- " layout=Layout(width='auto', height='40px'))\n",
- " determine_hyperparams()\n",
- " global params_widgets\n",
- " params_widgets.clear()\n",
- " generate_hyperparam_UI(hyperparams_dict)\n",
- " global hyperparams_text\n",
- " hyperparams_text.children = params_widgets\n",
- " confirmation_box.children = (confirm_butn,)\n",
- " [confirmation_box.children[i].on_click(confirmation_button_handler) for i in range(len(confirmation_box.children))]\n",
- "\n",
- "\n",
- "gen_hyperparams.children[1].on_click(populate_hyperparams)\n",
- "\n",
- "\n",
- "def determine_hyperparams():\n",
- " exp_df = df[(df.model_ui == nb_config['model']) & (df.dataset == nb_config['dataset']) & (df.fusion_algo == nb_config['fusion'])]\n",
- " if len(exp_df) != 1:\n",
- " # pick the first matching fusion algorithm\n",
- " # print('Found multiple matches, picking the first one')\n",
- " firstMatch = exp_df.iloc[0]\n",
- " # print(firstMatch)\n",
- " nb_config['fusion_identifier'] = firstMatch[0]\n",
- " else:\n",
- " # print(exp_df)\n",
- " nb_config['fusion_identifier'] = list(exp_df.fusion_identifier)[0]\n",
- " # print('fusion_id:', nb_config['fusion_identifier'])\n",
- " global hyperparams_dict\n",
- " model_hyperparams_key = nb_config['fusion_identifier'] + '_' + uimodel_modelid_dict[nb_config['model']] # to get hyperparams from df\n",
- " hyperparams_dict = df_hyperparams[df_hyperparams['model_identifier'] == model_hyperparams_key].hyperparams.values[0]\n",
- "\n",
- "\n",
- "# every model has at most two keys: global and local:\n",
- "def generate_hyperparam_UI(parameter_dict):\n",
- " # print(parameter_dict)\n",
- " for key in parameter_dict:\n",
- " if type(parameter_dict[key]) == 'dict':\n",
- " generate_hyperparam_UI(parameter_dict[key])\n",
- " else:\n",
- " global params_widgets\n",
- " params_widgets.append(widgets.Textarea(description=key, value=str(parameter_dict[key]), layout=Layout(width='400px', height='100px'), grid_area='hyperparams'))\n",
- "\n",
- "\n",
- "# Add fields for IP addresses\n",
- "local_or_remote = widgets.Box([\n",
- " HTML(value = '<{size}>Run this experiment locally or on remote machines?'.format(size='h4'),\n",
- " layout = Layout(width='auto')),\n",
- " widgets.Dropdown(\n",
- " options=['Choose your option','Run Locally', 'Run on Remote Machines'],\n",
- " description='',\n",
- " disabled=False,\n",
- " layout=Layout(width='200px')\n",
- " )\n",
- "])\n",
- "\n",
- "\n",
- "# dictionary for details of the run, which will get populated as fields get filled\n",
- "run_details = {}\n",
- "\n",
- "\n",
- "def network_details_tracker(change): \n",
- " value = change.new\n",
- " subkey = change.owner.description.split(':')[0].replace(' ', '_').lower()\n",
- " machine_key = change.owner.placeholder.split(' ')[-1]\n",
- " # update the run_details dict, depending on whether it already has some details:\n",
- " if len(run_details['machines'][machine_key].keys()) == 0:\n",
- " temp_dict = {}\n",
- " temp_dict[subkey] = value\n",
- " run_details['machines'][machine_key] = temp_dict\n",
- " else:\n",
- " temp_dict = run_details['machines'][machine_key]\n",
- " temp_dict[subkey] = value\n",
- " run_details['machines'][machine_key] = temp_dict\n",
- "\n",
- "\n",
- "def get_IPaddr_port(party_index=None):\n",
- " placeholder_suffix = ' for machine' + str(party_index)\n",
- "\n",
- " ip_addr = widgets.Text(value='', placeholder='IP Address' + placeholder_suffix, description='IP Address:')\n",
- " port_num = widgets.Text(value='', placeholder='Port Number' + placeholder_suffix, description='Port Number:')\n",
- " ssh_user = widgets.Text(value='', placeholder='ssh username' + placeholder_suffix, description='SSH user:')\n",
- " \n",
- " machine_detail_vbox = widgets.VBox(children=[ip_addr, port_num, ssh_user])\n",
- " [machine_detail_vbox.children[i].observe(network_details_tracker, 'value') for i in range(len(machine_detail_vbox.children))]\n",
- " return machine_detail_vbox\n",
- " \n",
- "\n",
- "def path_details_tracker(change):\n",
- " value = change.new\n",
- " subkey = change.owner.description.split(':')[0].replace(' ', '_').lower()\n",
- " if 'local' in change.owner.placeholder:\n",
- " # this is a local path, put within `experiments` key\n",
- " local_subkey = 'local_' + subkey\n",
- " run_details['experiments'][0][local_subkey] = value # there's only one trial for now\n",
- " else:\n",
- " # this is a machine path\n",
- " # update the run_details dict, depending on whether it already has some details:\n",
- " machine_key = change.owner.placeholder.split(' ')[-1] # to figure which machine is this for\n",
- " if len(run_details['machines'][machine_key].keys())==0:\n",
- " temp_dict = {}\n",
- " temp_dict[subkey] = value\n",
- " run_details['machines'][machine_key] = temp_dict\n",
- " else:\n",
- " temp_dict = run_details['machines'][machine_key]\n",
- " temp_dict[subkey] = value\n",
- " run_details['machines'][machine_key] = temp_dict\n",
- "\n",
- "\n",
- "def get_paths(party_index=None):\n",
- " if party_index is None:\n",
- " placeholder_suffix = ' for local machine'\n",
- " else:\n",
- " placeholder_suffix = ' for machine' + str(party_index)\n",
- "\n",
- " config_path = widgets.Text(value='', placeholder='Staging Dir' + placeholder_suffix, description='Staging Dir:')\n",
- " code_path = widgets.Text(value='', placeholder='IBMFL Dir' + placeholder_suffix, description='IBMFL Dir:')\n",
- " \n",
- " machine_detail_vbox = widgets.VBox(children=[config_path, code_path])\n",
- " [machine_detail_vbox.children[i].observe(path_details_tracker, 'value') for i in range(len(machine_detail_vbox.children))]\n",
- " return machine_detail_vbox\n",
- "\n",
- "\n",
- "networking_deets_box = widgets.VBox()\n",
- "\n",
- "\n",
- "def venv_box_isConda_handler(change):\n",
- " if change.new == 'Yes':\n",
- " run_details['machines']['venv_uses_conda'] = True\n",
- " else:\n",
- " run_details['machines']['venv_uses_conda'] = False\n",
- "\n",
- "\n",
- "def venv_box_venvPath_handler(change):\n",
- " run_details['machines']['venv_dir'] = change.new\n",
- "\n",
- "\n",
- "def display_conda_venv_fields():\n",
- " venv_box = widgets.HBox([\n",
- " widgets.RadioButtons(\n",
- " options=['Yes', 'No'],\n",
- " description='Use conda?'\n",
- " ),\n",
- " widgets.Text(\n",
- " value='',\n",
- " placeholder='.venv or conda env name',\n",
- " description='virtual env:',\n",
- " layout=Layout(width='300px', height='auto')\n",
- " )\n",
- " ])\n",
- " venv_box.children[0].observe(venv_box_isConda_handler, 'value')\n",
- " venv_box.children[1].observe(venv_box_venvPath_handler, 'value')\n",
- " return venv_box\n",
- "\n",
- "\n",
- "def run_details_text_handler(change):\n",
- " # print(change.new)\n",
- " global run_details\n",
- " try:\n",
- " run_details = json.loads(change.new)\n",
- " except JSONDecodeError:\n",
- " if change.new == '':\n",
- " pass\n",
- " else:\n",
- " display('Incorrect JSON passed for remote details, check and retry!')\n",
- " ## Todo: use an Output widget here so the message goes away once the input JSON is changed\n",
- "\n",
- "\n",
- "def machines_dropdown_eventhandler(change):\n",
- " # print(change.new)\n",
- " agg_machine = change.new.lower()\n",
- " run_details['experiments'][0]['agg_machine'] = agg_machine # there is only one trial for now\n",
- " party_machines = []\n",
- " for machine in run_details['machines']:\n",
- " party_machines.append(machine)\n",
- " \n",
- " # now remove the agg machine from the dict\n",
- " party_machines.remove(agg_machine)\n",
- " run_details['experiments'][0]['party_machines'] = party_machines # there is only one trial for now\n",
- "\n",
- "\n",
- "def display_run_details(change):\n",
- " change.owner.disabled = True\n",
- " run_details['machines'] = {}\n",
- " run_details['machines']['venv_uses_conda'] = True\n",
- " run_details['machines']['venv_dir'] = '.venv'\n",
- " run_details['experiments'] = []\n",
- " \n",
- " temp_exp_dict = {}\n",
- " temp_exp_dict['local_staging_dir'] = ''\n",
- " temp_exp_dict['local_ibmfl_dir'] = ''\n",
- " conda_fields = display_conda_venv_fields()\n",
- " \n",
- " if 'Remote' in change.new:\n",
- " ## remote execution\n",
- " ## initialise the run_details dictionary\n",
- " run_details['isLocalRun'] = False\n",
- "\n",
- " temp_exp_dict['agg_machine'] = ''\n",
- " temp_exp_dict['party_machines'] = []\n",
- "\n",
- " for eachMachine in range(nb_config['parties'] + 1):\n",
- " run_details['machines']['machine' + str(eachMachine + 1)] = {}\n",
- " run_details['machines']['machine' + str(eachMachine + 1)]['ip_address'] = ''\n",
- " run_details['machines']['machine' + str(eachMachine + 1)]['port_number'] = ''\n",
- " run_details['machines']['machine' + str(eachMachine + 1)]['ssh_username'] = ''\n",
- " run_details['machines']['machine' + str(eachMachine + 1)]['staging_dir'] = ''\n",
- " run_details['machines']['machine' + str(eachMachine + 1)]['ibmfl_dir'] = ''\n",
- "\n",
- " networking_header_1 = HTML(value='<{size}>Details for remote execution: Fill details into the textbox on the left or in individual fields on the right'.format(size='h4'), layout=Layout(width='auto'))\n",
- "\n",
- " run_details_box = widgets.VBox([\n",
- " widgets.Label(value='Machine details:', layout=Layout(width='auto')),\n",
- " widgets.Textarea(value=json.dumps(run_details, indent=4), layout=Layout(width='300px', height='700px'))\n",
- " ])\n",
- " run_details_box.children[1].observe(run_details_text_handler, 'value')\n",
- "\n",
- " networking_header_2=HTML(value = '<{size}>OR'.format(size='h3'), layout=Layout(width='auto', margin='5px 15px 5px 15px'))\n",
- "\n",
- " all_machines_tuple = ()\n",
- " for eachMachine in range(nb_config['parties'] + 1):\n",
- " machine_header = HTML(value='<{size}>Machine{id}'.format(size='h4', id=str(eachMachine+1)))\n",
- " temp_machine_box = widgets.VBox()\n",
- " machine_IP = get_IPaddr_port(eachMachine+1)\n",
- " machine_paths = get_paths(eachMachine+1)\n",
- " temp_machine_box.children = (machine_header, widgets.HBox(children=[machine_IP, machine_paths]))\n",
- " all_machines_tuple = all_machines_tuple + (temp_machine_box,)\n",
- " \n",
- " machines_dropdown = widgets.Box([\n",
- " widgets.Label(\n",
- " value='Pick machine for running Aggregator:',\n",
- " layout=Layout(width='auto')\n",
- " ),\n",
- " widgets.Dropdown(\n",
- " options=[''] + ['Machine{id}'.format(id=i+1) for i in range(nb_config['parties']+1)],\n",
- " layout=Layout(width='auto')\n",
- " )])\n",
- " \n",
- " machines_dropdown.children[1].observe(machines_dropdown_eventhandler, 'value')\n",
- "\n",
- " temp_local_vbox = widgets.VBox()\n",
- " local_header = HTML(value='<{size}>Local Directories'.format(size='h4'))\n",
- " local_path_fields = get_paths()\n",
- " temp_local_vbox.children = (local_header, local_path_fields)\n",
- "\n",
- " networking_fields_vbox = widgets.VBox(layout=Layout(width='auto', border='0.5px solid black'))\n",
- " networking_fields_vbox.children=(conda_fields,) + all_machines_tuple + (machines_dropdown, temp_local_vbox,)\n",
- " networking_deets_hbox = widgets.HBox(children=[run_details_box, networking_header_2, networking_fields_vbox])\n",
- " save_generate_butn.layout = Layout(width='185px', height='40px', margin='5px 50px 5px 400px')\n",
- " networking_deets_box.children=(networking_header_1, networking_deets_hbox, save_generate_butn,)\n",
- " run_details['experiments'].append(temp_exp_dict)\n",
- " \n",
- " else:\n",
- " ## local execution\n",
- " run_details['isLocalRun'] = True\n",
- " temp_exp_dict['agg_machine'] = 'local0'\n",
- " temp_exp_dict['party_machines'] = ['local{id}'.format(id=i+1) for i in range(nb_config['parties'])]\n",
- " \n",
- " ## setup dicts to populate IP addr and port number from generated configs later\n",
- " run_details['machines']['local0']={}\n",
- " for party in temp_exp_dict['party_machines']:\n",
- " run_details['machines'][party] = {}\n",
- " \n",
- " networking_header = HTML(value = '<{size}>Details for local execution'.format(size='h4'), layout=Layout(width='auto'))\n",
- "\n",
- " local_paths = get_paths()\n",
- " save_generate_butn.layout = Layout(width='185px', height='40px', margin='5px 50px 5px 50px')\n",
- " networking_deets_box.children=(networking_header, conda_fields, local_paths, save_generate_butn)\n",
- " \n",
- " run_details['experiments'].append(temp_exp_dict)\n",
- "\n",
- " \n",
- " with input_ui:\n",
- " display(partyDetails_grid)\n",
- " \n",
- "\n",
- "local_or_remote.children[1].observe(display_run_details, 'value')\n",
- "\n",
- "\n",
- "def display_configs_before_run(b):\n",
- " input_ui.clear_output()\n",
- " agg_conf_path, party_conf_path = generate_update_configs()\n",
- " if agg_conf_path is None or party_conf_path is None:\n",
- " with input_ui:\n",
- " display('Error generating configs. Exiting...')\n",
- " else:\n",
- " display_configs(agg_conf_path, party_conf_path)\n",
- " with input_ui:\n",
- " display(display_grid_1)\n",
- "\n",
- "\n",
- "save_generate_butn = widgets.Button(\n",
- " description='Proceed to generate configs',\n",
- " disabled=False,\n",
- " button_style='warning', # 'success', 'info', 'warning', 'danger' or ''\n",
- " tooltip='Generates config files from the above details',\n",
- " layout=Layout(width='185px', height='40px', margin='10px')\n",
- " )\n",
- "\n",
- "\n",
- "save_generate_butn.on_click(display_configs_before_run)\n",
- "\n",
- "\n",
- "def confirmation_button_handler(b):\n",
- " b.disabled = True\n",
- " b.description = 'Confirming hyperparams...'\n",
- " global params_widgets, nb_config\n",
- " for widget in params_widgets:\n",
- " nb_config[widget.description] = widget.value\n",
- " \n",
- " input_ui.clear_output()\n",
- "\n",
- " with input_ui:\n",
- " display(local_or_remote)\n",
- "\n",
- "\n",
- "def generate_update_configs():\n",
- " # Get timestamp and add it to the given local staging directory:\n",
- " nb_config['timestamp_str'] = ibmfl_runner.Runner().generate_timestamp()\n",
- " trial_dir = run_details['experiments'][0]['local_staging_dir'] + '/' + nb_config['timestamp_str']\n",
- " \n",
- " # Create the staging_directory:\n",
- " mkdir_cmd = 'mkdir -p ' + trial_dir\n",
- " process = subprocess.run(mkdir_cmd, shell=True,\n",
- " stdout=subprocess.PIPE,\n",
- " stderr=subprocess.PIPE)\n",
- " if process.returncode!=0:\n",
- " with input_ui:\n",
- " display('Erred: ', process.stderr)\n",
- " return None, None\n",
- " \n",
- " # Generate Data\n",
- " with input_ui:\n",
- " display('Generating Data...')\n",
- "\n",
- " cmd_to_run = 'cd ../../; python3 examples/generate_data.py --num_parties ' + str(nb_config['parties']) + ' -d ' + nb_config['dataset'] + ' -pp ' + str(nb_config['split']['ppp']) + ' -p ' + trial_dir # there's only one trial for now\n",
- " if 'Stratified' in nb_config['split']['method']:\n",
- " cmd_to_run = cmd_to_run + ' --stratify'\n",
- "\n",
- " # with input_ui:\n",
- " # display('Executing {}'.format(cmd_to_run))\n",
- " process = subprocess.run(cmd_to_run, shell=True,\n",
- " stdout=subprocess.PIPE, \n",
- " stderr=subprocess.PIPE)\n",
- " if process.returncode!=0:\n",
- " with input_ui:\n",
- " display('Erred: ', process.stderr)\n",
- " return None, None\n",
- "\n",
- " # path to get datasets from\n",
- " data_path = str(process.stdout).split('Data saved in')[-1].strip().replace('\\\\n\\'', '')\n",
- " with input_ui:\n",
- " display('Datasets saved to: {}'.format(data_path))\n",
- "\n",
- " # Generate Configs:\n",
- " with input_ui:\n",
- " display('Generating Configs...')\n",
- " if 'crypto' in nb_config['fusion_identifier']:\n",
- " # it is either of crypto keras or crypto_multiclass_keras, so need -crypto flags:\n",
- " # Todo: Need to let user pick one of {Paillier, ThresholdPaillier}\n",
- " cmd_to_run = 'cd ../../; python3 examples/generate_configs.py --num_parties ' + str(nb_config['parties']) + ' -f ' + nb_config['fusion_identifier'] + ' -m ' + uimodel_modelid_dict[nb_config['model']] + ' -crypto Paillier -d ' + nb_config['dataset'] + ' -p ' + data_path + ' --config_path ' + trial_dir # there's only one trial for now\n",
- " else:\n",
- " cmd_to_run = 'cd ../../; python3 examples/generate_configs.py --num_parties ' + str(nb_config['parties']) + ' -f ' + nb_config['fusion_identifier'] + ' -m ' + uimodel_modelid_dict[nb_config['model']] + ' -d ' + nb_config['dataset'] + ' -p ' + data_path + ' --config_path ' + trial_dir # there's only one trial for now\n",
- "\n",
- " # print('Executing {}'.format(cmd_to_run))\n",
- " process = subprocess.run(cmd_to_run, shell=True,\n",
- " stdout=subprocess.PIPE, \n",
- " stderr=subprocess.PIPE,\n",
- " universal_newlines=True)\n",
- " if process.returncode==0:\n",
- " # save agg and party configs path\n",
- " configs_path = os.path.dirname(process.stdout.split('\\n')[0].split(':')[1].strip())\n",
- " path_to_save_agg_configs = configs_path + '/config_agg.yml'\n",
- " print('Aggregator configs saved to: {}'.format(path_to_save_agg_configs))\n",
- " path_to_save_party_configs = configs_path + '/config_party*.yml'\n",
- " print('Party configs saved to: {}'.format(path_to_save_party_configs))\n",
- " else:\n",
- " with input_ui:\n",
- " display('Erred: ', process.stderr)\n",
- " return None, None\n",
- "\n",
- " # modify hyperparameter text to fix quotes\n",
- " hyp_text = nb_config['global']\n",
- " hyp_text = json.loads(hyp_text.replace('\\'', '\"'))\n",
- " nb_config['global'] = hyp_text\n",
- " hyp_text = nb_config['local']\n",
- " hyp_text = json.loads(hyp_text.replace('\\'', '\"'))\n",
- " if nb_config['fusion_identifier'] == 'fedplus':\n",
- " alpha = hyp_text['training'].pop('alpha')\n",
- " nb_config['local'] = hyp_text\n",
- " \n",
- " # add num_parties as a key under global, to match the structure in the agg yaml configs\n",
- " val = nb_config.pop('parties')\n",
- " nb_config['global']['num_parties'] = val\n",
- " val = nb_config.pop('quorum')\n",
- " nb_config['global']['perc_quorum'] = val\n",
- " \n",
- " # Load Aggregator Config\n",
- " with open(path_to_save_agg_configs, 'r') as stream:\n",
- " try:\n",
- " agg_config = yaml.safe_load(stream)\n",
- " except yaml.YAMLError as e:\n",
- " print(e)\n",
- " return None, None\n",
- "\n",
- " # for local runs, update the dirs to all the \"machines\" (they're all local)\n",
- " if run_details['isLocalRun']:\n",
- " run_details['machines']['ibmfl_dir'] = run_details['experiments'][0]['local_ibmfl_dir']\n",
- " run_details['machines']['staging_dir'] = run_details['experiments'][0]['local_staging_dir']\n",
- "\n",
- " # Modify aggregator config with values captured from the UI:\n",
- " # - update the hyperparameters object with newer global and local objects as updated above\n",
- " # - update ip and port from the run_details object\n",
- " # - TODO: Update model spec when uploading model file is supported\n",
- " agg_config['hyperparams']['global'] = nb_config['global']\n",
- " agg_config['hyperparams']['local'] = nb_config['local'] \n",
- " agg_machine = run_details['experiments'][0]['agg_machine'] # there's only one trial for now\n",
- "\n",
- " if not run_details['isLocalRun']:\n",
- " agg_config['connection']['info']['ip'] = run_details['machines'][agg_machine]['ip_address']\n",
- " agg_config['connection']['info']['port'] = int(run_details['machines'][agg_machine]['port_number'])\n",
- " else:\n",
- " run_details['machines'][agg_machine]['ip_address'] = agg_config['connection']['info']['ip']\n",
- " run_details['machines'][agg_machine]['port_number'] = agg_config['connection']['info']['port']\n",
- " run_details['machines'][agg_machine]['ssh_username'] = os.getenv('USER')\n",
- "\n",
- " \n",
- " # Write this updated yaml to file\n",
- " with open(path_to_save_agg_configs, 'w') as out:\n",
- " yaml.safe_dump(agg_config, out, default_flow_style=False)\n",
- " with input_ui:\n",
- " display('Updated Aggregator config at {}'.format(path_to_save_agg_configs))\n",
- "\n",
- "\n",
- " # Modify party config with values accepted from the UI\n",
- " # - update IP address, port for agg and party as received from the UI (only remote runs)\n",
- " # - add metrics section (both remote and local run)\n",
- " # - add alpha, if model chosen is Fed+\n",
- " if not run_details['isLocalRun']:\n",
- " currParty = 0\n",
- " for eachMachine in run_details['experiments'][0]['party_machines']: # there's only one trial for now\n",
- " # Load\n",
- " with open(path_to_save_party_configs.replace('*', str(currParty))) as stream:\n",
- " try:\n",
- " party_config = yaml.safe_load(stream)\n",
- " except yaml.YAMLError as e:\n",
- " print(e)\n",
- " return None, None\n",
- "\n",
- " agg_machine = run_details['experiments'][0]['agg_machine'] # there's only one trial for now\n",
- " # Modify\n",
- " party_config['aggregator']['ip'] = run_details['machines'][agg_machine]['ip_address']\n",
- " party_config['aggregator']['port'] = run_details['machines'][agg_machine]['port_number']\n",
- " \n",
- " party_config['connection']['info']['ip'] = run_details['machines'][eachMachine]['ip_address']\n",
- " party_config['connection']['info']['port'] = int(run_details['machines'][eachMachine]['port_number'])\n",
- " party_config['connection']['info']['port'] = int(run_details['machines'][eachMachine]['port_number'])\n",
- "\n",
- " # Metrics section to add to each party config\n",
- " party_config['metrics_recorder']={}\n",
- " party_config['metrics_recorder']['name'] = 'MetricsRecorder'\n",
- " party_config['metrics_recorder']['path'] = 'ibmfl.party.metrics.metrics_recorder'\n",
- " party_config['metrics_recorder']['output_file'] = '${config_dir}/metrics_party${id}'.replace('${config_dir}', run_details['machines'][eachMachine]['staging_dir']).replace('${id}', str(currParty))\n",
- " party_config['metrics_recorder']['output_type'] = 'json'\n",
- " party_config['metrics_recorder']['compute_pre_train_eval'] = False\n",
- " party_config['metrics_recorder']['compute_post_train_eval'] = True\n",
- "\n",
- " if nb_config['fusion_identifier'] == 'fedplus':\n",
- " party_config['local_training']['info']['alpha'] = alpha\n",
- "\n",
- " # Finally, write updated agg config to file\n",
- " with open(path_to_save_party_configs.replace('*', str(currParty)), 'w') as out:\n",
- " yaml.safe_dump(party_config, out, default_flow_style=False)\n",
- " currParty += 1\n",
- " else:\n",
- " currParty = 0\n",
- " for eachMachine in run_details['experiments'][0]['party_machines']: # there's only one trial for now\n",
- " # Load\n",
- " with open(path_to_save_party_configs.replace('*', str(currParty))) as stream:\n",
- " try:\n",
- " party_config = yaml.safe_load(stream)\n",
- " except yaml.YAMLError as e:\n",
- " print(e)\n",
- " return None, None\n",
- "\n",
- "\n",
- " # save IP addr and port number from the party config, into `run_details` dict, for runner's use\n",
- " run_details['machines'][eachMachine]['ip_address'] = party_config['connection']['info']['ip']\n",
- " run_details['machines'][eachMachine]['port_number'] = party_config['connection']['info']['port']\n",
- " run_details['machines'][eachMachine]['ssh_username'] = os.getenv('USER')\n",
- "\n",
- " # Metrics section to add to each party config\n",
- " party_config['metrics_recorder']={}\n",
- " party_config['metrics_recorder']['name'] = 'MetricsRecorder'\n",
- " party_config['metrics_recorder']['path'] = 'ibmfl.party.metrics.metrics_recorder'\n",
- " party_config['metrics_recorder']['output_file'] = '${config_dir}/metrics_party${id}'.replace('${config_dir}', trial_dir).replace('${id}', str(currParty))\n",
- " party_config['metrics_recorder']['output_type'] = 'json'\n",
- " party_config['metrics_recorder']['compute_pre_train_eval'] = False\n",
- " party_config['metrics_recorder']['compute_post_train_eval'] = True\n",
- "\n",
- " if nb_config['fusion_identifier'] == 'fedplus':\n",
- " party_config['local_training']['info']['alpha'] = alpha\n",
- " \n",
- " # Finally, write updated party config to file\n",
- " with open(path_to_save_party_configs.replace('*', str(currParty)), 'w') as out:\n",
- " yaml.safe_dump(party_config, out, default_flow_style=False)\n",
- "\n",
- " currParty += 1\n",
- "\n",
- " with input_ui:\n",
- " display('Updated Party configs at {}'.format(path_to_save_party_configs))\n",
- "\n",
- " nb_config['local_conf_dir'] = str(os.path.dirname(path_to_save_agg_configs))\n",
- "\n",
- " return path_to_save_agg_configs, path_to_save_party_configs\n",
- "\n",
- "config_box = widgets.VBox(layout=Layout(width='auto'))\n",
- "\n",
- "\n",
- "def display_configs(agg_conf_path, party_conf_path):\n",
- " # Display aggregator and party* configs\n",
- " display_header = HTML(value='<{size}>Configs Generated:'.format(size='h4'), layout=Layout(width='auto'))\n",
- "\n",
- " agg_conf_header = HTML(value='<{size}>Aggregator Config'.format(size='h4'), layout=Layout(width='auto'))\n",
- " agg_conf = widgets.Output(layout={'border': '0.5px solid black'})\n",
- "\n",
- " # read agg config from filesystem:\n",
- " with open(agg_conf_path) as stream:\n",
- " try:\n",
- " agg_config = yaml.safe_load(stream)\n",
- " except yaml.YAMLError as e:\n",
- " print(e)\n",
- "\n",
- " with agg_conf:\n",
- " display(agg_config)\n",
- "\n",
- " party_conf_header = HTML(value='<{size}>Party0 Config'.format(size='h4'), layout=Layout(width='auto'))\n",
- " party_conf = widgets.Output(layout={'border': '0.5px solid black'}) \n",
- "\n",
- " # read party0 from filesystem:\n",
- " with open(party_conf_path.replace('*', '0')) as stream:\n",
- " try:\n",
- " party_config = yaml.safe_load(stream)\n",
- " except yaml.YAMLError as e:\n",
- " print(e)\n",
- "\n",
- " ## display\n",
- " with party_conf:\n",
- " display(party_config)\n",
- "\n",
- " agg_box = widgets.HBox(children=[agg_conf_header, agg_conf], layout=Layout(width='auto', padding='20px'))\n",
- " party_box = widgets.HBox(children=[party_conf_header, party_conf], layout=Layout(width='auto', padding='10px'))\n",
- " party_disclmr_1 = HTML(value='Other parties follow config similar to Party0, except connection.info.[ip,port] and paths', \n",
- " layout=Layout(width='auto'))\n",
- " party_disclmr_2 = HTML(value='Also, each party gets a separate dataset file, split from the chosen dataset', \n",
- " layout=Layout(width='auto'))\n",
- " config_box.children=[display_header, agg_box, party_box, party_disclmr_1, party_disclmr_2, run_butn]\n",
- "\n",
- "\n",
- "run_butn = widgets.Button(\n",
- " description='Run Experiment',\n",
- " disabled=False,\n",
- " button_style='warning',\n",
- " tooltip='Runs the experiment with above config',\n",
- " layout=Layout(width='125px', height='40px', margin='5px 50px 5px 400px') # margin to position button centrally\n",
- " )\n",
- "\n",
- "\n",
- "monitoring_box = widgets.VBox()\n",
- "\n",
- "plot_button = widgets.Button(\n",
- " description='Show Charts',\n",
- " disabled=False,\n",
- " button_style='warning', # 'success', 'info', 'warning', 'danger' or ''\n",
- " tooltip='Displays the various plots for the experiment that ran',\n",
- " layout = Layout(width='120px', height='40px', margin='5px 50px 5px 400px') ## margin to position button centrally\n",
- " )\n",
- "\n",
- "\n",
- "def invoke_runner(b):\n",
- " b.disabled = True\n",
- " input_ui.clear_output()\n",
- " monitoring_out = widgets.Output(layout={'border': '0.5px solid black'})\n",
- " monitoring_box.children = [monitoring_out]\n",
- " display(display_grid_2)\n",
- "\n",
- " # some values needed by the Runner; there's only one trial for now\n",
- " run_details['experiments'][0]['shuffle_party_machines'] = False\n",
- " run_details['experiments'][0]['n_trials'] = 1\n",
- " run_details['experiments'][0]['n_parties'] = nb_config['global']['num_parties']\n",
- " run_details['experiments'][0]['n_rounds'] = nb_config['global']['rounds']\n",
- "\n",
- " # values for postprocessing and showing default metrics\n",
- " run_details['experiments'][0]['postproc_fn'] = {}\n",
- " run_details['experiments'][0]['postproc_fn'] = 'gen_reward_vs_time_plots'\n",
- " run_details['experiments'][0]['postproc_x_key'] = 'post_train:ts'\n",
- " run_details['experiments'][0]['postproc_y_keys'] = ['post_train:eval:loss', 'post_train:eval:acc']#, 'post_train:eval:precision weighted', 'post_train:eval:recall weighted']\n",
- "\n",
- " exp_machines = exp_runner.convert_machine_dict_from_nb_to_cli(run_details['machines'])\n",
- "\n",
- " for exp_info in run_details['experiments']:\n",
- " with open('{}/config_agg.yml'.format(nb_config['local_conf_dir']), 'r') as config_agg_file:\n",
- " config_agg = config_agg_file.read()\n",
- " config_parties = []\n",
- " for pi in range(exp_info['n_parties']):\n",
- " with open('{}/config_party{}.yml'.format(nb_config['local_conf_dir'], pi), 'r') as config_party_file:\n",
- " config_parties += [config_party_file.read()]\n",
- " with monitoring_out:\n",
- " display(exp_runner.run_experiment(exp_info, run_details['machines'],\n",
- " config_agg, config_parties, ui_mode='nb', ts=nb_config['timestamp_str']) \\\n",
- " or 'Finished!')\n",
- "\n",
- " if 'Keras' in nb_config['model']:\n",
- " monitoring_box.children = monitoring_box.children + (plot_button,)\n",
- " else:\n",
- " with monitoring_out:\n",
- " display('No plots to show for the chosen model')\n",
- "\n",
- "\n",
- "plots_box = widgets.VBox()\n",
- "\n",
- "\n",
- "def get_plots(b):\n",
- " b.disabled = True\n",
- " no_plots_for_these = ['Federated Averaging', 'Gradient Averaging', 'Probabilistic Federated Neural Matching']\n",
- " plots_out = widgets.Output(layout={'border': '0.5px solid black'})\n",
- " plots_box.children = [plots_out]\n",
- " display(display_grid_3)\n",
- " if nb_config['fusion'] in no_plots_for_these:\n",
- " with plots_out:\n",
- " display('Plots for chosen fusion algorithm are not supported yet') # metrics processing not in place\n",
- " else:\n",
- " # generate the plot\n",
- " with plots_out:\n",
- " display(exp_info = exp_runner.call_postproc_fn())\n",
- "\n",
- "\n",
- "run_butn.on_click(invoke_runner)\n",
- "\n",
- "\n",
- "plot_button.on_click(get_plots)\n",
- "\n",
- "\n",
- "# GridBox layout for UI\n",
- "grid = GridspecLayout(2,3)\n",
- "\n",
- "grid[0,:] = GridBox(children=[model_header, model_dropdown, #upload_model_file, \n",
- " dataset_header, dataset_dropdown, splitting_dropdown, points_slider,\n",
- " fusion_dropdown,\n",
- " header_parties, num_parties, parties_in_quorum,\n",
- "# header_postproc, postproc_func, postproc_xkey, postproc_ykeys,\n",
- " gen_hyperparams\n",
- " ],\n",
- " layout = Layout(\n",
- " width='100%',\n",
- " grid_template_rows='auto auto',\n",
- " grid_template_columns='48% 48%',\n",
- " grid_template_areas='''\n",
- " \"model_header model_header\"\n",
- " \"model_dr model_dr\"\n",
- " \"dataset_header dataset_header\"\n",
- " \"dataset dataset_spl\"\n",
- " \"fusion_dr fusion_dr\"\n",
- " \"header_parties header_parties\"\n",
- " \"parties parties\"\n",
- " \"gen_hyper gen_hyper\"\n",
- " ''')\n",
- " )\n",
- "# Nested grid to vary spacing across various widgets\n",
- "sub_grid_hyperparams = GridspecLayout(2,3)\n",
- "sub_grid_hyperparams[0,:] = hyperparams_text\n",
- "sub_grid_hyperparams[1,1] = confirmation_box\n",
- "\n",
- "grid[1, :] = sub_grid_hyperparams\n",
- "\n",
- "input_ui = widgets.Output()\n",
- "\n",
- "with input_ui:\n",
- " display(grid)\n",
- "\n",
- "# grid for displaying networking fields -- IP addr, port, ssh user, paths\n",
- "partyDetails_grid = GridspecLayout(1,3)\n",
- "partyDetails_grid[0, :] = networking_deets_box\n",
- "\n",
- "# grid for displaying generated configuration\n",
- "display_grid_1 = GridspecLayout(1,3)\n",
- "display_grid_1[0, :] = config_box\n",
- "\n",
- "# grid for displaying progress of running experiment\n",
- "display_grid_2 = GridspecLayout(1,1)\n",
- "display_grid_2[0, :] = monitoring_box\n",
- "\n",
- "# grid for displaying charts from collected metrics\n",
- "display_grid_3 = GridspecLayout(1,1)\n",
- "display_grid_3[0, :] = plots_box\n",
- "\n",
- "input_ui"
- ]
- }
- ],
- "metadata": {
- "kernelspec": {
- "display_name": "Python 3",
- "language": "python",
- "name": "python3"
- },
- "language_info": {
- "codemirror_mode": {
- "name": "ipython",
- "version": 3
- },
- "file_extension": ".py",
- "mimetype": "text/x-python",
- "name": "python",
- "nbconvert_exporter": "python",
- "pygments_lexer": "ipython3",
- "version": "3.6.8"
- },
- "varInspector": {
- "cols": {
- "lenName": 16,
- "lenType": 16,
- "lenVar": 40
- },
- "kernels_config": {
- "python": {
- "delete_cmd_postfix": "",
- "delete_cmd_prefix": "del ",
- "library": "var_list.py",
- "varRefreshCmd": "print(var_dic_list())"
- },
- "r": {
- "delete_cmd_postfix": ") ",
- "delete_cmd_prefix": "rm(",
- "library": "var_list.r",
- "varRefreshCmd": "cat(var_dic_list()) "
- }
- },
- "types_to_exclude": [
- "module",
- "function",
- "builtin_function_or_method",
- "instance",
- "_Feature"
- ],
- "window_display": false
- }
- },
- "nbformat": 4,
- "nbformat_minor": 4
-}
diff --git a/runner/exp_manager/images/ss_1.png b/runner/exp_manager/images/ss_1.png
deleted file mode 100644
index e24d880..0000000
Binary files a/runner/exp_manager/images/ss_1.png and /dev/null differ
diff --git a/runner/exp_manager/images/ss_10_i.png b/runner/exp_manager/images/ss_10_i.png
deleted file mode 100644
index 4eb89c3..0000000
Binary files a/runner/exp_manager/images/ss_10_i.png and /dev/null differ
diff --git a/runner/exp_manager/images/ss_10_ii.png b/runner/exp_manager/images/ss_10_ii.png
deleted file mode 100644
index b85b63f..0000000
Binary files a/runner/exp_manager/images/ss_10_ii.png and /dev/null differ
diff --git a/runner/exp_manager/images/ss_10_iii.png b/runner/exp_manager/images/ss_10_iii.png
deleted file mode 100644
index 1b20b8a..0000000
Binary files a/runner/exp_manager/images/ss_10_iii.png and /dev/null differ
diff --git a/runner/exp_manager/images/ss_2.png b/runner/exp_manager/images/ss_2.png
deleted file mode 100644
index 0e30ba8..0000000
Binary files a/runner/exp_manager/images/ss_2.png and /dev/null differ
diff --git a/runner/exp_manager/images/ss_3.png b/runner/exp_manager/images/ss_3.png
deleted file mode 100644
index 1f24d4c..0000000
Binary files a/runner/exp_manager/images/ss_3.png and /dev/null differ
diff --git a/runner/exp_manager/images/ss_4.png b/runner/exp_manager/images/ss_4.png
deleted file mode 100644
index 8f43dfd..0000000
Binary files a/runner/exp_manager/images/ss_4.png and /dev/null differ
diff --git a/runner/exp_manager/images/ss_5.png b/runner/exp_manager/images/ss_5.png
deleted file mode 100644
index f65cc7e..0000000
Binary files a/runner/exp_manager/images/ss_5.png and /dev/null differ
diff --git a/runner/exp_manager/images/ss_6.png b/runner/exp_manager/images/ss_6.png
deleted file mode 100644
index 3a698d2..0000000
Binary files a/runner/exp_manager/images/ss_6.png and /dev/null differ
diff --git a/runner/exp_manager/images/ss_7.png b/runner/exp_manager/images/ss_7.png
deleted file mode 100644
index cde4d70..0000000
Binary files a/runner/exp_manager/images/ss_7.png and /dev/null differ
diff --git a/runner/exp_manager/images/ss_8.png b/runner/exp_manager/images/ss_8.png
deleted file mode 100644
index 597147b..0000000
Binary files a/runner/exp_manager/images/ss_8.png and /dev/null differ
diff --git a/runner/exp_manager/images/ss_9_i.png b/runner/exp_manager/images/ss_9_i.png
deleted file mode 100644
index 6d9454d..0000000
Binary files a/runner/exp_manager/images/ss_9_i.png and /dev/null differ
diff --git a/runner/exp_manager/images/ss_9_ii.png b/runner/exp_manager/images/ss_9_ii.png
deleted file mode 100644
index 936c215..0000000
Binary files a/runner/exp_manager/images/ss_9_ii.png and /dev/null differ
diff --git a/runner/exp_manager/supported_models.csv b/runner/exp_manager/supported_models.csv
deleted file mode 100644
index 36b94ac..0000000
--- a/runner/exp_manager/supported_models.csv
+++ /dev/null
@@ -1,7 +0,0 @@
-fusion_identifier, fusion_algo, dataset, model_spec_name, fl_model, model_ui
-coordinate_median, Coordinate-wise Median, mnist, keras-cnn, KerasFLModel, Keras
-fedavg, Federated Averaging, mnist, keras-cnn, KerasFLModel, Keras
-gradient_aggregation, Gradient Averaging, mnist, keras-cnn, KerasFLModel, Keras
-iter_avg, Iterative Averaging, mnist, keras-cnn, KerasFLModel, Keras
-krum, Krum, mnist, keras-cnn, KerasFLModel, Keras
-pfnm, Probabilistic Federated Neural Matching, mnist, keras-fc, KerasFLModel, Keras
\ No newline at end of file
diff --git a/runner/exp_manager/usage_guide.md b/runner/exp_manager/usage_guide.md
deleted file mode 100644
index 47e259a..0000000
--- a/runner/exp_manager/usage_guide.md
+++ /dev/null
@@ -1,81 +0,0 @@
-# Experiment Manager Dashboard
-## Usage Directions:
-
-#### Step 1: Run the notebook
-Follow the instructions in the README.md file to get the notebook up and running in your browser.
-
-Run the only cell in the Notebook to get the widgets displayed as shown here:
-
-Fig. 1: Experiment Manager Dashboard
-
-For best results, delete any extra cells in the Notebook so the cell output can occupy the whole of the remaining window; otherwise the output area shrinks and you will have to scroll through it.
-
-#### Step 2: Choose the model
-IBMFL supports a variety of models, including _Keras_, _PyTorch_ and _Scikit-learn_; choose your preferred model via the dropdown shown in the figure below.
-
-
-Fig. 2: Choosing a Model from the dropdown
-
-While an option for users to upload their own model may be added in the future, at this point only built-in models are supported.
-
-#### Step 3: Choose dataset
-Once the model is chosen in the previous step, the other dropdowns are updated to reflect compatible options supported by the framework. Next, choose the dataset and the splitting strategy (_uniformly random_ or _stratified_) that determines how the data is split across parties.
-
-Fig. 3: Choosing the dataset
-
-
-#### Step 4: Select Fusion Algorithm
-Next, pick the Fusion Algorithm to be run for the experiment. The list shown in the dropdown reflects those supported for the Model and Dataset choices made in the previous steps.
-
-Fig. 4: Choosing the Fusion Algorithm
-
-
-#### Step 5: Select number of participating parties
-Next, choose the number of parties in the experiment using the slider on the left. You can also use the slider on the right to set the number of parties the aggregator waits on when collecting responses (the *quorum*). If left untouched, the quorum equals the number of parties chosen on the left.
-
-Fig. 5: Choosing the number of parties
-
-
-Click the `Get Hyperparameters` button to view and modify the hyperparameters corresponding to the choices made so far.
-
-Fig. 6: View/Modify the relevant hyperparameters
-
-
-Finally click on `Confirm Hyperparameters` to move on to the next screen.
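-
-For reference, the confirmed values end up under the `hyperparams` section of the generated aggregator config. A rough sketch of that structure, with purely illustrative values (the exact `local` keys depend on the chosen model and fusion algorithm):
-
-```python
-hyperparams = {
-    'global': {
-        'rounds': 3,          # number of fusion rounds run by the aggregator
-        'num_parties': 5,     # filled in from the parties slider
-        'perc_quorum': 1.0,   # filled in from the quorum slider
-    },
-    'local': {
-        'training': {'epochs': 3},  # per-party training settings (illustrative; model dependent)
-    },
-}
-```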
-
-#### Step 6: Run locally or on remote machines
-Next, choose whether the experiment should be run *locally*, i.e., on the same machine as the dashboard, or *remotely*, i.e., across several remote virtual machines. In this document, we walk through the `Run on Remote Machines` option, as it requires more details than the local run.
-
-Fig. 7: Run the experiment locally or on remote machines
-
-
-#### Step 7: Key in machine details
-For the remote run, the dashboard needs the `IP address`, `port number`, `SSH username`, the `IBMFL Dir` (the IBMFL project root directory) and the `Staging Dir` (where all configuration, datasets and logs should go) -- for each of the machines. Before filling these in, select the appropriate option depending on whether the machines use `conda`, and provide the corresponding virtual environment name or path.
-
-In the dropdown labelled `Pick machine for running Aggregator:`, select the machine where the Aggregator should be run. Finally, in the `Local Directories` section, add in the directories for the local `Staging Dir` and local `IBMFL Dir`. These should exist on the same machine where the dashboard Notebook is being run.
-
-All this information can be keyed in either through the fields on the right or via a JSON as shown on the left.
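-
-As a rough sketch, the structure behind that JSON mirrors the `run_details` dictionary the dashboard pre-fills -- one `machineN` entry per party plus one for the aggregator. All values below are placeholders:
-
-```python
-run_details = {
-    'isLocalRun': False,
-    'machines': {
-        'venv_uses_conda': True,
-        'venv_dir': '.venv',
-        'machine1': {'ip_address': '10.0.0.1', 'port_number': '5000', 'ssh_username': 'fluser',
-                     'staging_dir': '/home/fluser/staging', 'ibmfl_dir': '/home/fluser/ibmfl'},
-        # ... machine2, machine3, ... with the same fields
-    },
-    'experiments': [{
-        'local_staging_dir': '/path/on/dashboard/machine/staging',
-        'local_ibmfl_dir': '/path/on/dashboard/machine/ibmfl',
-        'agg_machine': 'machine1',
-        'party_machines': ['machine2', 'machine3'],
-    }],
-}
-```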
-
-Once done, click `Proceed to generate configs`.
-
-#### Step 8: View Aggregator and Party configs
-On the next screen, the `generate_data.py` and `generate_configs.py` scripts are invoked and the resulting aggregator and party0 config files are displayed, as shown here.
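-
-Under the hood, the dashboard builds and runs commands roughly like the following from the IBMFL project root (the party count, dataset, points per party, fusion identifier, model and paths are just illustrative examples of the earlier choices):
-
-```python
-# Data split: one dataset file per party, written under the staging directory.
-gen_data = ('python3 examples/generate_data.py --num_parties 5 -d mnist '
-            '-pp 200 -p /path/to/staging/<timestamp>')  # add --stratify for a stratified split
-
-# Config generation: one aggregator config plus one config per party.
-gen_configs = ('python3 examples/generate_configs.py --num_parties 5 -f iter_avg '
-               '-m keras-cnn -d mnist -p <generated_data_path> '
-               '--config_path /path/to/staging/<timestamp>')
-```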
-
-
-Fig. 8: Aggregator and Party0 configs
-
-If the configs look alright, click `Run Experiment` to get things running and monitor progress.
-
-#### Step 9: Monitor experiment and visualise results
-Once the experiment begins running, details about each connection are printed, along with progress bars showing the number of rounds completed out of the total and the number of party responses received in each round.
-
-Fig. 9: Monitoring progress during the experiment
-
-
-Once the experiment completes successfully, both bars turn green and, if post-processing of metrics is supported, a button labelled `Show Charts` becomes visible.
-
-On clicking the button, you'll see loss and accuracy line plots for each party.
-
-Fig. 10: Plots from the experiment
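-
-Under the hood, these charts are driven by a `metrics_recorder` section that the dashboard appends to every party config; a rough sketch of that section (the staging path and party index are illustrative):
-
-```python
-metrics_recorder = {
-    'name': 'MetricsRecorder',
-    'path': 'ibmfl.party.metrics.metrics_recorder',
-    'output_file': '/path/to/staging/<timestamp>/metrics_party0',  # one metrics file per party
-    'output_type': 'json',
-    'compute_pre_train_eval': False,
-    'compute_post_train_eval': True,
-}
-# The plots chart 'post_train:eval:loss' and 'post_train:eval:acc' against 'post_train:ts'.
-```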
-
----
\ No newline at end of file
diff --git a/runner/images/config_runner.png b/runner/images/config_runner.png
deleted file mode 100644
index 0f66f44..0000000
Binary files a/runner/images/config_runner.png and /dev/null differ
diff --git a/runner/images/max_train.png b/runner/images/max_train.png
deleted file mode 100644
index 41fef19..0000000
Binary files a/runner/images/max_train.png and /dev/null differ
diff --git a/runner/images/mean_eval.png b/runner/images/mean_eval.png
deleted file mode 100644
index f8d8dec..0000000
Binary files a/runner/images/mean_eval.png and /dev/null differ
diff --git a/runner/images/mean_train.png b/runner/images/mean_train.png
deleted file mode 100644
index d28f287..0000000
Binary files a/runner/images/mean_train.png and /dev/null differ