From 69c68901a2e7d251be5c43fa785cb44b24f1c875 Mon Sep 17 00:00:00 2001
From: timoeller
Date: Tue, 18 Feb 2020 14:56:29 +0100
Subject: [PATCH 1/2] Bugfix parameter loading through config, adjust configs, add bert2.0 eval configs

---
 .../germEval14_config.json                    | 66 +++++++++++++++++
 .../germEval18Coarse_config.json              | 61 ++++++++++++++++
 .../germEval18Fine_config.json                | 62 ++++++++++++++++
 experiments/ner/conll2003_de_config.json      |  2 +-
 experiments/ner/conll2003_en_config.json      |  2 +-
 experiments/ner/germEval14_config.json        |  6 +-
 .../text_classification/cola_config.json      |  2 +-
 .../germEval18Coarse_config.json              |  2 +-
 .../germEval18Fine_config.json                |  2 +-
 .../text_classification/gnad_config.json      |  2 +-
 .../xlm_roberta_eval/conll2003_de_config.json |  2 +-
 .../xlm_roberta_eval/germEval14_config.json   |  2 +-
 .../germEval18Coarse_config.json              |  2 +-
 farm/experiment.py                            | 11 ++-
 farm/file_utils.py                            | 72 ++++++------------
 15 files changed, 228 insertions(+), 68 deletions(-)
 create mode 100644 experiments/german-bert2.0-eval/germEval14_config.json
 create mode 100644 experiments/german-bert2.0-eval/germEval18Coarse_config.json
 create mode 100644 experiments/german-bert2.0-eval/germEval18Fine_config.json

diff --git a/experiments/german-bert2.0-eval/germEval14_config.json b/experiments/german-bert2.0-eval/germEval14_config.json
new file mode 100644
index 000000000..6aa05c750
--- /dev/null
+++ b/experiments/german-bert2.0-eval/germEval14_config.json
@@ -0,0 +1,66 @@
+{
+  "general": {
+    "cache_dir": {"value": null, "default": "", "desc": "Path for storing pre-trained models downloaded from s3."},
+    "data_dir": {"value": null, "default": "data/germeval14", "desc": "Input directory for downstream task. Should contain train + test (+ dev) files."},
+    "output_dir": {"value": null, "default": "saved_models", "desc": "Output directory where model predictions and checkpoints will be saved."},
+
+    "cuda": {"value": null, "default": true, "desc": "CUDA flag, uses CUDA if available."},
+    "local_rank": {"value": null, "default": -1, "desc": "If local_rank == -1 -> multiGPU mode on one machine, other values signal distributed computation across several nodes (apex install required)."},
+    "use_amp": {"value": null, "default": false, "desc": "Automatic mixed precision with APEX. Must be set to null to disable or to any optimisation level (see apex documentation). 'O1' is recommended."},
+    "seed": {"value": null, "default": 42, "desc": "Random seed for initializations."}
+  },
+
+  "task": {
+    "name": {"value": null, "default": "GermEval14", "desc": "Name of task."},
+    "type": {"value": null, "default": "ner"},
+    "language": {"value": null, "default": "de"},
+    "do_eval": {"value": null, "default": true, "desc": "Whether to run eval on the dev set."},
+    "do_train": {"value": null, "default": true, "desc": "Whether to run training. Can be used to only evaluate on an already trained model."},
+
+    "processor_name": {"value": null, "default": "NERProcessor", "desc": "A Dataprocessor that is suited for tabular data. Needs special data parameters defined."},
+    "dev_split": {"value": null, "default": 0.1, "desc": "Split a dev set from the training set using dev_split as proportion."},
+    "train_filename": {"value": null, "default": "train.txt", "desc": "Filename for training."},
+    "dev_filename": {"value": null, "default": null, "desc": "Filename for development. Missing in case of GermEval2018."},
+    "test_filename": {"value": null, "default": "test.txt", "desc": "Filename for testing. It is the submission file from the competition."},
+    "delimiter": {"value": null, "default": " ", "desc": "Delimiter used to separate columns in input data."},
+    "label_list": {"value": null, "default": ["[PAD]", "X", "O", "B-MISC", "I-MISC", "B-PER", "I-PER", "B-ORG", "I-ORG", "B-LOC", "I-LOC", "B-OTH", "I-OTH"], "desc": ""},
+    "metric": {"value": null, "default": "seq_f1", "desc": "Metric used. An F1 score tailored to sequences of labels."}
+  },
+
+  "parameter": {
+    "model": {"value": "bert-base-german-cased", "default": null, "desc": "Bert pre-trained model selected in the list: bert-base-uncased, bert-large-uncased, bert-base-cased, bert-large-cased, bert-base-multilingual-uncased, bert-base-multilingual-cased, bert-base-chinese."},
+    "prediction_head": {"value": null, "default": "TokenClassificationHead", "desc": "Kind of prediction head we use on top of Language Model"},
+    "lm_output_type": {"value": null, "default": "per_token", "desc": "Language Model output."},
+    "lower_case": {"value": null, "default": false, "desc": "Set to true if you are using an uncased model."},
+    "max_seq_len": {"value": null, "default": 128, "desc": "The maximum total input sequence length after WordPiece tokenization."},
+    "balance_classes": {"value": null, "default": false, "desc": "Balance classes using weighted CrossEntropyLoss."},
+
+    "epochs": {"value": null, "default": 4, "desc": "Total number of training epochs to perform."},
+    "batch_size": {"value": null, "default": 64, "desc": "Total batch size for training on a single V100 GPU."},
+    "gradient_accumulation_steps": {"value": null, "default": 1, "desc": "Number of update steps (batches) to accumulate before performing a backward/update pass."},
+    "embeds_dropout_prob": {"value": null, "default": 0.1, "desc": "Strength of dropout to be applied to the word embeddings generated by the language model."},
+    "layer_dims": {"value": null, "default": [768, 15], "desc": "Cannot do experiments on this value, since it is already a list. Dimensions of the prediction head. Needs to be of type String, otherwise it gets iterated over."}
+  },
+
+  "optimizer": {
+    "learning_rate": {"value": null, "default": 5e-5, "desc": "The learning rate for the optimizer."},
+    "optimizer_opts": {"value": null, "default": null, "desc": "Additional optimizer config."},
+    "schedule_opts": {"value": null, "default": {"name": "LinearWarmup", "warmup_proportion": 0.4}, "desc": "opts for lr schedule"}
+  },
+
+  "logging": {
+    "eval_every": {"value": null, "default": 60, "desc": "Steps per training loop (batches) required for evaluation on dev set. Set to 0 when you do not want to do evaluation on dev set during training."},
+    "mlflow_url": {"value": "https://public-mlflow.deepset.ai/", "default": null, "desc": "Mlflow server for tracking experiments (e.g. http://80.123.45.167:5000/)"},
+    "mlflow_nested": {"value": null, "default": true, "desc": "Nesting mlflow experiments. For doing multiple runs across a set of hyperparameters."},
+
+    "mlflow_experiment": {"value": "Public_FARM", "default": null, "desc": "Experiment name used for mlflow"},
+    "mlflow_run_name": {"value": "germeval14 by config", "default": null, "desc": "Name of the particular run for mlflow"}
+  }
+}
+
+
+
+
+
+
+
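Note on the schema above: every entry carries a value/default/desc triple, and at load time a parameter resolves to "value" when that is non-null, falling back to "default" otherwise — the rule that the read_config change further down implements. A minimal standalone sketch of that resolution, assuming only the two-level section/parameter layout shown above (resolve_config is a hypothetical helper, not part of FARM):

    import json

    def resolve_config(path):
        # Resolve each {"value": ..., "default": ..., "desc": ...} triple to a
        # single setting: take "value" if it is set, else fall back to "default".
        with open(path) as f:
            conf = json.load(f)
        for section, params in conf.items():
            for name, spec in params.items():
                conf[section][name] = spec["value"] if spec["value"] is not None else spec["default"]
        return conf

    # resolve_config("experiments/german-bert2.0-eval/germEval14_config.json")["task"]["name"]
    # -> "GermEval14" (taken from "default", since "value" is null)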
diff --git a/experiments/german-bert2.0-eval/germEval18Coarse_config.json b/experiments/german-bert2.0-eval/germEval18Coarse_config.json
new file mode 100644
index 000000000..daa0e465b
--- /dev/null
+++ b/experiments/german-bert2.0-eval/germEval18Coarse_config.json
@@ -0,0 +1,61 @@
+{
+  "general": {
+    "cache_dir": {"value": null, "default": "", "desc": "Path for storing pre-trained models downloaded from s3."},
+    "data_dir": {"value": null, "default": "data/germeval18", "desc": "Input directory for downstream task. Should contain train + test (+ dev) files."},
+    "output_dir": {"value": null, "default": "saved_models", "desc": "Output directory where model predictions and checkpoints will be saved."},
+
+    "cuda": {"value": null, "default": true, "desc": "CUDA flag, uses CUDA if available."},
+    "local_rank": {"value": null, "default": -1, "desc": "If local_rank == -1 -> multiGPU mode on one machine, other values signal distributed computation across several nodes (apex install required)."},
+    "use_amp": {"value": null, "default": null, "desc": "Automatic mixed precision with APEX. Must be set to null to disable or to any optimisation level (see apex documentation). 'O1' is recommended."},
+    "seed": {"value": null, "default": 42, "desc": "Random seed for initializations."}
+  },
+
+  "task": {
+    "type": {"value": null, "default": "text_classification"},
+    "language": {"value": null, "default": "de"},
+    "name": {"value": null, "default": "GermEval18Coarse", "desc": "GermEval18Coarse: binary offensive language detection."},
+    "do_eval": {"value": null, "default": true, "desc": "Whether to run eval on the dev set."},
+    "do_train": {"value": null, "default": true, "desc": "Whether to run training. Can be used to only evaluate on an already trained model."},
+
+    "processor_name": {"value": null, "default": "TextClassificationProcessor", "desc": "A Dataprocessor that is suited for tabular data. Needs special data parameters defined."},
+    "dev_split": {"value": null, "default": 0.1, "desc": "Split a dev set from the training set using dev_split as proportion."},
+    "train_filename": {"value": null, "default": "train.tsv", "desc": "Filename for training."},
+    "dev_filename": {"value": null, "default": null, "desc": "Filename for development. Missing in case of GermEval2018."},
+    "test_filename": {"value": null, "default": "test.tsv", "desc": "Filename for testing. It is the submission file from the competition."},
+    "delimiter": {"value": null, "default": "\t", "desc": "Delimiter used to separate columns in input data."},
+    "columns": {"value": null, "default": ["text", "label", "unused"], "desc": "Columns specifying position of text and labels in data files."},
+    "label_list": {"value": null, "default": ["OTHER", "OFFENSE"], "desc": "List of possible labels."},
+    "metric": {"value": null, "default": "f1_macro", "desc": "Metric used. The competition uses a macro-averaged F1 score."},
+    "label_column_name": {"value": null, "default": "coarse_label", "desc": "Name of the field that the label comes from in the datasource"},
+    "skiprows": {"value": null, "default": null, "desc": ""}
+  },
+  "parameter": {
+    "model": {"value": "bert-base-german-cased", "default": null, "desc": "Bert pre-trained model selected in the list: bert-base-uncased, bert-large-uncased, bert-base-cased, bert-large-cased, bert-base-multilingual-uncased, bert-base-multilingual-cased, bert-base-chinese."},
+    "prediction_head": {"value": null, "default": "TextClassificationHead", "desc": "Kind of prediction head we use on top of Language Model"},
+    "lm_output_type": {"value": null, "default": "per_sequence", "desc": "Language Model output."},
+    "lower_case": {"value": null, "default": false, "desc": "Set to true if you are using an uncased model."},
+    "max_seq_len": {"value": null, "default": 150, "desc": "The maximum total input sequence length after WordPiece tokenization. 128 was too short for some texts."},
+    "balance_classes": {"value": null, "default": true, "desc": "Balance classes using weighted CrossEntropyLoss. Original train set from GermEval18 is skewed and the final evaluation is macro averaged, so we need to balance for optimal performance."},
+
+    "epochs": {"value": null, "default": 2.0, "desc": "Total number of training epochs to perform."},
+    "batch_size": {"value": null, "default": 48, "desc": "Total batch size for training on a single V100 GPU."},
+    "gradient_accumulation_steps": {"value": null, "default": 1, "desc": "Number of update steps (batches) to accumulate before performing a backward/update pass."},
+    "embeds_dropout_prob": {"value": null, "default": 0.1, "desc": "Strength of dropout to be applied to the word embeddings generated by the language model."},
+    "layer_dims": {"value": null, "default": [768, 2], "desc": "Cannot do experiments on this value, since it is already a list. Dimensions of the prediction head. Needs to be of type String, otherwise it gets iterated over."}
+  },
+  "optimizer": {
+    "learning_rate": {"value": null, "default": 2e-5, "desc": "The initial learning rate for AdamW."},
+    "optimizer_opts": {"value": null, "default": null, "desc": "Additional optimizer config."},
+    "schedule_opts": {"value": null, "default": {"name": "LinearWarmup", "warmup_proportion": 0.2}, "desc": "opts for lr schedule"}
+  },
+  "logging": {
+    "eval_every": {"value": null, "default": 30, "desc": "Steps per training loop (batches) required for evaluation on dev set. Set to 0 when you do not want to do evaluation on dev set during training."},
+    "mlflow_url": {"value": "https://public-mlflow.deepset.ai/", "default": null, "desc": "Mlflow server for tracking experiments (e.g. http://80.123.45.167:5000/)"},
+    "mlflow_nested": {"value": null, "default": true, "desc": "Nesting mlflow experiments. For doing multiple runs across a set of hyperparameters."},
+
+    "mlflow_experiment": {"value": "Public_FARM", "default": null, "desc": "Experiment name used for mlflow"},
+    "mlflow_run_name": {"value": "germeval coarse by config", "default": null, "desc": "Name of the particular run for mlflow"}
+  }
+}
+
+
diff --git a/experiments/german-bert2.0-eval/germEval18Fine_config.json b/experiments/german-bert2.0-eval/germEval18Fine_config.json
new file mode 100644
index 000000000..213b61cc9
--- /dev/null
+++ b/experiments/german-bert2.0-eval/germEval18Fine_config.json
@@ -0,0 +1,62 @@
+{
+  "general": {
+    "cache_dir": {"value": null, "default": "", "desc": "Path for storing pre-trained models downloaded from s3."},
+    "data_dir": {"value": null, "default": "data/germeval18", "desc": "Input directory for downstream task. Should contain train + test (+ dev) files."},
+    "output_dir": {"value": null, "default": "saved_models", "desc": "Output directory where model predictions and checkpoints will be saved."},
+
+    "cuda": {"value": null, "default": true, "desc": "CUDA flag, uses CUDA if available."},
+    "local_rank": {"value": null, "default": -1, "desc": "If local_rank == -1 -> multiGPU mode on one machine, other values signal distributed computation across several nodes (apex install required)."},
+    "use_amp": {"value": null, "default": null, "desc": "Automatic mixed precision with APEX. Must be set to null to disable or to any optimisation level (see apex documentation). 'O1' is recommended."},
+    "seed": {"value": null, "default": 42, "desc": "Random seed for initializations."}
+  },
+
+  "task": {
+    "type": {"value": null, "default": "text_classification"},
+    "language": {"value": null, "default": "de"},
+    "name": {"value": null, "default": "GermEval18Fine", "desc": "GermEval18Fine: Fine-grained multiclass offensive language detection, 4 classes."},
+    "do_eval": {"value": null, "default": true, "desc": "Whether to run eval on the dev set."},
+    "do_train": {"value": null, "default": true, "desc": "Whether to run training. Can be used to only evaluate on an already trained model."},
+
+    "processor_name": {"value": null, "default": "TextClassificationProcessor", "desc": "A Dataprocessor that is suited for tabular data. Needs special data parameters defined."},
+    "dev_split": {"value": null, "default": 0.1, "desc": "Split a dev set from the training set using dev_split as proportion."},
+    "train_filename": {"value": null, "default": "train.tsv", "desc": "Filename for training."},
+    "dev_filename": {"value": null, "default": null, "desc": "Filename for development. Missing in case of GermEval2018."},
+    "test_filename": {"value": null, "default": "test.tsv", "desc": "Filename for testing. It is the submission file from the competition."},
+    "delimiter": {"value": null, "default": "\t", "desc": "Delimiter used to separate columns in input data."},
+    "columns": {"value": null, "default": ["text", "unused", "label"], "desc": "Columns specifying position of text and labels in data files."},
+    "label_list": {"value": null, "default": ["OTHER", "INSULT", "ABUSE", "PROFANITY"], "desc": "List of possible labels."},
+    "metric": {"value": null, "default": "f1_macro", "desc": "Metric used. The competition uses a macro-averaged F1 score."},
+    "label_column_name": {"value": null, "default": "fine_label", "desc": "Name of the field that the label comes from in the datasource"},
+    "skiprows": {"value": null, "default": null, "desc": ""}
+  },
+
+  "parameter": {
+    "model": {"value": "bert-base-german-cased", "default": null, "desc": "Bert pre-trained model selected in the list: bert-base-uncased, bert-large-uncased, bert-base-cased, bert-large-cased, bert-base-multilingual-uncased, bert-base-multilingual-cased, bert-base-chinese."},
+    "prediction_head": {"value": null, "default": "TextClassificationHead", "desc": "Kind of prediction head we use on top of Language Model"},
+    "lm_output_type": {"value": null, "default": "per_sequence", "desc": "Language Model output."},
+    "lower_case": {"value": null, "default": false, "desc": "Set to true if you are using an uncased model."},
+    "max_seq_len": {"value": null, "default": 150, "desc": "The maximum total input sequence length after WordPiece tokenization. 128 was too short for some texts."},
+    "balance_classes": {"value": null, "default": true, "desc": "Balance classes using weighted CrossEntropyLoss. Original train set from GermEval18 is skewed and the final evaluation is macro averaged, so we need to balance for optimal performance."},
+
+    "epochs": {"value": null, "default": 3.0, "desc": "Total number of training epochs to perform."},
+    "batch_size": {"value": null, "default": 48, "desc": "Total batch size for training on a single V100 GPU."},
+    "gradient_accumulation_steps": {"value": null, "default": 1, "desc": "Number of update steps (batches) to accumulate before performing a backward/update pass."},
+    "embeds_dropout_prob": {"value": null, "default": 0.1, "desc": "Strength of dropout to be applied to the word embeddings generated by the language model."},
+    "layer_dims": {"value": null, "default": [768, 4], "desc": "Cannot do experiments on this value, since it is already a list. Dimensions of the prediction head. Needs to be of type String, otherwise it gets iterated over."}
+  },
+  "optimizer": {
+    "learning_rate": {"value": null, "default": 5e-5, "desc": "The initial learning rate for AdamW."},
+    "optimizer_opts": {"value": null, "default": null, "desc": "Additional optimizer config."},
+    "schedule_opts": {"value": null, "default": {"name": "LinearWarmup", "warmup_proportion": 0.2}, "desc": "opts for lr schedule"}
+  },
+  "logging": {
+    "eval_every": {"value": null, "default": 30, "desc": "Steps per training loop (batches) required for evaluation on dev set. Set to 0 when you do not want to do evaluation on dev set during training."},
+    "mlflow_url": {"value": "https://public-mlflow.deepset.ai/", "default": null, "desc": "Mlflow server for tracking experiments (e.g. http://80.123.45.167:5000/)"},
+    "mlflow_nested": {"value": null, "default": true, "desc": "Nesting mlflow experiments. For doing multiple runs across a set of hyperparameters."},
+
+    "mlflow_experiment": {"value": "Public_FARM", "default": null, "desc": "Experiment name used for mlflow"},
+    "mlflow_run_name": {"value": "germeval fine by config", "default": null, "desc": "Name of the particular run for mlflow"}
+  }
+}
+
+
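The three new eval configs above are consumed through FARM's experiment runner. With the loader fix in this patch (see farm/experiment.py below), running one of them boils down to something like the following sketch, assuming the load_experiments/run_experiment functions as touched by this diff:

    from farm.experiment import load_experiments, run_experiment

    # One config file may expand into several experiments if any parameter
    # holds a list of values (see unnestConfig in farm/file_utils.py below).
    experiments = load_experiments("experiments/german-bert2.0-eval/germEval18Coarse_config.json")
    for args in experiments:
        run_experiment(args)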
diff --git a/experiments/ner/conll2003_de_config.json b/experiments/ner/conll2003_de_config.json
index 3d1e538c5..0c563b906 100644
--- a/experiments/ner/conll2003_de_config.json
+++ b/experiments/ner/conll2003_de_config.json
@@ -34,7 +34,7 @@
     "max_seq_len": {"value": null, "default": 128, "desc": "The maximum total input sequence length after WordPiece tokenization. Some GNAD texts even extend beyond 512 tokens."},
     "balance_classes": {"value": null, "default": false, "desc": "Balance classes using weighted CrossEntropyLoss."},
     "epochs": {"value": null, "default": 2, "desc": "Total number of training epochs to perform."},
-    "batch_size": {"value": null, "default": 64, "desc": "Total batch size for training on one V100 GPU. If using multiGPU, the total batch size will be automatically adjusted."},
+    "batch_size": {"value": null, "default": 64, "desc": "Total batch size for training on a single V100 GPU."},
     "gradient_accumulation_steps": {"value": null, "default": 1, "desc": "Number of update steps (batches) to accumulate before performing a backward/update pass."},
     "embeds_dropout_prob": {"value": null, "default": 0.1, "desc": "Strength of dropout to be applied to the word embeddings generated by the language model."},
     "layer_dims": {"value": null, "default": [768, 15], "desc": "Cannot do experiments on this value, since it is already a list. Dimensions of the prediction head. Needs to be of type String, otherwise it gets iterated over."}
diff --git a/experiments/ner/conll2003_en_config.json b/experiments/ner/conll2003_en_config.json
index 7dd4c0364..1f5282a33 100644
--- a/experiments/ner/conll2003_en_config.json
+++ b/experiments/ner/conll2003_en_config.json
@@ -36,7 +36,7 @@
     "balance_classes": {"value": null, "default": false, "desc": "Balance classes using weighted CrossEntropyLoss."},
 
     "epochs": {"value": null, "default": 2, "desc": "Total number of training epochs to perform."},
-    "batch_size": {"value": null, "default": 64, "desc": "Total batch size for training on one V100 GPU. If using multiGPU, the total batch size will be automatically adjusted."},
+    "batch_size": {"value": null, "default": 64, "desc": "Total batch size for training on a single V100 GPU."},
     "gradient_accumulation_steps": {"value": null, "default": 1, "desc": "Number of update steps (batches) to accumulate before performing a backward/update pass."},
     "embeds_dropout_prob": {"value": null, "default": 0.1, "desc": "Strength of dropout to be applied to the word embeddings generated by the language model."},
     "layer_dims": {"value": null, "default": [768, 15], "desc": "Cannot do experiments on this value, since it is already a list. Dimensions of the prediction head. Needs to be of type String, otherwise it gets iterated over."}
diff --git a/experiments/ner/germEval14_config.json b/experiments/ner/germEval14_config.json
index 625f01c91..6aa05c750 100644
--- a/experiments/ner/germEval14_config.json
+++ b/experiments/ner/germEval14_config.json
@@ -18,9 +18,9 @@
     "do_train": {"value": null, "default": true, "desc": "Whether to run training. Can be used to only evaluate on an already trained model."},
 
     "processor_name": {"value": null, "default": "NERProcessor", "desc": "A Dataprocessor that is suited for tabular data. Needs special data parameters defined."},
-    "dev_split": {"value": null, "default": 0.0, "desc": "Split a dev set from the training set using dev_split as proportion."},
+    "dev_split": {"value": null, "default": 0.1, "desc": "Split a dev set from the training set using dev_split as proportion."},
     "train_filename": {"value": null, "default": "train.txt", "desc": "Filename for training."},
-    "dev_filename": {"value": null, "default": "dev.txt", "desc": "Filename for development. Missing in case of GermEval2018."},
+    "dev_filename": {"value": null, "default": null, "desc": "Filename for development. Missing in case of GermEval2018."},
     "test_filename": {"value": null, "default": "test.txt", "desc": "Filename for testing. It is the submission file from the competition."},
     "delimiter": {"value": null, "default": " ", "desc": "Delimiter used to separate columns in input data."},
     "label_list": {"value": null, "default": ["[PAD]", "X", "O", "B-MISC", "I-MISC", "B-PER", "I-PER", "B-ORG", "I-ORG", "B-LOC", "I-LOC", "B-OTH", "I-OTH"], "desc": ""},
@@ -36,7 +36,7 @@
     "balance_classes": {"value": null, "default": false, "desc": "Balance classes using weighted CrossEntropyLoss."},
 
     "epochs": {"value": null, "default": 4, "desc": "Total number of training epochs to perform."},
-    "batch_size": {"value": null, "default": 64, "desc": "Total batch size for training on one V100 GPU. If using multiGPU, the total batch size will be automatically adjusted."},
+    "batch_size": {"value": null, "default": 64, "desc": "Total batch size for training on a single V100 GPU."},
     "gradient_accumulation_steps": {"value": null, "default": 1, "desc": "Number of update steps (batches) to accumulate before performing a backward/update pass."},
     "embeds_dropout_prob": {"value": null, "default": 0.1, "desc": "Strength of dropout to be applied to the word embeddings generated by the language model."},
     "layer_dims": {"value": null, "default": [768, 15], "desc": "Cannot do experiments on this value, since it is already a list. Dimensions of the prediction head. Needs to be of type String, otherwise it gets iterated over."}
diff --git a/experiments/text_classification/cola_config.json b/experiments/text_classification/cola_config.json
index 140908991..72aca0bc7 100644
--- a/experiments/text_classification/cola_config.json
+++ b/experiments/text_classification/cola_config.json
@@ -37,7 +37,7 @@
     "max_seq_len": {"value": null, "default": 64, "desc": "The maximum total input sequence length after WordPiece tokenization. Some GNAD texts even extend beyond 512 tokens."},
     "balance_classes": {"value": null, "default": false, "desc": "Balance classes using weighted CrossEntropyLoss."},
     "epochs": {"value": null, "default": 2, "desc": "Total number of training epochs to perform."},
-    "batch_size": {"value": null, "default": 100, "desc": "Total batch size for training for single GPU v100. Only low values possible because of large sequence length. If using multiGPU, the total batch size will be automatically adjusted."},
+    "batch_size": {"value": null, "default": 100, "desc": ""},
     "gradient_accumulation_steps": {"value": null, "default": 1, "desc": "Number of update steps (batches) to accumulate before performing a backward/update pass."},
     "embeds_dropout_prob": {"value": null, "default": 0.1, "desc": "Strength of dropout to be applied to the word embeddings generated by the language model."},
     "layer_dims": {"value": null, "default": [768, 2], "desc": "Cannot do experiments on this value, since it is already a list. Dimensions of the prediction head. Needs to be of type String, otherwise it gets iterated over."}
diff --git a/experiments/text_classification/germEval18Coarse_config.json b/experiments/text_classification/germEval18Coarse_config.json
index 8a80f0a67..daa0e465b 100644
--- a/experiments/text_classification/germEval18Coarse_config.json
+++ b/experiments/text_classification/germEval18Coarse_config.json
@@ -38,7 +38,7 @@
     "balance_classes": {"value": null, "default": true, "desc": "Balance classes using weighted CrossEntropyLoss. Original train set from GermEval18 is skewed and the final evaluation is macro averaged, so we need to balance for optimal performance."},
 
     "epochs": {"value": null, "default": 2.0, "desc": "Total number of training epochs to perform."},
-    "batch_size": {"value": null, "default": 48, "desc": "Total batch size for training for single GPU v100. If using multiGPU, the total batch size will be automatically adjusted."},
+    "batch_size": {"value": null, "default": 48, "desc": "Total batch size for training on a single V100 GPU."},
     "gradient_accumulation_steps": {"value": null, "default": 1, "desc": "Number of update steps (batches) to accumulate before performing a backward/update pass."},
     "embeds_dropout_prob": {"value": null, "default": 0.1, "desc": "Strength of dropout to be applied to the word embeddings generated by the language model."},
     "layer_dims": {"value": null, "default": [768, 2], "desc": "Cannot do experiments on this value, since it is already a list. Dimensions of the prediction head. Needs to be of type String, otherwise it gets iterated over."}
diff --git a/experiments/text_classification/germEval18Fine_config.json b/experiments/text_classification/germEval18Fine_config.json
index 703ae1400..213b61cc9 100644
--- a/experiments/text_classification/germEval18Fine_config.json
+++ b/experiments/text_classification/germEval18Fine_config.json
@@ -39,7 +39,7 @@
     "balance_classes": {"value": null, "default": true, "desc": "Balance classes using weighted CrossEntropyLoss. Original train set from GermEval18 is skewed and the final evaluation is macro averaged, so we need to balance for optimal performance."},
 
     "epochs": {"value": null, "default": 3.0, "desc": "Total number of training epochs to perform."},
-    "batch_size": {"value": null, "default": 48, "desc": "Total batch size for training for single GPU v100. If using multiGPU, the total batch size will be automatically adjusted."},
+    "batch_size": {"value": null, "default": 48, "desc": "Total batch size for training on a single V100 GPU."},
     "gradient_accumulation_steps": {"value": null, "default": 1, "desc": "Number of update steps (batches) to accumulate before performing a backward/update pass."},
     "embeds_dropout_prob": {"value": null, "default": 0.1, "desc": "Strength of dropout to be applied to the word embeddings generated by the language model."},
     "layer_dims": {"value": null, "default": [768, 4], "desc": "Cannot do experiments on this value, since it is already a list. Dimensions of the prediction head. Needs to be of type String, otherwise it gets iterated over."}
diff --git a/experiments/text_classification/gnad_config.json b/experiments/text_classification/gnad_config.json
index 065eb7089..e3564cb4e 100644
--- a/experiments/text_classification/gnad_config.json
+++ b/experiments/text_classification/gnad_config.json
@@ -38,7 +38,7 @@
     "balance_classes": {"value": null, "default": false, "desc": "Balance classes using weighted CrossEntropyLoss."},
 
     "epochs": {"value": null, "default": 2, "desc": "Total number of training epochs to perform."},
-    "batch_size": {"value": null, "default": 8, "desc": "Total batch size for training for single GPU v100. Only low values possible because of large sequence length. If using multiGPU, the total batch size will be automatically adjusted."},
+    "batch_size": {"value": null, "default": 8, "desc": "Total batch size for training on a single V100 GPU. Only low values possible because of large sequence length."},
     "gradient_accumulation_steps": {"value": null, "default": 1, "desc": "Number of update steps (batches) to accumulate before performing a backward/update pass."},
     "embeds_dropout_prob": {"value": null, "default": 0.1, "desc": "Strength of dropout to be applied to the word embeddings generated by the language model."},
     "layer_dims": {"value": null, "default": [768, 9], "desc": "Cannot do experiments on this value, since it is already a list. Dimensions of the prediction head. Needs to be of type String, otherwise it gets iterated over."}
diff --git a/experiments/xlm_roberta_eval/conll2003_de_config.json b/experiments/xlm_roberta_eval/conll2003_de_config.json
index c1340a1f5..adfb22a4f 100644
--- a/experiments/xlm_roberta_eval/conll2003_de_config.json
+++ b/experiments/xlm_roberta_eval/conll2003_de_config.json
@@ -34,7 +34,7 @@
     "max_seq_len": {"value": null, "default": 128, "desc": "The maximum total input sequence length after WordPiece tokenization. Some GNAD texts even extend beyond 512 tokens."},
     "balance_classes": {"value": null, "default": false, "desc": "Balance classes using weighted CrossEntropyLoss."},
     "epochs": {"value": 4, "default": 5, "desc": "Total number of training epochs to perform."},
-    "batch_size": {"value": 32, "default": 32, "desc": "Total batch size for training on one V100 GPU. If using multiGPU, the total batch size will be automatically adjusted."},
+    "batch_size": {"value": 32, "default": 32, "desc": ""},
     "gradient_accumulation_steps": {"value": null, "default": 1, "desc": "Number of update steps (batches) to accumulate before performing a backward/update pass."},
     "embeds_dropout_prob": {"value": null, "default": 0.1, "desc": "Strength of dropout to be applied to the word embeddings generated by the language model."},
     "layer_dims": {"value": [1024, 15], "default": [768, 15], "desc": "Cannot do experiments on this value, since it is already a list. Dimensions of the prediction head. Needs to be of type String, otherwise it gets iterated over."}
diff --git a/experiments/xlm_roberta_eval/germEval14_config.json b/experiments/xlm_roberta_eval/germEval14_config.json
index 4989d93cc..bdb7af1bc 100644
--- a/experiments/xlm_roberta_eval/germEval14_config.json
+++ b/experiments/xlm_roberta_eval/germEval14_config.json
@@ -36,7 +36,7 @@
     "balance_classes": {"value": null, "default": false, "desc": "Balance classes using weighted CrossEntropyLoss."},
 
     "epochs": {"value": 5, "default": 4, "desc": "Total number of training epochs to perform."},
-    "batch_size": {"value": 8, "default": 64, "desc": "Total batch size for training on one V100 GPU. If using multiGPU, the total batch size will be automatically adjusted."},
+    "batch_size": {"value": 8, "default": 64, "desc": ""},
     "gradient_accumulation_steps": {"value": null, "default": 1, "desc": "Number of update steps (batches) to accumulate before performing a backward/update pass."},
     "embeds_dropout_prob": {"value": null, "default": 0.1, "desc": "Strength of dropout to be applied to the word embeddings generated by the language model."},
     "layer_dims": {"value": [1024, 15], "default": [768, 15], "desc": "Cannot do experiments on this value, since it is already a list. Dimensions of the prediction head. Needs to be of type String, otherwise it gets iterated over."}
diff --git a/experiments/xlm_roberta_eval/germEval18Coarse_config.json b/experiments/xlm_roberta_eval/germEval18Coarse_config.json
index 56891de36..d64a9663a 100644
--- a/experiments/xlm_roberta_eval/germEval18Coarse_config.json
+++ b/experiments/xlm_roberta_eval/germEval18Coarse_config.json
@@ -38,7 +38,7 @@
     "balance_classes": {"value": null, "default": true, "desc": "Balance classes using weighted CrossEntropyLoss. Original train set from GermEval18 is skewed and the final evaluation is macro averaged, so we need to balance for optimal performance."},
 
     "epochs": {"value": 10, "default": 2.0, "desc": "Total number of training epochs to perform."},
-    "batch_size": {"value": 8, "default": 48, "desc": "Total batch size for training for single GPU v100. If using multiGPU, the total batch size will be automatically adjusted."},
+    "batch_size": {"value": 8, "default": 48, "desc": ""},
     "gradient_accumulation_steps": {"value": null, "default": 1, "desc": "Number of update steps (batches) to accumulate before performing a backward/update pass."},
     "embeds_dropout_prob": {"value": null, "default": 0.1, "desc": "Strength of dropout to be applied to the word embeddings generated by the language model."},
     "layer_dims": {"value": [1024,2], "default": [768, 2], "desc": "Cannot do experiments on this value, since it is already a list. Dimensions of the prediction head. Needs to be of type String, otherwise it gets iterated over."}
diff --git a/farm/experiment.py b/farm/experiment.py
index af2d86ff5..7f699939f 100644
--- a/farm/experiment.py
+++ b/farm/experiment.py
@@ -23,8 +23,8 @@
 
 
 def load_experiments(file):
-    args = read_config(file, flattend=False)
-    experiments = unnestConfig(args, flattened=False)
+    args = read_config(file)
+    experiments = unnestConfig(args)
     return experiments
 
 
@@ -54,8 +54,7 @@ def run_experiment(args):
     args.parameter.batch_size = int(
         args.parameter.batch_size // args.parameter.gradient_accumulation_steps
     )
-    # if n_gpu > 1:
-    #     args.parameter.batch_size = args.parameter.batch_size * n_gpu
+
     set_all_seeds(args.general.seed)
 
     # Prepare Data
@@ -94,8 +93,8 @@
     )
 
     # Init optimizer
-    optimizer_opts = dict(args.optimizer.optimizer_opts) if args.optimizer.optimizer_opts else None
-    schedule_opts = dict(args.optimizer.schedule_opts) if args.optimizer.schedule_opts else None
+    optimizer_opts = args.optimizer.optimizer_opts.toDict() if args.optimizer.optimizer_opts else None
+    schedule_opts = args.optimizer.schedule_opts.toDict() if args.optimizer.schedule_opts else None
     model, optimizer, lr_schedule = initialize_optimizer(
         model=model,
         learning_rate=args.optimizer.learning_rate,
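The dict(...) -> .toDict() switch above appears to be the parameter-loading bugfix named in the commit message: args comes out of read_config as a nested DotMap, and a plain dict(...) call only converts the outermost mapping, so any nested options would still arrive downstream as DotMap instances, whereas DotMap.toDict() converts recursively. A rough illustration of the difference, assuming the dotmap package (the nested "extra" value is made up for the demo):

    from dotmap import DotMap

    opts = DotMap({"name": "LinearWarmup", "extra": {"warmup_proportion": 0.2}}, _dynamic=False)

    shallow = dict(opts)   # top level becomes a dict, but shallow["extra"] is still a DotMap
    deep = opts.toDict()   # plain dicts all the way down
    print(type(shallow["extra"]).__name__, type(deep["extra"]).__name__)  # DotMap dict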
diff --git a/farm/file_utils.py b/farm/file_utils.py
index 03841c52e..11b22ac85 100644
--- a/farm/file_utils.py
+++ b/farm/file_utils.py
@@ -282,82 +282,54 @@ def get_file_extension(path, dot=True, lower=True):
     return ext.lower() if lower else ext
 
 
-def read_config(path, flattend=False):
+def read_config(path):
     if path:
         with open(path) as json_data_file:
             conf_args = json.load(json_data_file)
     else:
         raise ValueError("No config provided for classifier")
 
-    def getArgValue(arg):
-        if "value" not in arg:
-            logger.error(
-                "Only depth 2 config files supported. Failed to convert: %s" % str(arg)
-            )
-        return arg["value"] if (arg["value"] is not None) else arg["default"]
-
     # flatten last part of config, take either value or default as value
     for gk, gv in conf_args.items():
         for k, v in gv.items():
-            if isinstance(getArgValue(v), dict):
-                logger.error("Config is too deeply nested, at %s" % str(v))
-            conf_args[gk][k] = getArgValue(v)
+            conf_args[gk][k] = v["value"] if (v["value"] is not None) else v["default"]
 
     # DotMap for making nested dictionary accessible through dot notation
-    flat_args = dict(
-        conf_args["general"],
-        **conf_args["task"],
-        **conf_args["parameter"],
-        **conf_args["logging"],
-    )
-    if flattend:
-        args = DotMap(flat_args, _dynamic=False)
-    else:
-        args = DotMap(conf_args, _dynamic=False)
+    args = DotMap(conf_args, _dynamic=False)
 
     return args
 
 
-def unnestConfig(config, flattened=False):
+def unnestConfig(config):
     """
     This function creates a list of config files for evaluating parameters with different values. If a config
     parameter is of type list this list is iterated over and a config object without lists is returned.
     Can handle lists inside any number of parameters.
-    Can handle shallow or nested (one level) configs
+    Can handle nested (one level) configs
     """
     nestedKeys = []
     nestedVals = []
-    if flattened:
-        for k, v in config.items():
-            if isinstance(v, list):
-                if k != "layer_dims":  # exclude layer dims, since it is already a list
-                    nestedKeys.append(k)
-                    nestedVals.append(v)
-    else:
-        for gk, gv in config.items():
-            if(gk != "task"):
-                for k, v in gv.items():
-                    if isinstance(v, list):
-                        if isinstance(v, list):
-                            if (
-                                k != "layer_dims"
-                            ):  # exclude layer dims, since it is already a list
-                                nestedKeys.append([gk, k])
-                                nestedVals.append(v)
-                    elif isinstance(v, dict):
-                        logger.error("Config too deep!")
+
+    for gk, gv in config.items():
+        if(gk != "task"):
+            for k, v in gv.items():
+                if isinstance(v, list):
+                    if (
+                        k != "layer_dims"
+                    ):  # exclude layer dims, since it is already a list
+                        nestedKeys.append([gk, k])
+                        nestedVals.append(v)
+                elif isinstance(v, dict):
+                    logger.warning("Config too deep! Working on %s" %(str(v)))
 
     if len(nestedKeys) == 0:
         unnestedConfig = [config]
     else:
-        if flattened:
-            logger.info("Nested config at parameters: %s" % (", ".join(nestedKeys)))
-        else:
-            logger.info(
-                "Nested config at parameters: %s"
-                % (", ".join(".".join(x) for x in nestedKeys))
-            )
+        logger.info(
+            "Nested config at parameters: %s"
+            % (", ".join(".".join(x) for x in nestedKeys))
+        )
         unnestedConfig = []
         mesh = np.meshgrid(
             *nestedVals
@@ -376,7 +348,7 @@
         elif len(k) == 2:
             tempconfig[k[0]][k[1]] = mesh[j][i]  # set nested dictionary keys
         else:
-            logger.error("Config too deep!")
+            logger.warning("Config too deep! Working on %s" %(str(k)))
         unnestedConfig.append(tempconfig)
 
     return unnestedConfig
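To make unnestConfig's grid expansion concrete: every list-valued parameter (except layer_dims) contributes one axis to an np.meshgrid cross-product, and one flat config is emitted per combination. A small sketch of just that expansion step, with hypothetical values (not a test from the repo):

    import numpy as np

    # e.g. optimizer.learning_rate = [2e-5, 5e-5] and parameter.epochs = [2, 3, 4]
    nestedVals = [[2e-5, 5e-5], [2, 3, 4]]
    mesh = [m.flatten() for m in np.meshgrid(*nestedVals)]
    combos = list(zip(*mesh))
    # combos -> [(2e-05, 2), (5e-05, 2), (2e-05, 3), (5e-05, 3), (2e-05, 4), (5e-05, 4)]
    # unnestConfig writes each combination back into a copy of the config,
    # yielding six runnable experiment configs from one file.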
From 03ceab0bb00864a0f181ca11d26ac7fbee9761e8 Mon Sep 17 00:00:00 2001
From: timoeller
Date: Tue, 18 Feb 2020 15:03:56 +0100
Subject: [PATCH 2/2] Adjust configs

---
 experiments/german-bert2.0-eval/germEval18Coarse_config.json | 2 +-
 experiments/german-bert2.0-eval/germEval18Fine_config.json   | 4 ++--
 experiments/text_classification/germEval18Coarse_config.json | 2 +-
 experiments/text_classification/germEval18Fine_config.json   | 4 ++--
 4 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/experiments/german-bert2.0-eval/germEval18Coarse_config.json b/experiments/german-bert2.0-eval/germEval18Coarse_config.json
index daa0e465b..8be6befac 100644
--- a/experiments/german-bert2.0-eval/germEval18Coarse_config.json
+++ b/experiments/german-bert2.0-eval/germEval18Coarse_config.json
@@ -49,7 +49,7 @@
     "schedule_opts": {"value": null, "default": {"name": "LinearWarmup", "warmup_proportion": 0.2}, "desc": "opts for lr schedule"}
   },
   "logging": {
-    "eval_every": {"value": null, "default": 30, "desc": "Steps per training loop (batches) required for evaluation on dev set. Set to 0 when you do not want to do evaluation on dev set during training."},
+    "eval_every": {"value": null, "default": 50, "desc": "Steps per training loop (batches) required for evaluation on dev set. Set to 0 when you do not want to do evaluation on dev set during training."},
     "mlflow_url": {"value": "https://public-mlflow.deepset.ai/", "default": null, "desc": "Mlflow server for tracking experiments (e.g. http://80.123.45.167:5000/)"},
     "mlflow_nested": {"value": null, "default": true, "desc": "Nesting mlflow experiments. For doing multiple runs across a set of hyperparameters."},
 
diff --git a/experiments/german-bert2.0-eval/germEval18Fine_config.json b/experiments/german-bert2.0-eval/germEval18Fine_config.json
index 213b61cc9..719da244b 100644
--- a/experiments/german-bert2.0-eval/germEval18Fine_config.json
+++ b/experiments/german-bert2.0-eval/germEval18Fine_config.json
@@ -45,12 +45,12 @@
     "layer_dims": {"value": null, "default": [768, 4], "desc": "Cannot do experiments on this value, since it is already a list. Dimensions of the prediction head. Needs to be of type String, otherwise it gets iterated over."}
   },
   "optimizer": {
-    "learning_rate": {"value": null, "default": 5e-5, "desc": "The initial learning rate for AdamW."},
+    "learning_rate": {"value": null, "default": 2e-5, "desc": "The initial learning rate for AdamW."},
     "optimizer_opts": {"value": null, "default": null, "desc": "Additional optimizer config."},
     "schedule_opts": {"value": null, "default": {"name": "LinearWarmup", "warmup_proportion": 0.2}, "desc": "opts for lr schedule"}
   },
   "logging": {
-    "eval_every": {"value": null, "default": 30, "desc": "Steps per training loop (batches) required for evaluation on dev set. Set to 0 when you do not want to do evaluation on dev set during training."},
+    "eval_every": {"value": null, "default": 50, "desc": "Steps per training loop (batches) required for evaluation on dev set. Set to 0 when you do not want to do evaluation on dev set during training."},
     "mlflow_url": {"value": "https://public-mlflow.deepset.ai/", "default": null, "desc": "Mlflow server for tracking experiments (e.g. http://80.123.45.167:5000/)"},
     "mlflow_nested": {"value": null, "default": true, "desc": "Nesting mlflow experiments. For doing multiple runs across a set of hyperparameters."},
 
diff --git a/experiments/text_classification/germEval18Coarse_config.json b/experiments/text_classification/germEval18Coarse_config.json
index daa0e465b..8be6befac 100644
--- a/experiments/text_classification/germEval18Coarse_config.json
+++ b/experiments/text_classification/germEval18Coarse_config.json
@@ -49,7 +49,7 @@
     "schedule_opts": {"value": null, "default": {"name": "LinearWarmup", "warmup_proportion": 0.2}, "desc": "opts for lr schedule"}
   },
   "logging": {
-    "eval_every": {"value": null, "default": 30, "desc": "Steps per training loop (batches) required for evaluation on dev set. Set to 0 when you do not want to do evaluation on dev set during training."},
+    "eval_every": {"value": null, "default": 50, "desc": "Steps per training loop (batches) required for evaluation on dev set. Set to 0 when you do not want to do evaluation on dev set during training."},
     "mlflow_url": {"value": "https://public-mlflow.deepset.ai/", "default": null, "desc": "Mlflow server for tracking experiments (e.g. http://80.123.45.167:5000/)"},
     "mlflow_nested": {"value": null, "default": true, "desc": "Nesting mlflow experiments. For doing multiple runs across a set of hyperparameters."},
 
diff --git a/experiments/text_classification/germEval18Fine_config.json b/experiments/text_classification/germEval18Fine_config.json
index 213b61cc9..719da244b 100644
--- a/experiments/text_classification/germEval18Fine_config.json
+++ b/experiments/text_classification/germEval18Fine_config.json
@@ -45,12 +45,12 @@
     "layer_dims": {"value": null, "default": [768, 4], "desc": "Cannot do experiments on this value, since it is already a list. Dimensions of the prediction head. Needs to be of type String, otherwise it gets iterated over."}
   },
   "optimizer": {
-    "learning_rate": {"value": null, "default": 5e-5, "desc": "The initial learning rate for AdamW."},
+    "learning_rate": {"value": null, "default": 2e-5, "desc": "The initial learning rate for AdamW."},
     "optimizer_opts": {"value": null, "default": null, "desc": "Additional optimizer config."},
     "schedule_opts": {"value": null, "default": {"name": "LinearWarmup", "warmup_proportion": 0.2}, "desc": "opts for lr schedule"}
   },
   "logging": {
-    "eval_every": {"value": null, "default": 30, "desc": "Steps per training loop (batches) required for evaluation on dev set. Set to 0 when you do not want to do evaluation on dev set during training."},
+    "eval_every": {"value": null, "default": 50, "desc": "Steps per training loop (batches) required for evaluation on dev set. Set to 0 when you do not want to do evaluation on dev set during training."},
     "mlflow_url": {"value": "https://public-mlflow.deepset.ai/", "default": null, "desc": "Mlflow server for tracking experiments (e.g. http://80.123.45.167:5000/)"},
     "mlflow_nested": {"value": null, "default": true, "desc": "Nesting mlflow experiments. For doing multiple runs across a set of hyperparameters."},