IBM · vmarois · Nov 13, 2018 · Nov 6, 2018 · Nov 7, 2018 · Nov 7, 2018
diff --git a/README.md b/README.md
@@ -55,6 +55,7 @@ The dependencies of MI-prometheus are:
    * torchtext
    * tensorboardx
    * matplotlib 
+   * psutil (enables grid-* to spawn child processes on MacOS and Ubuntu)
    * PyYAML
    * tqdm
    * nltk

diff --git a/configs/example_trainer_gpu.yaml b/configs/example_trainer_gpu.yaml
diff --git a/configs/vision/alexnet_mnist.yaml b/configs/vision/alexnet_mnist.yaml
@@ -17,23 +17,23 @@ training:
         lr: 0.01
     # settings parameters
     terminal_conditions:
-        loss_stop: 1.0e-5
+        loss_stop: 1.0e-3
         episode_limit: 50000
         epochs_limit: 10
 
 # Problem parameters:
 validation:
     problem:
         name: *name
-        batch_size: 64
+        batch_size: *b
         use_train_data: True  # True because we are splitting the training set to: validation and training
         resize: [224, 224]
 
 # Problem parameters:
 testing:
     problem:
         name: *name
-        batch_size: 64
+        batch_size: *b
         use_train_data: False
         resize: [224, 224]
 

diff --git a/configs/vision/grid_trainer_mnist.yaml b/configs/vision/grid_trainer_mnist.yaml
@@ -0,0 +1,43 @@
+grid_tasks:
+    -
+        default_configs: configs/vision/lenet5_mnist.yaml
+    -
+        default_configs: configs/vision/simplecnn_mnist.yaml
+
+# Set exactly the same experiment conditions for the 2 tasks.
+grid_overwrite:
+    training:
+        problem:
+            batch_size: &b 1000
+        sampler:
+            name: SubsetRandomSampler
+            indices: [0, 55000]
+        # Set the same optimizer parameters.
+        optimizer:
+            name: Adam
+            lr: 0.01
+        # Set the same terminal conditions.
+        terminal_conditions:
+            loss_stop: 4.0e-2
+            episode_limit: 10000
+            epoch_limit: 10
+
+    # Problem parameters:
+    validation:
+        problem:
+            batch_size: *b
+        sampler:
+            name: SubsetRandomSampler
+            indices: [55000, 60000]
+
+    testing:
+        problem:
+            batch_size: *b
+
+grid_settings:
+    # Set number of repetitions of each experiment.
+    experiment_repetitions: 5
+    # Set number of concurrently running experiments.
+    max_concurrent_runs: 4
+    # Set trainer.
+    trainer: mip-online-trainer
diff --git a/configs/vision/lenet5_mnist.yaml b/configs/vision/lenet5_mnist.yaml
@@ -2,9 +2,9 @@
 training:
   problem:
     name: &name MNIST
-    batch_size: 64
+    batch_size: &b 64
     use_train_data: True
-    mnist_folder: &folder '~/data/mnist'
+    data_folder: &folder '~/data/mnist'
     resize: [32, 32]
   # Use sampler that operates on a subset.
   sampler:
@@ -15,19 +15,19 @@ training:
     name: Adam
     lr: 0.01
   # settings parameters
-  #terminal_condition:
-  #  loss_stop: 1.0e-5
-  #  episode_limit: 10000
-  #  epoch_limit: 10
+  terminal_conditions:
+    loss_stop: 1.0e-2
+    episode_limit: 10000
+    epoch_limit: 10
 
 # Validation parameters:
 validation:
-  partial_validation_interval: 100
+  #partial_validation_interval: 100
   problem:
     name: *name 
-    batch_size: 64
-    use_train_data: True
-    mnist_folder: *folder
+    batch_size: *b
+    use_train_data: True  # True because we are splitting the training set to: validation and training
+    data_folder: *folder
     resize: [32, 32]
   # Use sampler that operates on a subset.
   sampler:
@@ -38,9 +38,9 @@ validation:
 testing:
   problem:
     name: *name
-    batch_size: 10000
+    batch_size: *b
     use_train_data: False
-    mnist_folder: *folder
+    data_folder: *folder
     resize: [32, 32]
 
 # Model parameters:

diff --git a/configs/vision/simplecnn_mnist.yaml b/configs/vision/simplecnn_mnist.yaml
@@ -5,7 +5,7 @@ training:
     problem:
         name: &name MNIST
         batch_size: &b 64
-        mnist_folder: &folder '~/data/mnist'
+        data_folder: &folder '~/data/mnist'
         use_train_data: True
         resize: [32, 32]
     sampler:
@@ -18,16 +18,16 @@ training:
         lr: 0.01
     # settings parameters
     terminal_conditions:
-        loss_stop: 1.0e-5
-        episode_limit: 20000
+        loss_stop: 1.0e-3
+        episode_limit: 1000
         epoch_limit: 1
 
 # Problem parameters:
 validation:
     problem:
         name: *name
-        batch_size: 5000
-        mnist_folder: *folder
+        batch_size: *b
+        data_folder: *folder
         use_train_data: True  # True because we are splitting the training set to: validation and training
         resize: [32, 32]
     sampler:
@@ -43,8 +43,8 @@ testing:
     #seed_torch: 2452
     problem:
         name: *name
-        batch_size: 10000
-        mnist_folder: *folder
+        batch_size: *b
+        data_folder: *folder
         use_train_data: False
         resize: [32, 32]
 

diff --git a/doc_build.sh b/doc_build.sh
@@ -8,4 +8,7 @@ sphinx-build -b html source build
 make html
 
 # open web browser(s) to master table of content
-firefox build/index.html
+if which firefox
+then
+    firefox build/index.html
+fi