This repository has been archived by the owner on Nov 16, 2023. It is now read-only.

PR of recent changes to master #100

Open · wants to merge 1 commit into base: mabou/instrument
2 changes: 2 additions & 0 deletions .ci/azure-pipelines-v2.yml
@@ -12,12 +12,14 @@ resources:
trigger:
branches:
include:
- master
- mabou/instrument

pr:
autoCancel: true
branches:
include:
- master
- mabou/instrument

stages:
24 changes: 22 additions & 2 deletions 01_Training_Script.ipynb
@@ -440,7 +440,27 @@
"metadata": {},
"source": [
"## Run the script to see that it works <a id='run'></a>\n",
"This should take around ten minutes."
"Set the effort expended to train the classifier."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"tags": [
"parameters"
]
},
"outputs": [],
"source": [
"estimators = 1000"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Run the classifier script. This should take about 10 minutes."
]
},
{
@@ -451,7 +471,7 @@
},
"outputs": [],
"source": [
"%run -t scripts/TrainClassifier.py --estimators 1000 --match 5 --ngrams 2 --min_child_samples 10 --save model"
"%run -t scripts/TrainClassifier.py --estimators $estimators --match 5 --ngrams 2 --min_child_samples 10 --save model"
]
},
{
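The cell tagged `parameters` above is papermill's hook: at execution time papermill injects a new cell directly after the tagged one, so an injected value overrides `estimators = 1000` for the rest of the run. A minimal sketch of the same override through papermill's Python API, equivalent to the `-p estimators 1` flags added to azure-pipelines.yml below:

import papermill as pm

# Injects `estimators = 1` in a cell placed just after the "parameters"-tagged
# cell, overriding the notebook default of 1000 for this execution only.
pm.execute_notebook(
    "01_Training_Script.ipynb",
    "01_Training_Script_Output.ipynb",
    parameters={"estimators": 1},
    kernel_name="python3",
)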
13 changes: 7 additions & 6 deletions 03_Run_Locally.ipynb
@@ -39,7 +39,7 @@
"metadata": {},
"source": [
"## Azure subscription <a id='subscription'></a>\n",
"If you have multiple subscriptions select the subscription you want to use. You may supply either the subscription's name or the subscription's ID. If you want to run this in a different location that supports HyperDrive, you may enter the one you want to use. You can also set the name of the resource group in which this tutorial will add resources. *IMPORTANT NOTE:* The last notebook in this example will delete this resource group and all associated resources."
"If you have multiple subscriptions select the subscription you want to use. You may supply either the subscription's name or the subscription's ID. If you want to run this in a different location that supports HyperDrive, you may enter the one you want to use. You can also set the name of the resource group in which this tutorial will add resources. *IMPORTANT NOTE:* The last notebook in this example will delete this resource group and all associated resources. We also define the number of estimators to use for the local run."
]
},
{
@@ -55,7 +55,8 @@
"subscription_name=\"YOUR_SUBSCRIPTION_NAME\"\n",
"subscription_id=\"YOUR_SUBSCRIPTION_ID\"\n",
"location=\"eastus\"\n",
"resource_group=\"hypetuning\""
"resource_group=\"hypetuning\"\n",
"estimators = 1000"
]
},
{
@@ -179,10 +180,10 @@
"est = Estimator(source_directory=os.path.join('.', 'scripts'), \n",
" entry_script='TrainClassifier.py',\n",
" script_params={'--data-folder': os.path.abspath('.'),\n",
" '--estimators': '1000',\n",
" '--match': '5',\n",
" '--ngrams': '2',\n",
" '--min_child_samples': '10',\n",
" '--estimators': estimators,\n",
" '--match': 5,\n",
" '--ngrams': 2,\n",
" '--min_child_samples': 10,\n",
" \"--save\": \"local_model\"},\n",
" compute_target='local',\n",
" conda_packages=['pandas==0.23.4',\n",
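Aside: the `script_params` values no longer need to be pre-stringified here; the AML SDK renders script arguments as strings when it builds the command line, so the cell can pass plain ints along with the shared `estimators` variable.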
7 changes: 4 additions & 3 deletions 04_Hyperparameter_Random_Search.ipynb
@@ -202,7 +202,7 @@
"cell_type": "markdown",
"metadata": {},
"source": [
"This hyperparameter space specifies a grid of 9,360 unique configuration points (4 `ngrams` X 39 `match` X 30 `min_child_samples` X 2 `unweighted`). We control the resources used by the search through specifying a maximum number of configuration points to sample as `max_total_runs`."
"This hyperparameter space specifies a grid of 9,360 unique configuration points (4 `ngrams` X 39 `match` X 30 `min_child_samples` X 2 `unweighted`). We control the resources used by the search through specifying a maximum number of configuration points to sample as `max_total_runs`. We also define the number of estimators to use for each run."
]
},
{
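A quick check of the 9,360 figure, with the per-hyperparameter value counts taken from the sentence above:

# Grid size = product of the number of values tried per hyperparameter.
ngrams_count = 4
match_count = 39
min_child_samples_count = 30
unweighted_count = 2
assert ngrams_count * match_count * min_child_samples_count * unweighted_count == 9360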
@@ -215,7 +215,8 @@
},
"outputs": [],
"source": [
"max_total_runs = 96"
"max_total_runs = 96\n",
"estimators = 1000"
]
},
{
@@ -270,7 +271,7 @@
"estimator = Estimator(source_directory=os.path.join('.', 'scripts'),\n",
" entry_script='TrainClassifier.py',\n",
" script_params={'--data-folder': ds.as_mount(),\n",
" '--estimators': 1000},\n",
" '--estimators': estimators},\n",
" compute_target=compute_target,\n",
" conda_packages=['pandas==0.23.4',\n",
" 'scikit-learn==0.21.3',\n",
6 changes: 3 additions & 3 deletions 05_Train_Best_Model.ipynb
@@ -166,8 +166,8 @@
},
"outputs": [],
"source": [
"model_estimators = 8 * int(best_parameters['--estimators'])\n",
"model_estimators"
"estimators = 8 * int(best_parameters['--estimators'])\n",
"estimators"
]
},
{
@@ -186,7 +186,7 @@
"ds = ws.get_default_datastore()\n",
"model_parameters = best_parameters.copy()\n",
"model_parameters['--data-folder'] = ds.as_mount()\n",
"model_parameters['--estimators'] = model_estimators\n",
"model_parameters['--estimators'] = estimators\n",
"model_parameters['--save'] = 'FAQ_ranker'\n",
"pd.Series(model_parameters, name='Value').to_frame()"
]
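Aside: renaming `model_estimators` to `estimators` matches the variable name that the CI step for this notebook overrides with `-p estimators 1` in azure-pipelines.yml below; papermill matches injected parameters to notebook variables by name.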
31 changes: 9 additions & 22 deletions 07_Train_With_AML_Pipeline.ipynb
@@ -297,7 +297,7 @@
"metadata": {},
"source": [
"## Create AML Pipeline Tuning Step <a id='aml_pipeline_tune_step'></a>\n",
"We create a HyperDrive step in the AML pipeline to perform a search for hyperparameters. The `tune_estimators` pipeline parameter that controls the number of estimators used in tuning deliberately has a low default value for the speed of pipeline testing. The `tune_steps_data` output pipeline data is only used to synchronize with the next pipeline step."
"We create a HyperDrive step in the AML pipeline to perform a search for hyperparameters. The `tune_estimators` pipeline parameter that controls the number of estimators used in tuning deliberately has a low default value for the speed of pipeline testing."
]
},
{
@@ -307,15 +307,13 @@
"outputs": [],
"source": [
"tune_step_name=\"tune_model\"\n",
"tune_steps_data = PipelineData(\"tune_steps_data\", datastore=ds)\n",
"tune_estimators = PipelineParameter(name=\"tune_estimators\", default_value=1) # Set to 1000 when running the pipeline.\n",
"tune_step = HyperDriveStep(\n",
" name=tune_step_name,\n",
" hyperdrive_config=hyperdrive_run_config,\n",
" estimator_entry_script_arguments=[\"--data-folder\", data_folder,\n",
" \"--estimators\", tune_estimators],\n",
" inputs=[data_folder],\n",
" outputs=[tune_steps_data],\n",
" allow_reuse=False)"
]
},
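Per the comment above (and the matching one in the best-model step below), the small defaults are meant to be overridden at submission time. A sketch under the assumption that `exp` and `pipeline` are the experiment and pipeline objects this notebook constructs:

# Hypothetical submission call: raise the deliberately small test defaults
# (tune_estimators 1 -> 1000, best_estimators 1 -> 8000) for a real run.
run = exp.submit(pipeline,
                 pipeline_parameters={"tune_estimators": 1000,
                                      "best_estimators": 8000})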
@@ -409,7 +407,7 @@
"cell_type": "markdown",
"metadata": {},
"source": [
"Creating PythonScript Step for AML pipeline to get the best run's hyperparameters. The `tune_steps_data` input pipeline data is only used to synchronize with the previous pipeline step."
"Creating PythonScript Step for AML pipeline to get the best run's hyperparameters."
]
},
{
@@ -433,18 +431,18 @@
" arguments=[\"--hd-step\", tune_step_name,\n",
" \"--output-steps-data\", bh_steps_data,\n",
" \"--hyperparameters\", bh_hyperparameters_file],\n",
" inputs=[tune_steps_data],\n",
" outputs=[bh_steps_data],\n",
" runconfig=bh_run_config,\n",
" allow_reuse=False)"
" allow_reuse=False)\n",
"bh_step.run_after(tune_step)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Create AML Pipeline Best Model Step <a id='aml_pipeline_estimator_step'></a>\n",
"This step passes the hyperparameters file from the previous step to the training script to create the best model. The `best_estimators` pipeline parameter that controls the number of estimators used in getting the best model deliberately has a low default value for the speed of pipeline testing. The `bm_steps_data` output pipeline data is only used to synchronize with the next pipeline step."
"This step passes the hyperparameters file from the previous step to the training script to create the best model. The `best_estimators` pipeline parameter that controls the number of estimators used in getting the best model deliberately has a low default value for the speed of pipeline testing."
]
},
{
@@ -454,7 +452,6 @@
"outputs": [],
"source": [
"bm_step_name=\"best_model\"\n",
"bm_steps_data = PipelineData(\"bm_steps_data\", datastore=ds)\n",
"bm_estimators = PipelineParameter(name=\"best_estimators\", default_value=1) # Set to 8000 when running the pipeline\n",
"bm_estimator = Estimator(source_directory=os.path.join('.', 'scripts'), # Use a new Estimator as a bug workaround\n",
" entry_script='TrainClassifier.py',\n",
@@ -472,7 +469,6 @@
" \"--save\", model_name],\n",
" compute_target=compute_target,\n",
" inputs=[data_folder, bh_steps_data],\n",
" outputs=[bm_steps_data],\n",
" allow_reuse=False)"
]
},
@@ -537,7 +533,7 @@
"cell_type": "markdown",
"metadata": {},
"source": [
"Creating PythonScript Step for AML pipeline to register the best model. The `bm_steps_data` input pipeline data is only used to synchronize with the previous pipeline step."
"Creating PythonScript Step for AML pipeline to register the best model."
]
},
{
@@ -559,9 +555,9 @@
" arguments=[\"--es-step\", bm_step_name,\n",
" \"--outputs\", \"outputs\",\n",
" \"--model-name\", model_name],\n",
" inputs=[bm_steps_data],\n",
" runconfig=rm_run_config,\n",
" allow_reuse=False)"
" allow_reuse=False)\n",
"rm_step.run_after(bm_step)"
]
},
{
@@ -800,16 +796,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.3"
},
"pycharm": {
"stem_cell": {
"cell_type": "raw",
"source": [],
"metadata": {
"collapsed": false
}
}
"version": "3.6.7"
}
},
"nbformat": 4,
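The two explicit `run_after` calls added in this file (`bh_step.run_after(tune_step)` and `rm_step.run_after(bm_step)`) replace the dummy `tune_steps_data` and `bm_steps_data` links that existed only to force step ordering. A minimal sketch of the pattern, assuming the azureml-sdk pipeline API; `ws` and `compute_target` stand in for the notebook's own objects, and the step scripts are hypothetical:

from azureml.core.runconfig import RunConfiguration
from azureml.pipeline.core import Pipeline
from azureml.pipeline.steps import PythonScriptStep

run_config = RunConfiguration()
first = PythonScriptStep(name="first", script_name="first.py",
                         source_directory="scripts", compute_target=compute_target,
                         runconfig=run_config, allow_reuse=False)
second = PythonScriptStep(name="second", script_name="second.py",
                          source_directory="scripts", compute_target=compute_target,
                          runconfig=run_config, allow_reuse=False)
second.run_after(first)  # ordering dependency without any PipelineData plumbing
pipeline = Pipeline(workspace=ws, steps=[second])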
9 changes: 5 additions & 4 deletions azure-pipelines.yml
@@ -8,6 +8,7 @@ trigger:

variables:
- group: AzureKeyVault
# estimators: 1

jobs:
- job: MLHyperparameterTuningJob
@@ -44,7 +45,7 @@
source /usr/share/miniconda/etc/profile.d/conda.sh
conda activate MLHyperparameterTuning
echo Executing 01_Training_Script.ipynb
papermill 01_Training_Script.ipynb 01_Training_Script_Output.ipynb --log-output --no-progress-bar -k python3
papermill 01_Training_Script.ipynb 01_Training_Script_Output.ipynb --log-output --no-progress-bar -k python3 -p estimators 1
displayName: '01_Training_Script.ipynb'

- bash: |
@@ -58,21 +59,21 @@
source /usr/share/miniconda/etc/profile.d/conda.sh
conda activate MLHyperparameterTuning
echo Executing 03_Run_Locally.ipynb
papermill 03_Run_Locally.ipynb 03_Run_Locally_Output.ipynb --log-output --no-progress-bar -k python3 -p subscription_id $(subscriptionid) -p resource_group $(azurergname)
papermill 03_Run_Locally.ipynb 03_Run_Locally_Output.ipynb --log-output --no-progress-bar -k python3 -p subscription_id $(subscriptionid) -p resource_group $(azurergname) -p estimators 1
displayName: '03_Run_Locally.ipynb'

- bash: |
source /usr/share/miniconda/etc/profile.d/conda.sh
conda activate MLHyperparameterTuning
echo Executing 04_Hyperparameter_Random_Search.ipynb
papermill 04_Hyperparameter_Random_Search.ipynb 04_Hyperparameter_Random_Search_Output.ipynb --log-output --no-progress-bar -k python3 -p max_total_runs $(dsmaxruns)
papermill 04_Hyperparameter_Random_Search.ipynb 04_Hyperparameter_Random_Search_Output.ipynb --log-output --no-progress-bar -k python3 -p max_total_runs $(dsmaxruns) -p estimators 1
displayName: '04_Hyperparameter_Random_Search.ipynb'

- bash: |
source /usr/share/miniconda/etc/profile.d/conda.sh
conda activate MLHyperparameterTuning
echo Executing 05_Train_Best_Model.ipynb
papermill 05_Train_Best_Model.ipynb 05_Train_Best_Model_Output.ipynb --log-output --no-progress-bar -k python3
papermill 05_Train_Best_Model.ipynb 05_Train_Best_Model_Output.ipynb --log-output --no-progress-bar -k python3 -p estimators 1
displayName: '05_Train_Best_Model.ipynb'

- bash: |
4 changes: 2 additions & 2 deletions environment.yml
@@ -11,6 +11,6 @@ dependencies:
- lightgbm==2.2.1
- pip:
- prompt_toolkit==2.0.9
- azure-cli==2.0.75
- azureml-sdk[notebooks]==1.0.69
- azure-cli==2.0.77
- azureml-sdk[notebooks]==1.0.76
- git+https://github.com/Microsoft/StatisticsTracker