diff --git a/.gitignore b/.gitignore index e0a583ec..3ab04e2f 100644 --- a/.gitignore +++ b/.gitignore @@ -93,6 +93,7 @@ ENV/ env.bak/ venv.bak/ *.vscode +condaenv.* # Spyder project settings .spyderproject diff --git a/diabetes_regression/ci_dependencies.yml b/diabetes_regression/ci_dependencies.yml new file mode 100644 index 00000000..a61731c4 --- /dev/null +++ b/diabetes_regression/ci_dependencies.yml @@ -0,0 +1,28 @@ +name: mlopspython_ci + +dependencies: + + # The python interpreter version. +- python=3.7.5 + +- r=3.6.0 +- r-essentials=3.6.0 +- numpy=1.18.1 +- pandas=1.0.0 +- scikit-learn=0.22.1 + +- pip=20.0.2 +- pip: + + # dependencies shared with other environment .yml files. + - azureml-sdk==1.0.79 + + # Additional pip dependencies for the CI environment. + - pytest==5.3.1 + - pytest-cov==2.8.1 + - requests==2.22.0 + - python-dotenv==0.10.3 + - flake8==3.7.9 + - flake8_formatter_junit_xml==0.0.6 + - azure-cli==2.0.77 + - tox==3.14.3 diff --git a/diabetes_regression/scoring/inference_config.yml b/diabetes_regression/scoring/inference_config.yml index 3f65cf33..42947da8 100644 --- a/diabetes_regression/scoring/inference_config.yml +++ b/diabetes_regression/scoring/inference_config.yml @@ -1,9 +1,9 @@ entryScript: score.py runtime: python -condaFile: conda_dependencies.yml +condaFile: ../scoring_dependencies.yml extraDockerfileSteps: schemaFile: sourceDirectory: enableGpu: False baseImage: -baseImageRegistry: \ No newline at end of file +baseImageRegistry: diff --git a/diabetes_regression/scoring/conda_dependencies.yml b/diabetes_regression/scoring_dependencies.yml similarity index 71% rename from diabetes_regression/scoring/conda_dependencies.yml rename to diabetes_regression/scoring_dependencies.yml index c97a2722..26ce3622 100644 --- a/diabetes_regression/scoring/conda_dependencies.yml +++ b/diabetes_regression/scoring_dependencies.yml @@ -14,24 +14,23 @@ # This directive is stored in a comment to preserve the Conda file structure. 
# [AzureMlVersion] = 2 -name: project_environment +name: diabetes_scoring + dependencies: + # The python interpreter version. - # Currently Azure ML Workbench only supports 3.5.2 and later. - python=3.7.5 + # Required by azureml-defaults, installed separately through Conda to # get a prebuilt version and not require build tools for the install. - psutil=5.6 #latest +- numpy=1.18.1 +- pandas=1.0.0 +- scikit-learn=0.22.1 + +- pip=20.0.2 - pip: - # Required packages for AzureML execution, history, and data preparation. - - azureml-model-management-sdk==1.0.1b6.post1 - - azureml-sdk==1.0.74 - - scipy==1.3.1 - - scikit-learn==0.22 - - pandas==0.25.3 - - numpy==1.17.3 - - joblib==0.14.0 - - gunicorn==19.9.0 - - flask==1.1.1 - - inference-schema[numpy-support] + # You must list azureml-defaults as a pip dependency + - azureml-defaults==1.0.85 + - inference-schema[numpy-support]==1.0.1 diff --git a/diabetes_regression/training_dependencies.yml b/diabetes_regression/training_dependencies.yml new file mode 100644 index 00000000..4d7a42a7 --- /dev/null +++ b/diabetes_regression/training_dependencies.yml @@ -0,0 +1,17 @@ +name: diabetes_training + +dependencies: + + # The python interpreter version. +- python=3.7.5 + +- numpy=1.18.1 +- pandas=1.0.0 +- scikit-learn=0.22.1 +#- r-essentials +#- tensorflow +#- keras + +- pip=20.0.2 +- pip: + - azureml-core==1.0.79 diff --git a/docs/code_description.md b/docs/code_description.md index 5f323ad6..d69a6f30 100644 --- a/docs/code_description.md +++ b/docs/code_description.md @@ -2,9 +2,7 @@ ### Environment Setup -- `environment_setup/requirements.txt` : It consists of a list of python packages which are needed by the train.py to run successfully on host agent (locally). - -- `environment_setup/install_requirements.sh` : This script prepares the python environment i.e. 
install the Azure ML SDK and the packages specified in requirements.txt +- `environment_setup/install_requirements.sh` : This script prepares a local conda environment, i.e. it installs the Azure ML SDK and the packages specified in the environment definitions. - `environment_setup/iac-*.yml, arm-templates` : Infrastructure as Code piplines to create and delete required resources along with corresponding arm-templates. @@ -27,6 +25,12 @@ - `ml_service/pipelines/diabetes_regression_verify_train_pipeline.py` : determines whether the evaluate_model.py step of the training pipeline registered a new model. - `ml_service/util` : contains common utility functions used to build and publish an ML training pipeline. +### Environment Definitions + +- `diabetes_regression/training_dependencies.yml` : Conda environment definition for the training environment (Docker image in which train.py is run). +- `diabetes_regression/scoring_dependencies.yml` : Conda environment definition for the scoring environment (Docker image in which score.py is run). +- `diabetes_regression/ci_dependencies.yml` : Conda environment definition for the CI environment. + ### Code - `diabetes_regression/training/train.py` : a training step of an ML training pipeline. @@ -39,5 +43,4 @@ ### Scoring - `diabetes_regression/scoring/score.py` : a scoring script which is about to be packed into a Docker Image along with a model while being deployed to QA/Prod environment. -- `diabetes_regression/scoring/conda_dependencies.yml` : contains a list of dependencies required by score.py to be installed in a deployable Docker Image - `diabetes_regression/scoring/inference_config.yml`, deployment_config_aci.yml, deployment_config_aks.yml : configuration files for the [AML Model Deploy](https://marketplace.visualstudio.com/items?itemName=ms-air-aiagility.private-vss-services-azureml&ssr=false#overview) pipeline task for ACI and AKS deployment targets. 
diff --git a/docs/getting_started.md b/docs/getting_started.md index bbd20506..8b3167e4 100644 --- a/docs/getting_started.md +++ b/docs/getting_started.md @@ -171,7 +171,13 @@ Great, you now have the build pipeline set up which automatically triggers every **Note:** The build pipeline also supports building and publishing ML pipelines using R to train a model. This is enabled -by changing the `build-train-script` pipeline variable to either `diabetes_regression_build_train_pipeline_with_r.py`, or `diabetes_regression_build_train_pipeline_with_r_on_dbricks.py`. For pipeline training a model with R on Databricks you'll need +by changing the `build-train-script` pipeline variable to either of: +* `diabetes_regression_build_train_pipeline_with_r.py` to train a model +with R on Azure ML Compute. You will also need to add the +`r-essentials` Conda packages into `diabetes_regression/scoring_dependencies.yml` +and `diabetes_regression/training_dependencies.yml`. +* `diabetes_regression_build_train_pipeline_with_r_on_dbricks.py` +to train a model with R on Databricks. You will need to manually create a Databricks cluster and attach it to the ML Workspace as a compute (Values DB_CLUSTER_ID and DATABRICKS_COMPUTE_NAME variables should be specified). @@ -243,6 +249,7 @@ Make sure your webapp has the credentials to pull the image from the Azure Conta * You should edit the pipeline definition to remove unused stages. For example, if you are deploying to ACI and AKS, you should delete the unused `Deploy_Webapp` stage. * The sample pipeline generates a random value for a model hyperparameter (ridge regression [*alpha*](https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.Ridge.html)) to generate 'interesting' charts when testing the sample. In a real application you should use fixed hyperparameter values. 
You can [tune hyperparameter values using Azure ML](https://docs.microsoft.com/en-us/azure/machine-learning/service/how-to-tune-hyperparameters), and manage their values in Azure DevOps Variable Groups. * You may wish to enable [manual approvals](https://docs.microsoft.com/en-us/azure/devops/pipelines/process/approvals) before the deployment stages. +* You can install additional Conda or pip packages by modifying the YAML environment configurations under the `diabetes_regression` directory. Make sure to use fixed version numbers for all packages to ensure reproducibility, and use the same versions across environments. * You can explore aspects of model observability in the solution, such as: * **Logging**: navigate to the Application Insights instance linked to the Azure ML Portal, then to the Logs (Analytics) pane. The following sample query correlates HTTP requests with custom logs diff --git a/environment_setup/Dockerfile b/environment_setup/Dockerfile index 5e7b7581..4137967a 100644 --- a/environment_setup/Dockerfile +++ b/environment_setup/Dockerfile @@ -4,11 +4,16 @@ LABEL org.label-schema.vendor = "Microsoft" \ org.label-schema.url = "https://hub.docker.com/r/microsoft/mlopspython" \ org.label-schema.vcs-url = "https://github.com/microsoft/MLOpsPython" - +COPY diabetes_regression/ci_dependencies.yml /setup/ -COPY environment_setup/requirements.txt /setup/ - -RUN apt-get update && apt-get install gcc -y && pip install --upgrade -r /setup/requirements.txt && \ - conda install -c r r-essentials +RUN conda env create -f /setup/ci_dependencies.yml -CMD ["python"] \ No newline at end of file +# activate environment +ENV PATH /usr/local/envs/mlopspython_ci/bin:$PATH +RUN /bin/bash -c "source activate mlopspython_ci" + +# Verify conda installation. +# This serves as workaround for https://github.com/conda/conda/issues/8537 (conda env create doesn't fail +# if pip installation fails, for example due to a wrong package version). 
+# The `az` command is not available if pip has not run (and installed azure-cli). +RUN az --version diff --git a/environment_setup/install_requirements.sh b/environment_setup/install_requirements.sh old mode 100644 new mode 100755 index 930514a6..989e8b1e --- a/environment_setup/install_requirements.sh +++ b/environment_setup/install_requirements.sh @@ -24,6 +24,8 @@ # ARISING IN ANY WAY OUT OF THE USE OF THE SOFTWARE CODE, EVEN IF ADVISED OF THE # POSSIBILITY OF SUCH DAMAGE. +set -eux -python --version -pip install -r requirements.txt \ No newline at end of file +conda env create -f diabetes_regression/ci_dependencies.yml + +conda activate mlopspython_ci diff --git a/environment_setup/requirements.txt b/environment_setup/requirements.txt deleted file mode 100644 index babb1ddc..00000000 --- a/environment_setup/requirements.txt +++ /dev/null @@ -1,12 +0,0 @@ -pytest>=5.3 -pytest-cov>=2.8.1 -requests>=2.22 -numpy>=1.17 -pandas>=0.25 -scikit-learn>=0.21.3 -azureml-sdk>=1.0 -python-dotenv>=0.10.3 -flake8>=3.7 -flake8_formatter_junit_xml>=0.0.6 -tox>=3.14.3 -azure-cli==2.0.76 diff --git a/ml_service/pipelines/diabetes_regression_build_train_pipeline.py b/ml_service/pipelines/diabetes_regression_build_train_pipeline.py index 66913420..b127d9ee 100644 --- a/ml_service/pipelines/diabetes_regression_build_train_pipeline.py +++ b/ml_service/pipelines/diabetes_regression_build_train_pipeline.py @@ -28,14 +28,10 @@ def main(): print("aml_compute:") print(aml_compute) - run_config = RunConfiguration(conda_dependencies=CondaDependencies.create( - conda_packages=['numpy', 'pandas', - 'scikit-learn', 'tensorflow', 'keras'], - pip_packages=['azure', 'azureml-core', - 'azure-storage', - 'azure-storage-blob', - 'azureml-dataprep']) - ) + # Create a run configuration environment + conda_deps_file = "diabetes_regression/training_dependencies.yml" + conda_deps = CondaDependencies(conda_deps_file) + run_config = RunConfiguration(conda_dependencies=conda_deps) 
run_config.environment.docker.enabled = True config_envvar = {} if (e.collection_uri is not None and e.teamproject_name is not None): diff --git a/ml_service/pipelines/diabetes_regression_build_train_pipeline_with_r.py b/ml_service/pipelines/diabetes_regression_build_train_pipeline_with_r.py index 25537424..1c383d50 100644 --- a/ml_service/pipelines/diabetes_regression_build_train_pipeline_with_r.py +++ b/ml_service/pipelines/diabetes_regression_build_train_pipeline_with_r.py @@ -26,15 +26,11 @@ def main(): print("aml_compute:") print(aml_compute) - run_config = RunConfiguration(conda_dependencies=CondaDependencies.create( - conda_packages=['numpy', 'pandas', - 'scikit-learn', 'tensorflow', 'keras'], - pip_packages=['azure', 'azureml-core', - 'azure-storage', - 'azure-storage-blob']) - ) + # Create a run configuration environment + conda_deps_file = "diabetes_regression/training_dependencies.yml" + conda_deps = CondaDependencies(conda_deps_file) + run_config = RunConfiguration(conda_dependencies=conda_deps) run_config.environment.docker.enabled = True - run_config.environment.docker.base_image = "mcr.microsoft.com/mlops/python" train_step = PythonScriptStep( name="Train Model",