Merge branch 'main' into dependabot/github_actions/actions/checkout-4
fepegar authored Aug 30, 2024
2 parents c6e2eb6 + 0501bd4 commit 2bfb14a
Showing 38 changed files with 968 additions and 107 deletions.
3 changes: 2 additions & 1 deletion .github/workflows/hi-ml-pr.yml
@@ -226,7 +226,8 @@ jobs:
pip install ${{ steps.download.outputs.package_filename }}
# Set env vars so wheel is sent to azureml as a private package
echo "HIML_AZURE_WHEEL_FILENAME=${{ steps.download.outputs.package_filename }}" >> $GITHUB_ENV
# This had to be deprecated because the upload always uses SAS tokens encrypted with the now disabled storage account key
# echo "HIML_AZURE_WHEEL_FILENAME=${{ steps.download.outputs.package_filename }}" >> $GITHUB_ENV
# Test with pytest
make pytest_and_coverage
13 changes: 11 additions & 2 deletions Makefile
@@ -21,9 +21,11 @@ endef

## Package management

-# pip upgrade
+# pip upgrade.
+# As of PIP version 24.1, conditions like ">1.8.*" are no longer supported, but pytorch lightning
+# in the version we are using is still using this syntax. So we need to restrict the pip versions.
pip_upgrade:
-    python -m pip install --upgrade pip
+    python -m pip install --upgrade "pip<24.1"

# pip upgrade and install build requirements
pip_build: pip_upgrade
@@ -123,3 +125,10 @@ combine: pip_test
    coverage html && \
    coverage xml && \
    pycobertura show --format text --output coverage.txt coverage.xml

blobfuse:
    setup/prepare_blobfuse_installation.sh
    sudo apt-get install blobfuse fuse

mount:
    setup/mount_datastores.sh
22 changes: 17 additions & 5 deletions create_and_lock_environment.sh
@@ -1,5 +1,17 @@
#!/bin/bash

# Read the input file from argument 1 (default: primary_deps.yml)
# and the output file from argument 2 (default: environment.yml)
input_file="primary_deps.yml"
output_file="environment.yml"
if [ "$#" -gt 0 ]; then
    input_file=$1
    echo "Using input file: $input_file"
fi
if [ "$#" -gt 1 ]; then
    output_file=$2
    echo "Using output file: $output_file"
fi

os_name=$(uname)
if [[ ! $os_name == *"Linux"* ]]; then
echo "ERROR: cannot run environment locking in non-linux environment. Windows users can do this using WSL - https://docs.microsoft.com/en-us/windows/wsl/install"
@@ -9,7 +21,7 @@ else
fi

# get environment name from primary dependencies YAML file
-name_line="$(cat primary_deps.yml | grep 'name:')"
+name_line="$(cat $input_file | grep 'name:')"
IFS=':' read -ra name_arr <<< "$name_line"
env_name="${name_arr[1]}"

@@ -19,7 +31,7 @@ echo "Building Conda environment: $env_name"
export CONDA_ALWAYS_YES="true"
conda activate base
conda env remove --name $env_name
-conda env create --file primary_deps.yml
+conda env create --file $input_file

# export new environment to environment.yml
echo "Exporting environment $env_name to environment.tmp1"
@@ -39,7 +51,7 @@ while IFS='' read -r line; do
fi
done < environment.tmp1 > environment.tmp2
echo "Creating final environment.yml with warning line"
echo "# WARNING - DO NOT EDIT THIS FILE MANUALLY" > environment.yml
echo "# To update, please modify 'primary_deps.yml' and then run the locking script 'create_and_lock_environment.sh'">> environment.yml
cat environment.tmp2 >> environment.yml
echo "# WARNING - DO NOT EDIT THIS FILE MANUALLY" > $output_file
echo "# To update, please modify '$input_file' and then run the locking script 'create_and_lock_environment.sh'">> $output_file
cat environment.tmp2 >> $output_file
rm environment.tmp1 environment.tmp2
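
With this change, the locking script accepts an optional input file (argument 1) and output file (argument 2). A usage sketch (the second invocation mirrors the new `env_hello_world_lock` target in the `hi-ml-azure` Makefile below):

```bash
# Default: lock primary_deps.yml into environment.yml
./create_and_lock_environment.sh

# Custom input and output files, as used for the hello_world test environment
./create_and_lock_environment.sh primary_deps_hello_world.yml environment_hello_world.yml
```
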
33 changes: 33 additions & 0 deletions docs/source/datasets.md
@@ -168,6 +168,39 @@ input_folder = run_info.input_datasets[0]

This is also true when running locally - if `local_folder` is not specified and an AzureML workspace can be found, then the dataset will be downloaded or mounted to the `target_folder`.

### Inferring the location of a dataset on AML

If you cannot, or do not wish to, specify the target folder where the dataset will be mounted or downloaded, you can pass a `data_name` keyword argument instead of `target_folder`.
This name can then be referenced in an argument of the script, and AML will interpolate its value so that the actual path is readable at runtime.

```python
from health_azure import DatasetConfig, submit_to_azure_if_needed
input_dataset = DatasetConfig(name="my_folder",
                              datastore="my_datastore",
                              use_mounting=True,
                              data_name="mnist_dir",
                              )
run_info = submit_to_azure_if_needed(...,
                                     input_datasets=[input_dataset],
                                     script_params=["--data_dir", "${{inputs.mnist_dir}}"],
                                     )
```
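
For illustration, the submitted script could read the interpolated value like this (a minimal sketch; the `--data_dir` argument name matches the `script_params` above):

```python
# Hypothetical sketch of the submitted script: AML replaces "${{inputs.mnist_dir}}"
# in script_params with the actual mount/download path before the script starts.
import argparse

parser = argparse.ArgumentParser()
parser.add_argument("--data_dir", type=str)
args = parser.parse_args()
print(f"Input dataset is available at: {args.data_dir}")
```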

If `data_name` is used for an output dataset, the prefix `outputs.` should be used instead of `inputs.`:

```python
from health_azure import DatasetConfig, submit_to_azure_if_needed
output_dataset = DatasetConfig(name="new_dataset",
                               datastore="my_datastore",
                               use_mounting=True,
                               data_name="output_dir",
                               )
run_info = submit_to_azure_if_needed(...,
                                     output_datasets=[output_dataset],
                                     script_params=["--output_dir", "${{outputs.output_dir}}"],
                                     )
```

### Overwriting existing output datasets

When creating an output dataset with the same name as an existing dataset, the default behaviour of `hi-ml` is to overwrite the existing dataset. This is because, if a run fails during the upload stage, corrupt files may be created; allowing overwriting means that these corrupt datasets will not cause errors. If you wish to disable this behaviour, it can be controlled using the `overwrite_existing` parameter (only available in SDK v1, hence setting `strictly_aml_v1=True`):
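
A minimal sketch of what this could look like, assuming `overwrite_existing` is accepted by `DatasetConfig` as the text above suggests:

```python
from health_azure import DatasetConfig, submit_to_azure_if_needed

# Hypothetical sketch: fail instead of overwriting an existing output dataset.
output_dataset = DatasetConfig(name="new_dataset",
                               datastore="my_datastore",
                               overwrite_existing=False,
                               )
run_info = submit_to_azure_if_needed(...,
                                     output_datasets=[output_dataset],
                                     strictly_aml_v1=True,  # overwrite_existing requires the v1 SDK
                                     )
```
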
9 changes: 9 additions & 0 deletions hi-ml-azure/.vscode/launch.json
@@ -4,6 +4,15 @@
// For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387
"version": "0.2.0",
"configurations": [
    {
        "name": "Python: Debug Tests",
        "type": "python",
        "request": "launch",
        "program": "${file}",
        "purpose": ["debug-test"],
        "console": "integratedTerminal",
        "justMyCode": false
    },
    {
        "name": "Run example script in AzureML",
        "type": "python",
22 changes: 21 additions & 1 deletion hi-ml-azure/Makefile
@@ -72,7 +72,8 @@ pytest_fast:

# run pytest with coverage on package
# Output the slowest tests via the --durations flag.

# For diagnostics, use this extra flag to output all captured stdout:
# pytest -rP testazure/testazure/test_azure_util.py::test_download_run_file_during_run
call_pytest_and_coverage:
pytest --durations=50 --cov=health_azure --cov-branch --cov-report=html --cov-report=xml --cov-report=term-missing --cov-config=.coveragerc testazure

@@ -86,3 +87,22 @@ test_all: pip_test call_flake8 call_mypy call_pytest_and_coverage
example: pip_local
    echo 'edit src/health/azure/examples/elevate_this.py to reference your compute_cluster_name'
    cd src/health/azure/examples; python elevate_this.py --azureml --message 'running example from makefile'

# Create a local Conda environment
env:
    conda env create --file environment.yml

# Install Conda from scratch
conda:
    wget https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh
    bash Miniconda3-latest-Linux-x86_64.sh -b
    rm Miniconda3-latest-Linux-x86_64.sh
    conda update -y -n base conda
    conda install -y -n base conda-libmamba-solver
    conda config --set solver libmamba

env_hello_world_lock:
    ../create_and_lock_environment.sh primary_deps_hello_world.yml environment_hello_world.yml

env_hello_world:
    conda env create --file environment_hello_world.yml
68 changes: 68 additions & 0 deletions hi-ml-azure/README.md
@@ -0,0 +1,68 @@
# HI-ML-Azure

This folder contains the source code for PyPI package `hi-ml-azure`.

## Testing an AzureML setup

To test if your AzureML setup is correct, follow these steps to set up Python on your local machine (summarized in the shell snippet after the list):

- Change the working directory to `<repo_root>/hi-ml-azure`
- Run `make conda` to install MiniConda
- Run `make env_hello_world` to build a simple Python environment with the necessary packages
- Run `conda activate hello_world` to activate the environment
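
In a Linux shell, these steps look like the following:

```bash
cd <repo_root>/hi-ml-azure
make conda                   # install MiniConda and the libmamba solver
make env_hello_world         # create the hello_world Conda environment
conda activate hello_world
```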

Then follow these steps to test the AzureML setup:

- Download the `config.json` file from your AzureML workspace and place it in the folder `<repo_root>/hi-ml-azure`.
There is a `Download config.json` button once you expand the dropdown menu on the top-right of your AzureML workspace.
This is not in the core Azure portal, but only visible once you open `AzureML Studio` from the portal.
The file `config.json` should look like this:

```json
{
"subscription_id": "your-subscription-id",
"resource_group": "your-resource-group",
"workspace_name": "your-workspace-name"
}
```

- To run the test script, you must have created a compute cluster in your AzureML workspace.
  You can do this by clicking on `Compute` in the left-hand menu, selecting the "Compute clusters" tab, and
  then clicking `+ New` to create a new compute cluster. A cheap CPU-only VM type, like `STANDARD_DS3_V2`, is
  sufficient. Give the cluster a name, and use the same name in the script below.
- Log into Azure by running `az login` in the terminal.
- Start the test script via `python hello_world.py --cluster <your_compute_cluster_name>`.
If successful, this will print out "Successfully queued run..." at the end, and a "Run URL" that points to your job.
- Open the "Run URL" that was printed on the console in the browser to monitor the run.

## Testing access to OpenAI from an AzureML job

Requirements:

- Your compute cluster has a managed identity assigned.
- You have an OpenAI deployment URL and model name.
- The managed identity has "Cognitive Services OpenAI User" access to the OpenAI deployment (see the CLI sketch below for one way to grant this).
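
For example, the role assignment could be granted with the Azure CLI (a sketch; the principal and resource IDs are placeholders):

```bash
# Hypothetical sketch: grant the cluster's managed identity access to the OpenAI resource
az role assignment create \
    --assignee "<managed-identity-principal-id>" \
    --role "Cognitive Services OpenAI User" \
    --scope "<openai-resource-id>"
```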

Run the following command to test access to OpenAI from an AzureML job:

```bash
python hello_world.py --cluster <your_compute_cluster_name> --openai_url <URL> --openai_model <Model>
```

If successful, this will print out the response from OpenAI.

## Testing access to datasets

Requirements:

- You have a datastore registered in your AzureML workspace.
- You have a dataset registered in your AzureML workspace. For example, upload an empty file to a folder `hello_world`
in your storage account, and register this folder as a dataset called `hello_world` in your AzureML workspace.
- You have a folder in your storage account for an output dataset. For that, upload an empty file to a folder
`hello_world_output` and register this folder as a dataset `hello_world_output` in your AzureML workspace.

Then run the following command to test access to datasets:

```bash
python hello_world.py --cluster <your_compute_cluster_name> --input_dataset hello_world --output_dataset hello_world_output
```
