diff --git a/CHANGELOG.md b/CHANGELOG.md index 572b30151..7b0899f96 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -40,9 +40,12 @@ any large models anymore because data loaders ran out of memory. LightningContainer models can get stuck at test set inference. - ([#498](https://github.com/microsoft/InnerEye-DeepLearning/pull/498)) Workaround for the problem that downloading multiple large checkpoints can time out. +- ([#515](https://github.com/microsoft/InnerEye-DeepLearning/pull/515)) Workaround for occasional issues with dataset +mounting and running matplotlib on some machines. Re-enabled a disabled test. ### Removed +- ([#542](https://github.com/microsoft/InnerEye-DeepLearning/pull/542)) Removed Windows test leg from build pipeline. ### Deprecated diff --git a/InnerEye/Azure/azure_runner.py b/InnerEye/Azure/azure_runner.py index 1b9784d09..888623c35 100644 --- a/InnerEye/Azure/azure_runner.py +++ b/InnerEye/Azure/azure_runner.py @@ -229,6 +229,9 @@ def get_or_create_python_environment(azure_config: AzureConfig, # Occasionally uploading data during the run takes too long, and makes the job fail. Default is 300. "AZUREML_RUN_KILL_SIGNAL_TIMEOUT_SEC": "900", "MKL_SERVICE_FORCE_INTEL": "1", + # Switching to a new software stack in AML for mounting datasets + "RSLEX_DIRECT_VOLUME_MOUNT": "true", + "RSLEX_DIRECT_VOLUME_MOUNT_MAX_CACHE_SIZE": "1", **(source_config.environment_variables or {}) } base_image = "mcr.microsoft.com/azureml/openmpi3.1.2-cuda10.2-cudnn8-ubuntu18.04" diff --git a/InnerEye/ML/runner.py b/InnerEye/ML/runner.py index 9ba94d2d4..2ab2c6f4b 100755 --- a/InnerEye/ML/runner.py +++ b/InnerEye/ML/runner.py @@ -7,6 +7,8 @@ import warnings from pathlib import Path +import matplotlib + # Suppress all errors here because the imports after code cause loads of warnings. We can't specifically suppress # individual warnings only. 
# flake8: noqa @@ -68,11 +70,12 @@ def initialize_rpdb() -> None: f"kill -TRAP ; nc 127.0.0.1 {rpdb_port}") -def suppress_logging_noise() -> None: +def package_setup_and_hacks() -> None: """ - Reduce the logging level for some of the used libraries, which are particularly talkative in DEBUG mode. - Usually when running in DEBUG mode, we want diagnostics about the model building itself, but not for the - underlying libraries. + Set up the Python packages where needed. In particular, reduce the logging level for some of the used + libraries, which are particularly talkative in DEBUG mode. Usually when running in DEBUG mode, we want + diagnostics about the model building itself, but not for the underlying libraries. + It also adds workarounds for known issues in some packages. """ # Numba code generation is extremely talkative in DEBUG mode, disable that. logging.getLogger('numba').setLevel(logging.WARNING) @@ -89,6 +92,10 @@ def suppress_logging_noise() -> None: # This is working around a spurious error message thrown by MKL, see # https://github.com/pytorch/pytorch/issues/37377 os.environ['MKL_THREADING_LAYER'] = 'GNU' + # Workaround for issues with matplotlib on some X servers, see + # https://stackoverflow.com/questions/45993879/matplot-lib-fatal-io-error-25-inappropriate-ioctl-for-device-on-x + # -server-loc + matplotlib.use('Agg') class Runner: @@ -279,7 +286,7 @@ def run_in_situ(self) -> None: # build itself, but not the tons of debug information that AzureML submissions create. # Suppress the logging from all processes but the one for GPU 0 on each node, to make log files more readable logging_to_stdout(self.azure_config.log_level if is_local_rank_zero() else "ERROR") - suppress_logging_noise() + package_setup_and_hacks() if is_global_rank_zero(): self.print_git_tags() # For the PR build in AzureML, we can either pytest, or the training of the simple PR model. 
Running both diff --git a/InnerEye/ML/utils/io_util.py b/InnerEye/ML/utils/io_util.py index 02faeafb6..cc13155c1 100644 --- a/InnerEye/ML/utils/io_util.py +++ b/InnerEye/ML/utils/io_util.py @@ -2,23 +2,22 @@ # Copyright (c) Microsoft Corporation. All rights reserved. # Licensed under the MIT License (MIT). See LICENSE in the repo root for license information. # ------------------------------------------------------------------------------------------ +import shutil +import uuid from copy import copy from dataclasses import dataclass from enum import Enum from pathlib import Path -import shutil from typing import Dict, Generic, Iterable, List, Optional, Tuple, Type, TypeVar, Union -import uuid - -import h5py -from numpy.lib.npyio import NpzFile -from skimage.transform import resize import SimpleITK as sitk +import h5py import numpy as np import pandas as pd import pydicom as dicom import torch +from numpy.lib.npyio import NpzFile +from skimage.transform import resize from tabulate import tabulate from InnerEye.Common import common_util diff --git a/README.md b/README.md index 2d26806a6..c5eb2c5ae 100644 --- a/README.md +++ b/README.md @@ -132,15 +132,10 @@ Details can be found [here](docs/deploy_on_aml.md). ## Contact -Please send an email to InnerEyeInfo@microsoft.com if you would like further information about this project. - If you have any feature requests, or find issues in the code, please create an [issue on GitHub](https://github.com/microsoft/InnerEye-DeepLearning/issues). -If you are interested in using the InnerEye Deep Learning Toolkit to develop your own products and services, -please email InnerEyeCommercial@microsoft.com. We can also provide input on using the toolbox with -[Azure Stack Hub](https://azure.microsoft.com/en-us/products/azure-stack/hub/), a hybrid cloud solution -that allows for on-premise medical image analysis that complies with data handling regulations. 
+Please send an email to InnerEyeInfo@microsoft.com if you would like further information about this project. ## Publications diff --git a/Tests/AfterTraining/test_after_training.py b/Tests/AfterTraining/test_after_training.py index 2859c9568..e15ebd69b 100644 --- a/Tests/AfterTraining/test_after_training.py +++ b/Tests/AfterTraining/test_after_training.py @@ -369,7 +369,6 @@ def test_training_2nodes(test_output_dirs: OutputFolderForTests) -> None: @pytest.mark.after_training_2node -@pytest.mark.skip("Test times out for unknown reasons.") def test_recovery_on_2_nodes(test_output_dirs: OutputFolderForTests) -> None: args_list = ["--model", "BasicModel2EpochsMoreData", "--azureml", "True", diff --git a/azure-pipelines/build-pr.yml b/azure-pipelines/build-pr.yml index 7dfe66a8f..47dd231f0 100644 --- a/azure-pipelines/build-pr.yml +++ b/azure-pipelines/build-pr.yml @@ -21,7 +21,7 @@ jobs: pool: vmImage: 'windows-2019' steps: - - template: build.yaml + - template: build_windows.yaml - job: Linux pool: diff --git a/azure-pipelines/build.yaml b/azure-pipelines/build.yaml index 1a5d8dfeb..a03b3f5a7 100644 --- a/azure-pipelines/build.yaml +++ b/azure-pipelines/build.yaml @@ -1,19 +1,6 @@ steps: - template: checkout.yml - - task: CredScan@3 - condition: and(succeeded(), eq( variables['Agent.OS'], 'Windows_NT' )) - - - task: PostAnalysis@1 - condition: and(succeeded(), eq( variables['Agent.OS'], 'Windows_NT' )) - displayName: 'Post Analysis' - inputs: - CredScan: true - - - script: echo %NUMBER_OF_PROCESSORS% - condition: and(succeeded(), eq( variables['Agent.OS'], 'Windows_NT' )) - displayName: Print processors - - bash: | conda env create --file environment.yml --name InnerEye --quiet source activate InnerEye diff --git a/azure-pipelines/build_windows.yaml b/azure-pipelines/build_windows.yaml new file mode 100644 index 000000000..25b67bac6 --- /dev/null +++ b/azure-pipelines/build_windows.yaml @@ -0,0 +1,20 @@ +steps: + - template: checkout_windows.yml + + - task: 
CredScan@3 + condition: and(succeeded(), eq( variables['Agent.OS'], 'Windows_NT' )) + + - task: PostAnalysis@1 + condition: and(succeeded(), eq( variables['Agent.OS'], 'Windows_NT' )) + displayName: 'Post Analysis' + inputs: + CredScan: true + + - task: ComponentGovernanceComponentDetection@0 + condition: succeeded() + inputs: + scanType: 'Register' + verbosity: 'Normal' + alertWarningLevel: 'High' + failOnAlert: true + failOnStderr: true diff --git a/azure-pipelines/checkout.yml b/azure-pipelines/checkout.yml index 88665f705..05efcfd5f 100644 --- a/azure-pipelines/checkout.yml +++ b/azure-pipelines/checkout.yml @@ -4,10 +4,7 @@ steps: submodules: true - bash: | - if [ $(Agent.OS) = 'Windows_NT' ] - then subdir=Scripts - else subdir=bin - fi + subdir=bin echo "Adding this directory to PATH: $CONDA/$subdir" echo "##vso[task.prependpath]$CONDA/$subdir" displayName: Add conda to PATH @@ -19,7 +16,6 @@ steps: conda list displayName: Print conda version and initial package list - # Linux only; not needed for Windows - bash: | sudo chown -R $USER /usr/share/miniconda condition: and(succeeded(), eq( variables['Agent.OS'], 'Linux' )) diff --git a/azure-pipelines/checkout_windows.yml b/azure-pipelines/checkout_windows.yml new file mode 100644 index 000000000..44a0a26b9 --- /dev/null +++ b/azure-pipelines/checkout_windows.yml @@ -0,0 +1,17 @@ +steps: + - checkout: self + lfs: true + submodules: true + + - bash: | + subdir=Scripts + echo "Adding this directory to PATH: $CONDA/$subdir" + echo "##vso[task.prependpath]$CONDA/$subdir" + displayName: Add conda to PATH + condition: succeeded() + + - bash: | + conda install conda=4.8.3 -y + conda --version + conda list + displayName: Print conda version and initial package list