This repository has been archived by the owner on Mar 21, 2024. It is now read-only.

Upgrade to PyTorch 1.10 #585

Merged 25 commits on Jan 10, 2022
1 change: 1 addition & 0 deletions CHANGELOG.md
@@ -42,6 +42,7 @@ jobs that run in AzureML.

### Changed
- ([#588](https://github.com/microsoft/InnerEye-DeepLearning/pull/588)) Replace SciPy with PIL.PngImagePlugin.PngImageFile to load png files.
- ([#585](https://github.com/microsoft/InnerEye-DeepLearning/pull/585)) Switching to PyTorch 1.10.0 and torchvision 0.11.1
- ([#576](https://github.com/microsoft/InnerEye-DeepLearning/pull/576)) The console output is no longer written to stdout.txt because AzureML handles that better now
- ([#531](https://github.com/microsoft/InnerEye-DeepLearning/pull/531)) Updated PL to 1.3.8, torchmetrics and pl-bolts and changed relevant metrics and SSL code API.
- ([#555](https://github.com/microsoft/InnerEye-DeepLearning/pull/555)) Make the SSLContainer compatible with new datasets
1 change: 1 addition & 0 deletions InnerEye/ML/configs/classification/DummyClassification.py
@@ -34,6 +34,7 @@ def __init__(self) -> None:
self.expected_image_size_zyx = (4, 5, 7)
# Trying to run DDP from the test suite hangs, hence restrict to single GPU.
self.max_num_gpus = 1
self.pl_deterministic = True

def get_model_train_test_dataset_splits(self, dataset_df: pd.DataFrame) -> DatasetSplits:
return DatasetSplits.from_proportions(
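The new `pl_deterministic` flag turns on deterministic training for these small test configs so that the hard-coded regression values further down remain stable. As a rough illustration (not the actual InnerEye wiring), a flag like this typically maps onto the PyTorch Lightning 1.x `Trainer` plus a fixed seed; the helper name and seed value below are assumptions:

```python
# Hedged sketch: one plausible way a pl_deterministic-style flag feeds into a
# PyTorch Lightning 1.x Trainer. Helper name and seed value are illustrative only.
from pytorch_lightning import Trainer, seed_everything

def build_trainer(pl_deterministic: bool, max_epochs: int = 1) -> Trainer:
    if pl_deterministic:
        # Fix Python/NumPy/PyTorch seeds (and dataloader worker seeds) for reproducible runs.
        seed_everything(1, workers=True)
    # deterministic=True forces deterministic algorithms; benchmark=False avoids
    # cuDNN autotuning, which can otherwise introduce run-to-run variation.
    return Trainer(max_epochs=max_epochs,
                   deterministic=pl_deterministic,
                   benchmark=not pl_deterministic)
```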
2 changes: 0 additions & 2 deletions InnerEye/ML/models/architectures/sequential/gru.py
@@ -37,10 +37,8 @@ def __init__(self, input_size: int, hidden_size: int, use_layer_norm: bool = Fal
self.ln_n = nn.LayerNorm(self.hidden_size) if use_layer_norm else Identity()

def forward(self, input: torch.Tensor, hx: Optional[torch.Tensor] = None) -> torch.Tensor: # type: ignore
self.check_forward_input(input)
if hx is None:
hx = input.new_zeros(size=(input.size(0), self.hidden_size), requires_grad=False)
self.check_forward_hidden(input, hx)

ih = input.mm(self.weight_ih.t())
hh = hx.mm(self.weight_hh.t())
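The `check_forward_input` / `check_forward_hidden` calls are dropped here because they are deprecated on `RNNCellBase` in recent PyTorch releases, including 1.10. For orientation, a self-contained sketch of a layer-norm GRU cell forward pass with the same tensor algebra as the diff; the class name, the `nn.GRUCell` base, and the exact gate arithmetic are illustrative and may differ from the actual InnerEye implementation:

```python
# Hedged sketch of a layer-norm GRU cell whose forward pass no longer calls the
# deprecated check_forward_* helpers. Gate layout (reset, update, new) follows the
# standard GRU formulation; names mirror the diff (weight_ih, weight_hh, ln_*).
from typing import Optional

import torch
import torch.nn as nn

class LayerNormGRUCellSketch(nn.GRUCell):
    def __init__(self, input_size: int, hidden_size: int, use_layer_norm: bool = False):
        super().__init__(input_size, hidden_size, bias=False)
        self.ln_r = nn.LayerNorm(hidden_size) if use_layer_norm else nn.Identity()
        self.ln_z = nn.LayerNorm(hidden_size) if use_layer_norm else nn.Identity()
        self.ln_n = nn.LayerNorm(hidden_size) if use_layer_norm else nn.Identity()

    def forward(self, input: torch.Tensor, hx: Optional[torch.Tensor] = None) -> torch.Tensor:
        if hx is None:
            hx = input.new_zeros(size=(input.size(0), self.hidden_size), requires_grad=False)
        ih = input.mm(self.weight_ih.t())  # (batch, 3 * hidden_size)
        hh = hx.mm(self.weight_hh.t())     # (batch, 3 * hidden_size)
        i_r, i_z, i_n = ih.chunk(3, dim=1)
        h_r, h_z, h_n = hh.chunk(3, dim=1)
        r = torch.sigmoid(self.ln_r(i_r + h_r))   # reset gate
        z = torch.sigmoid(self.ln_z(i_z + h_z))   # update gate
        n = torch.tanh(self.ln_n(i_n + r * h_n))  # candidate hidden state
        return (1.0 - z) * n + z * hx
```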
9 changes: 6 additions & 3 deletions InnerEye/ML/visualizers/model_summary.py
@@ -11,7 +11,7 @@
import numpy as np
import torch
from torch.utils.hooks import RemovableHandle
from torchprof.profile import Profile
import torch.profiler as profiler

from InnerEye.Common.common_util import logging_only_to_file
from InnerEye.Common.fixed_paths import DEFAULT_MODEL_SUMMARIES_DIR_PATH
@@ -189,12 +189,15 @@ def print_summary() -> None:

# Register the forward-pass hooks, profile the model, and restore its state
self.model.apply(self._register_hook)
with Profile(self.model, use_cuda=self.use_gpu) as prof:
activities = [profiler.ProfilerActivity.CPU]
if self.use_gpu:
activities.append(profiler.ProfilerActivity.CUDA)
with profiler.profile(activities=activities, record_shapes=True) as prof:
forward_preserve_state(self.model, input_tensors) # type: ignore

# Log the model summary: tensor shapes, num of parameters, memory requirement, and forward pass time
logging.info(self.model)
logging.info('\n' + prof.display(show_events=False))
logging.info('\n' + prof.key_averages().table())
print_summary()

# Remove the hooks via handles
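Model profiling now uses the built-in `torch.profiler` module instead of the third-party `torchprof` package (which is also removed from `environment.yml` below). A minimal, standalone sketch of the new pattern; the toy model and sort options are placeholders, not InnerEye code:

```python
# Hedged sketch of profiling a forward pass with torch.profiler (available since
# PyTorch 1.8.1), mirroring the pattern introduced in model_summary.py.
import torch
import torch.nn as nn
import torch.profiler as profiler

model = nn.Sequential(nn.Linear(16, 32), nn.ReLU(), nn.Linear(32, 4))
example_input = torch.randn(8, 16)

activities = [profiler.ProfilerActivity.CPU]
if torch.cuda.is_available():
    activities.append(profiler.ProfilerActivity.CUDA)
    model, example_input = model.cuda(), example_input.cuda()

with profiler.profile(activities=activities, record_shapes=True) as prof:
    with torch.no_grad():
        model(example_input)

# key_averages() aggregates per-operator timings; table() renders a text summary
# suitable for logging, replacing torchprof's prof.display().
print(prof.key_averages().table(sort_by="cpu_time_total", row_limit=20))
```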
1 change: 1 addition & 0 deletions Tests/ML/configs/ClassificationModelForTesting.py
@@ -35,6 +35,7 @@ def __init__(self, conv_in_3d: bool = True, mean_teacher_model: bool = False) ->
self.conv_in_3d = conv_in_3d
# Trying to run DDP from the test suite hangs, hence restrict to single GPU.
self.max_num_gpus = 1
self.pl_deterministic = True

def get_model_train_test_dataset_splits(self, dataset_df: pd.DataFrame) -> DatasetSplits:
return DatasetSplits.from_proportions(
1 change: 1 addition & 0 deletions Tests/ML/configs/ClassificationModelForTesting2D.py
@@ -33,6 +33,7 @@ def __init__(self, conv_in_3d: bool = True, mean_teacher_model: bool = False) ->
)
self.expected_image_size_zyx = (5, 7)
self.conv_in_3d = conv_in_3d
self.pl_deterministic = True

def get_model_train_test_dataset_splits(self, dataset_df: pd.DataFrame) -> DatasetSplits:
return DatasetSplits.from_proportions(
1 change: 1 addition & 0 deletions Tests/ML/configs/DummyModel.py
@@ -67,6 +67,7 @@ def __init__(self, **kwargs: Any) -> None:
self.add_and_validate(kwargs)
# Trying to run DDP from the test suite hangs, hence restrict to single GPU.
self.max_num_gpus = 1
self.pl_deterministic = True

def get_model_train_test_dataset_splits(self, dataset_df: pd.DataFrame) -> DatasetSplits:
return DatasetSplits(train=dataset_df[dataset_df.subject.isin(self.train_subject_ids)],
4 changes: 2 additions & 2 deletions Tests/ML/models/test_scalar_model.py
@@ -126,10 +126,10 @@ def test_train_classification_model(class_name: str, test_output_dirs: OutputFol
f"""epoch,subject,prediction_target,model_output,label,data_split,cross_validation_split_index
0,S2,{class_name},0.529514,1,Train,-1
0,S4,{class_name},0.521659,0,Train,-1
1,S4,{class_name},0.521482,0,Train,-1
1,S2,{class_name},0.529475,1,Train,-1
2,S4,{class_name},0.521305,0,Train,-1
1,S4,{class_name},0.521482,0,Train,-1
2,S2,{class_name},0.529437,1,Train,-1
2,S4,{class_name},0.521305,0,Train,-1
3,S2,{class_name},0.529399,1,Train,-1
3,S4,{class_name},0.521128,0,Train,-1
"""
2 changes: 1 addition & 1 deletion Tests/ML/runners/test_runner.py
@@ -371,7 +371,7 @@ def run_model_inference_train_and_test(test_output_dirs: OutputFolderForTests,
def test_logging_to_file(test_output_dirs: OutputFolderForTests) -> None:
# Log file should go to a new, non-existent folder, 2 levels deep
file_path = test_output_dirs.root_dir / "subdir1" / "subdir2" / "logfile.txt"
assert common_util.logging_to_file_handler is None
common_util.logging_to_file_handler = None
common_util.logging_to_file(file_path)
assert common_util.logging_to_file_handler is not None
log_line = "foo bar"
14 changes: 7 additions & 7 deletions Tests/ML/test_model_training.py
@@ -104,11 +104,11 @@ def _mean_list(lists: List[List[float]]) -> List[float]:
train_config.check_exclusive = False

if machine_has_gpu:
expected_train_losses = [0.4552919, 0.4548529]
expected_val_losses = [0.455389, 0.455306]
expected_train_losses = [0.4554231, 0.4550124]
expected_val_losses = [0.4553894, 0.4553061]
else:
expected_train_losses = [0.4552919, 0.4548538]
expected_val_losses = [0.4553891, 0.4553060]
expected_train_losses = [0.4554231, 0.4550112]
expected_val_losses = [0.4553893, 0.4553061]
loss_absolute_tolerance = 1e-6
expected_learning_rates = [train_config.l_rate, 5.3589e-4]

@@ -154,7 +154,7 @@ def assert_all_close(metric: str, expected: List[float], **kwargs: Any) -> None:
# and be the same across 'region' and 'region_1' because they derive from the same Nifti files.
# The following values are read off directly from the results of compute_dice_across_patches in the training loop
# This checks that averages are computed correctly, and that metric computers are reset after each epoch.
train_voxels = [[82860.0, 83212.0, 83087.0], [82831.0, 82900.0, 83212.0]]
train_voxels = [[82765.0, 83212.0, 82740.0], [82831.0, 82647.0, 83255.0]]
val_voxels = [[82765.0, 83212.0], [82765.0, 83212.0]]
_check_voxel_count(model_training_result.train_results_per_epoch(), _mean_list(train_voxels), "Train")
_check_voxel_count(model_training_result.val_results_per_epoch(), _mean_list(val_voxels), "Val")
@@ -170,8 +170,8 @@ def assert_all_close(metric: str, expected: List[float], **kwargs: Any) -> None:
# The following values are read off directly from the results of compute_dice_across_patches in the
# training loop. Results are slightly different for GPU, hence use a larger tolerance there.
dice_tolerance = 1e-3 if machine_has_gpu else 4.5e-4
train_dice_region = [[0.0, 0.0, 4.0282e-04], [0.0372, 0.0388, 0.1091]]
train_dice_region1 = [[0.4785, 0.4807, 0.4834], [0.4832, 0.4800, 0.4628]]
train_dice_region = [[0.0, 0.0, 0.0], [0.0376, 0.0343, 0.1017]]
train_dice_region1 = [[0.4845, 0.4814, 0.4829], [0.4822, 0.4747, 0.4426]]
# There appears to be some amount of non-determinism here: When using a tolerance of 1e-4, we get occasional
# test failures on Linux in the cloud (not on Windows, not on AzureML). Unclear where it comes from. Even when
# failing here, the losses match up to the expected tolerance.
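The expected losses, voxel counts, and Dice values above are regression baselines that shifted slightly with PyTorch 1.10, which is why they are compared with explicit absolute tolerances (`loss_absolute_tolerance`, `dice_tolerance`). A hedged sketch of that style of check; the helper name and tolerance are illustrative, not the test's exact code:

```python
# Hedged sketch of a tolerance-based regression check on per-epoch metrics.
from typing import Sequence

import numpy as np

def assert_metrics_close(actual: Sequence[float], expected: Sequence[float], atol: float = 1e-6) -> None:
    assert np.allclose(actual, expected, atol=atol), \
        f"Metric mismatch: got {list(actual)}, expected {list(expected)} (atol={atol})"

# Example with the GPU training-loss baselines quoted above.
assert_metrics_close([0.4554231, 0.4550124], [0.4554231, 0.4550124], atol=1e-6)
```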
26 changes: 13 additions & 13 deletions Tests/ML/visualizers/test_model_summary.py
@@ -3,13 +3,13 @@
# Licensed under the MIT License (MIT). See LICENSE in the repo root for license information.
# ------------------------------------------------------------------------------------------
import logging
import torch
from abc import ABC
from typing import List, Tuple

import torch

from InnerEye.Common.common_util import logging_to_stdout
from InnerEye.Common.common_util import logging_to_stdout, change_working_directory
from InnerEye.Common.fixed_paths import DEFAULT_MODEL_SUMMARIES_DIR_PATH
from InnerEye.Common.output_directories import OutputFolderForTests
from InnerEye.ML.configs.classification.GlaucomaPublic import GlaucomaPublic
from InnerEye.ML.models.architectures.base_model import BaseSegmentationModel, CropSizeConstraints
from InnerEye.ML.models.architectures.classification.image_encoder_with_mlp import ImageEncoderWithMlp, \
@@ -79,19 +79,19 @@ def test_model_summary_on_classification2() -> None:
assert summarizer.n_trainable_params != 0


def test_log_model_summary_to_file() -> None:
def test_log_model_summary_to_file(test_output_dirs: OutputFolderForTests) -> None:
model = MyFavModel()
input_size = (16, 16, 32)
expected_log_file = DEFAULT_MODEL_SUMMARIES_DIR_PATH / "model_log001.txt"
if expected_log_file.exists():
with change_working_directory(test_output_dirs.root_dir):
expected_log_file = DEFAULT_MODEL_SUMMARIES_DIR_PATH / "model_log001.txt"
if expected_log_file.exists():
expected_log_file.unlink()
model.generate_model_summary(input_size, log_summaries_to_files=True)
assert expected_log_file.exists()
assert len(expected_log_file.read_text().splitlines()) >= 3
expected_log_file.unlink()
model.generate_model_summary(input_size, log_summaries_to_files=True)
assert expected_log_file.exists()
with expected_log_file.open() as inpt:
assert len(inpt.readlines()) >= 3
expected_log_file.unlink()
model.generate_model_summary(input_size, log_summaries_to_files=False)
assert not expected_log_file.exists()
model.generate_model_summary(input_size, log_summaries_to_files=False)
assert not expected_log_file.exists()


class MyFavModel(BaseSegmentationModel, ABC):
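`change_working_directory` (newly imported at the top of this test) scopes the run to the pytest output folder, so that `DEFAULT_MODEL_SUMMARIES_DIR_PATH`, presumably resolved relative to the working directory, ends up in a temporary location instead of the repository root. A plausible sketch of such a context manager, assuming the usual chdir-and-restore pattern; the real InnerEye helper may differ:

```python
# Hedged sketch of a change_working_directory context manager; the actual helper in
# InnerEye.Common.common_util may differ in details.
import os
from contextlib import contextmanager
from pathlib import Path
from typing import Generator, Union

@contextmanager
def change_working_directory(path: Union[str, Path]) -> Generator[None, None, None]:
    """Temporarily switch the process working directory, restoring it on exit."""
    old_cwd = os.getcwd()
    os.chdir(str(path))
    try:
        yield
    finally:
        os.chdir(old_cwd)
```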
63 changes: 35 additions & 28 deletions Tests/SSL/test_ssl_containers.py
@@ -64,9 +64,14 @@ def default_runner() -> Runner:
yaml_config_file=fixed_paths.SETTINGS_YAML_FILE)


common_test_args = ["", "--is_debug_model=True", "--num_epochs=1", "--ssl_training_batch_size=10",
common_test_args = ["",
"--is_debug_model=True",
"--num_epochs=1",
"--ssl_training_batch_size=10",
"--linear_head_batch_size=5",
"--num_workers=0"]
"--num_workers=0",
"--pl_deterministic"
""]


def _compare_stored_metrics(runner: Runner, expected_metrics: Dict[str, float], abs: float = 1e-5) -> None:
@@ -118,16 +118,17 @@ def test_innereye_ssl_container_cifar10_resnet_simclr() -> None:
assert isinstance(loaded_config.model.encoder.cnn_model, ResNet)

# Check the metrics that were recorded during training
expected_metrics = {
'simclr/train/loss': 3.423144578933716,
'simclr/learning_rate': 0.0,
'ssl_online_evaluator/train/loss': 2.6143882274627686,
'ssl_online_evaluator/train/online_AccuracyAtThreshold05': 0.0,
'epoch_started': 0.0,
'simclr/val/loss': 2.886892795562744,
'ssl_online_evaluator/val/loss': 2.2472469806671143,
'ssl_online_evaluator/val/AccuracyAtThreshold05': 0.20000000298023224
}
# Note: It is possible that after the PyTorch 1.10 upgrade, we can't get parity between local runs and runs on
# the hosted build agents. If that suspicion is confirmed, we need to add branching for local and cloud results.
expected_metrics = {'simclr/val/loss': 2.8736939430236816,
'ssl_online_evaluator/val/loss': 2.268489360809326,
'ssl_online_evaluator/val/AccuracyAtThreshold05': 0.20000000298023224,
'simclr/train/loss': 3.6261844635009766,
'simclr/learning_rate': 0.0,
'ssl_online_evaluator/train/loss': 3.1140503883361816,
'ssl_online_evaluator/train/online_AccuracyAtThreshold05': 0.0,
'epoch_started': 0.0}

_compare_stored_metrics(runner, expected_metrics, abs=5e-5)

# Check that the checkpoint contains both the optimizer for the embedding and for the linear head
@@ -205,22 +205,23 @@ def test_innereye_ssl_container_rsna() -> None:
assert loaded_config.datamodule_args[SSLDataModuleType.ENCODER].augmentation_params.augmentation.use_random_crop
assert loaded_config.datamodule_args[SSLDataModuleType.ENCODER].augmentation_params.augmentation.use_random_affine

expected_metrics = {
'byol/train/loss': 0.00401744619011879,
'byol/tau': 0.9899999499320984,
'byol/learning_rate/0/0': 0.0,
'byol/learning_rate/0/1': 0.0,
'ssl_online_evaluator/train/loss': 0.685592532157898,
'ssl_online_evaluator/train/online_AreaUnderRocCurve': 0.5,
'ssl_online_evaluator/train/online_AreaUnderPRCurve': 0.699999988079071,
'ssl_online_evaluator/train/online_AccuracyAtThreshold05': 0.4000000059604645,
'epoch_started': 0.0,
'byol/val/loss': -0.07644838094711304,
'ssl_online_evaluator/val/loss': 0.6965796947479248,
'ssl_online_evaluator/val/AreaUnderRocCurve': math.nan,
'ssl_online_evaluator/val/AreaUnderPRCurve': math.nan,
'ssl_online_evaluator/val/AccuracyAtThreshold05': 0.0
}
# Note: It is possible that after the PyTorch 1.10 upgrade, we can't get parity between local runs and runs on
# the hosted build agents. If that suspicion is confirmed, we need to add branching for local and cloud results.
expected_metrics = {'byol/val/loss': -0.07644861936569214,
'ssl_online_evaluator/val/loss': 0.6963790059089661,
'ssl_online_evaluator/val/AreaUnderRocCurve': math.nan,
'ssl_online_evaluator/val/AreaUnderPRCurve': math.nan,
'ssl_online_evaluator/val/AccuracyAtThreshold05': 0.0,
'byol/train/loss': 0.004017443861812353,
'byol/tau': 0.9899999499320984,
'byol/learning_rate/0/0': 0.0,
'byol/learning_rate/0/1': 0.0,
'ssl_online_evaluator/train/loss': 0.6938587427139282,
'ssl_online_evaluator/train/online_AreaUnderRocCurve': 0.5,
'ssl_online_evaluator/train/online_AreaUnderPRCurve': 0.6000000238418579,
'ssl_online_evaluator/train/online_AccuracyAtThreshold05': 0.20000000298023224,
'epoch_started': 0.0}

_compare_stored_metrics(runner, expected_metrics)

# Check that we are able to load the checkpoint and create classifier model
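The expected-metrics dictionaries above are re-baselined for PyTorch 1.10 and checked by `_compare_stored_metrics` within an absolute tolerance. A hedged sketch of such a comparison, including the NaN entries for the AUC metrics on the tiny validation set (how the runner's stored metrics are retrieved is omitted here):

```python
# Hedged sketch of comparing recorded metrics against an expected baseline,
# treating NaN entries (e.g. AUC when only one class is present) explicitly.
import math
from typing import Dict

def compare_metrics(actual: Dict[str, float], expected: Dict[str, float], abs_tol: float = 1e-5) -> None:
    for name, expected_value in expected.items():
        assert name in actual, f"Missing metric: {name}"
        actual_value = actual[name]
        if math.isnan(expected_value):
            assert math.isnan(actual_value), f"{name}: expected NaN, got {actual_value}"
        else:
            assert abs(actual_value - expected_value) <= abs_tol, \
                f"{name}: got {actual_value}, expected {expected_value} (abs tolerance {abs_tol})"
```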
7 changes: 3 additions & 4 deletions environment.yml
@@ -7,9 +7,9 @@ dependencies:
- cudatoolkit=11.1
- pip=20.1.1
- python=3.7.3
- pytorch=1.8.0
- pytorch=1.10.0
- python-blosc=1.7.0
- torchvision=0.9.0
- torchvision=0.11.1
- pip:
- git+https://github.com/analysiscenter/radio.git@6d53e25#egg=radio
- azure-mgmt-resource==12.1.0
@@ -31,7 +31,7 @@ dependencies:
- joblib==0.16.0
- jupyter==1.0.0
- jupyter-client==6.1.5
- lightning-bolts==0.3.4
- lightning-bolts==0.4.0
- matplotlib==3.3.0
- mlflow==1.17.0
- monai==0.6.0
@@ -68,7 +68,6 @@ dependencies:
- tabulate==0.8.7
- tensorboard==2.3.0
- tensorboardX==2.1
- torchprof==1.3.3
- torchmetrics==0.6.0
- umap-learn==0.5.2
- yacs==0.1.8
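
After recreating the Conda environment, a quick runtime check that the new pins are actually in effect (the version prefixes below match the pins in this file; CUDA availability depends on the machine):

```python
# Hedged sanity check for the upgraded environment.
import torch
import torchvision

print(torch.__version__, torchvision.__version__, "CUDA:", torch.cuda.is_available())
assert torch.__version__.startswith("1.10"), torch.__version__
assert torchvision.__version__.startswith("0.11"), torchvision.__version__
```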