
Fix for stuck test set inference for LightningContainer models (#494)
Jobs got stuck if they used PyTorch Lightning's built-in metrics objects, which then tried to synchronize across GPUs. Resolved by shutting down torch.distributed before running test set inference.
ant0nsc authored Jun 17, 2021
1 parent 227bb7d commit 9749954
Showing 7 changed files with 67 additions and 21 deletions.
2 changes: 2 additions & 0 deletions CHANGELOG.md
@@ -22,6 +22,8 @@ created.
- ([#475](https://github.com/microsoft/InnerEye-DeepLearning/pull/475)) Bug in AML SDK meant that we could not train
any large models anymore because data loaders ran out of memory.
- ([#472](https://github.com/microsoft/InnerEye-DeepLearning/pull/472)) Correct model path for moving ensemble models.
- ([#494](https://github.com/microsoft/InnerEye-DeepLearning/pull/494)) Fix an issue where multi-node jobs for
LightningContainer models can get stuck at test set inference.

### Removed

14 changes: 11 additions & 3 deletions InnerEye/Azure/azure_config.py
@@ -332,9 +332,17 @@ def set_script_params_except_submit_flag(self) -> None:
arg = args[i]
if arg.startswith(submit_flag):
if len(arg) == len(submit_flag):
# The argument list contains something like ["--azureml", "True"]: Skip 2 entries
i = i + 1
elif arg[len(submit_flag)] != "=":
# The commandline argument is "--azureml", with something possibly following: This can either be
# "--azureml True" or "--azureml --some_other_param"
if i < (len(args) - 1):
# If the next argument starts with a "-" then assume that it does not belong to the --azureml
# flag. If there is no "-", assume it belongs to the --azureml flag, and skip both
if not args[i + 1].startswith("-"):
i = i + 1
elif arg[len(submit_flag)] == "=":
# The commandline argument is "--azureml=True" or "--azureml=False": Continue with next arg
pass
else:
# The argument list contains a flag like "--azureml_foo": Keep that.
retained_args.append(arg)
else:
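For illustration, the stripping logic above can be exercised outside the class. Below is a minimal standalone sketch; the function name `strip_submit_flag` and the hard-coded `--azureml` default are assumptions for the example, not part of the repository's API.

```python
from typing import List

def strip_submit_flag(args: List[str], flag: str = "--azureml") -> List[str]:
    """Minimal sketch of the argument-stripping logic above (illustrative only)."""
    retained: List[str] = []
    i = 0
    while i < len(args):
        arg = args[i]
        if arg.startswith(flag):
            if len(arg) == len(flag):
                # Bare "--azureml": also swallow a following value such as "True",
                # but keep a following option such as "--some_other_param".
                if i < len(args) - 1 and not args[i + 1].startswith("-"):
                    i += 1
            elif arg[len(flag)] == "=":
                # "--azureml=True" / "--azureml=False": drop the whole token.
                pass
            else:
                # A different flag that merely shares the prefix, e.g. "--azureml_foo": keep it.
                retained.append(arg)
        else:
            retained.append(arg)
        i += 1
    return retained

# Example: only the submit flag (and its value) is removed.
print(strip_submit_flag(["some", "--param", "1", "--azureml", "True", "--more"]))
# -> ['some', '--param', '1', '--more']
```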
18 changes: 15 additions & 3 deletions InnerEye/ML/configs/other/HelloContainer.py
@@ -11,6 +11,7 @@
from torch.optim import Adam, Optimizer
from torch.optim.lr_scheduler import StepLR, _LRScheduler
from torch.utils.data import DataLoader, Dataset
from pytorch_lightning.metrics import MeanAbsoluteError

from InnerEye.Common import fixed_paths_for_tests
from InnerEye.ML.lightning_container import LightningContainer
@@ -75,6 +76,7 @@ def __init__(self) -> None:
super().__init__()
self.model = torch.nn.Linear(in_features=1, out_features=1, bias=True)
self.test_mse: List[torch.Tensor] = []
self.test_mae = MeanAbsoluteError()

def forward(self, x: torch.Tensor) -> torch.Tensor: # type: ignore
"""
@@ -142,6 +144,7 @@ def on_test_epoch_start(self) -> None:
test set (that is done in the test_step).
"""
self.test_mse = []
self.test_mae.reset()

def test_step(self, batch: Dict[str, torch.Tensor], batch_idx: int) -> torch.Tensor: # type: ignore
"""
@@ -153,8 +156,15 @@ def test_step(self, batch: Dict[str, torch.Tensor], batch_idx: int) -> torch.Tensor:
:param batch_idx: The index (0, 1, ...) of the batch when the data loader is enumerated.
:return: The loss on the test data.
"""
loss = self.shared_step(batch)
input = batch["x"]
target = batch["y"]
prediction = self.forward(input)
# This illustrates two ways of computing metrics. First, using standard torch functions:
loss = torch.nn.functional.mse_loss(prediction, target)
self.test_mse.append(loss)
# Metrics computed using PyTorch Lightning objects. Note that these will, by default, attempt
# to synchronize across GPUs.
self.test_mae.update(preds=prediction, target=target)
return loss

def on_test_epoch_end(self) -> None:
@@ -166,6 +176,7 @@ def on_test_epoch_end(self) -> None:
"""
average_mse = torch.mean(torch.stack(self.test_mse))
Path("test_mse.txt").write_text(str(average_mse.item()))
Path("test_mae.txt").write_text(str(self.test_mae.compute()))


class HelloContainer(LightningContainer):
@@ -196,7 +207,8 @@ def get_data_module(self) -> LightningDataModule:
# training, and cook them into a nice looking report. Here, the report is a simple text file.
def create_report(self) -> None:
# This just prints out the test MSE, but you could also generate a Jupyter notebook here, for example.
test_mse = float(Path("test_mse.txt").read_text())
report = f"Performance on test set: MSE = {test_mse}"
test_mse = Path("test_mse.txt").read_text().strip()
test_mae = Path("test_mae.txt").read_text().strip()
report = f"Performance on test set: MSE = {test_mse}, MAE = {test_mae}"
print(report)
Path("report.txt").write_text(report)
4 changes: 4 additions & 0 deletions InnerEye/ML/run_ml.py
@@ -447,6 +447,10 @@ def run_inference_for_lightning_models(self, checkpoint_paths: List[Path]) -> None:
# searching for Horovod
if ENV_OMPI_COMM_WORLD_RANK in os.environ:
del os.environ[ENV_OMPI_COMM_WORLD_RANK]
# From the training setup, torch still thinks that it should run in a distributed manner,
# and would block on some GPU operations. Hence, clean up distributed training.
if torch.distributed.is_initialized():
torch.distributed.destroy_process_group()
trainer, _ = create_lightning_trainer(self.container, num_nodes=1)
# When training models that are not built-in InnerEye models, we have no guarantee that they write
# files to the right folder. Best guess is to change the current working directory to where files should go.
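The same teardown pattern applies to any script that trains with DDP and then continues with single-process work. A hedged, self-contained sketch is shown below; the helper name `cleanup_after_ddp_training` is illustrative, and `OMPI_COMM_WORLD_RANK` is assumed to be the variable that the `ENV_OMPI_COMM_WORLD_RANK` constant refers to.

```python
import os
import torch

def cleanup_after_ddp_training() -> None:
    """Sketch: make a process that took part in DDP training safe for single-process inference."""
    # Remove the MPI rank variable so downstream tooling (e.g. Horovod detection)
    # does not conclude that the job is still running distributed.
    os.environ.pop("OMPI_COMM_WORLD_RANK", None)
    # Tear down the default process group; otherwise collective operations such as
    # metric synchronization block while waiting for peers that no longer exist.
    if torch.distributed.is_initialized():
        torch.distributed.destroy_process_group()
```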
9 changes: 3 additions & 6 deletions InnerEye/ML/runner.py
@@ -288,12 +288,9 @@ def run_in_situ(self) -> None:
pytest_failures = f"Not all PyTest tests passed. See {results_file_path}"
raise ValueError(pytest_failures)
else:
# Set environment variables for multi-node training if needed.
# In particular, the multi-node environment variables should NOT be set in single node
# training, otherwise this might lead to errors with the c10 distributed backend
# (https://github.com/microsoft/InnerEye-DeepLearning/issues/395)
if self.azure_config.num_nodes > 1:
set_environment_variables_for_multi_node()
# Set environment variables for multi-node training if needed. This function will terminate early
# if it detects that it is not in a multi-node environment.
set_environment_variables_for_multi_node()
ml_runner = self.create_ml_runner()
ml_runner.setup()
ml_runner.start_logging_to_file()
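For context, the early-exit behaviour described in the new comment could look roughly like the sketch below. The AzureML environment variable names and the default port are assumptions for illustration, not necessarily what `set_environment_variables_for_multi_node` actually reads.

```python
import logging
import os

def set_environment_variables_for_multi_node() -> None:
    """Illustrative sketch of the early-exit behaviour; variable names are assumptions."""
    master_node = os.environ.get("AZ_BATCHAI_MPI_MASTER_NODE")
    if master_node is None:
        # Not a multi-node AzureML job: leave the environment untouched, because setting
        # distributed variables in single-node runs can break the c10 backend (issue #395).
        logging.info("Not a multi-node job, skipping distributed environment setup.")
        return
    os.environ["MASTER_ADDR"] = master_node
    os.environ.setdefault("MASTER_PORT", "6105")
    os.environ["NODE_RANK"] = os.environ.get("OMPI_COMM_WORLD_RANK", "0")
```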
19 changes: 13 additions & 6 deletions Tests/Azure/test_parsing.py
@@ -146,16 +146,23 @@ def assert_has_params(expected_args: str) -> None:
# Arguments are in the keys of the dictionary only, and should have been added in the right order
assert " ".join(s.script_params) == expected_args

with mock.patch("sys.argv", ["", "some", "--param", "1", f"--{AZURECONFIG_SUBMIT_TO_AZUREML}=True", "more"]):
with mock.patch("sys.argv", ["", "some", "--param", "1", f"--{AZURECONFIG_SUBMIT_TO_AZUREML}=True", "--more"]):
s.set_script_params_except_submit_flag()
assert_has_params("some --param 1 more")
with mock.patch("sys.argv", ["", "some", "--param", "1", f"--{AZURECONFIG_SUBMIT_TO_AZUREML}", "False", "more"]):
assert_has_params("some --param 1 --more")
with mock.patch("sys.argv", ["", "some", "--param", "1", f"--{AZURECONFIG_SUBMIT_TO_AZUREML}", "False", "--more"]):
s.set_script_params_except_submit_flag()
assert_has_params("some --param 1 more")
assert_has_params("some --param 1 --more")
# Using the new syntax for boolean flags
with mock.patch("sys.argv", ["", "some", "--param", "1", f"--{AZURECONFIG_SUBMIT_TO_AZUREML}", "--more"]):
s.set_script_params_except_submit_flag()
assert_has_params("some --param 1 --more")
with mock.patch("sys.argv", ["", "some", "--param", "1", f"--{AZURECONFIG_SUBMIT_TO_AZUREML}"]):
s.set_script_params_except_submit_flag()
assert_has_params("some --param 1")
# Arguments where azureml is just the prefix should not be removed.
with mock.patch("sys.argv", ["", "some", f"--{AZURECONFIG_SUBMIT_TO_AZUREML}foo", "False", "more"]):
with mock.patch("sys.argv", ["", "some", f"--{AZURECONFIG_SUBMIT_TO_AZUREML}foo", "False", "--more"]):
s.set_script_params_except_submit_flag()
assert_has_params(f"some --{AZURECONFIG_SUBMIT_TO_AZUREML}foo False more")
assert_has_params(f"some --{AZURECONFIG_SUBMIT_TO_AZUREML}foo False --more")


@pytest.mark.parametrize(["s", "expected"],
22 changes: 19 additions & 3 deletions docs/bring_your_own_model.md
@@ -62,13 +62,17 @@ class MyDataModule(LightningDataModule):
# All data should be read from the folder given in self.root_path
self.root_path = root_path
def train_dataloader(self, *args, **kwargs) -> DataLoader:
...
# The data should be read off self.root_path
train_dataset = ...
return DataLoader(train_dataset, batch_size=5, num_workers=5)
def val_dataloader(self, *args, **kwargs) -> DataLoader:
# The data should be read off self.root_path
...
val_dataset = ...
return DataLoader(val_dataset, batch_size=5, num_workers=5)
def test_dataloader(self, *args, **kwargs) -> DataLoader:
# The data should be read off self.root_path
...
test_dataset = ...
return DataLoader(test_dataset, batch_size=5, num_workers=5)

class MyContainer(LightningContainer):
def __init__(self):
@@ -97,6 +101,18 @@ In the above example, training is done for 42 epochs. After the model is trained
via PyTorch Lightning's [built-in test functionality](https://pytorch-lightning.readthedocs.io/en/latest/common/trainer.html?highlight=trainer.test#test).
See below for an alternative way of running the evaluation on the test set.
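In plain PyTorch Lightning terms, that built-in evaluation corresponds roughly to the sketch below; `model` and `data_module` are hypothetical placeholders for the objects that InnerEye creates from the container.

```python
from pytorch_lightning import Trainer

model = ...          # a LightningModule, e.g. the one returned by create_model()
data_module = ...    # a LightningDataModule that provides a test_dataloader()

trainer = Trainer(max_epochs=42)
trainer.fit(model, datamodule=data_module)
# Runs test_step / on_test_epoch_end over the test split:
trainer.test(model, datamodule=data_module)
```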

### Data loaders
The example above creates `DataLoader` objects from a dataset. When creating those, you need to specify a batch size
(how many samples from your dataset will go into one minibatch), and a number of worker processes. Note that, by
default, data loading will happen in the main process, meaning that your GPU will sit idle while the CPU reads data
from disk. When you specify a number of workers, the data loader spawns background processes that pre-fetch data
from disk and put it into a queue, ready for the GPU to pick up as soon as it has finished the current minibatch.

For more details, please see the documentation for
[DataLoader](https://pytorch.org/docs/stable/data.html#torch.utils.data.DataLoader). There is also a
[tutorial describing the foundations of datasets and
data loaders](https://pytorch.org/tutorials/beginner/basics/data_tutorial.html).
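As a concrete illustration of those two parameters, the sketch below builds a `DataLoader` over a tiny in-memory dataset; the synthetic data is purely for the example.

```python
import torch
from torch.utils.data import DataLoader, TensorDataset

if __name__ == "__main__":
    # 100 noisy (x, y) pairs as a stand-in for a dataset that would be read from self.root_path.
    x = torch.rand(100, 1)
    dataset = TensorDataset(x, 2.0 * x + 0.05 * torch.randn(100, 1))

    # batch_size: how many samples go into one minibatch.
    # num_workers=5: five background processes pre-fetch batches so the GPU
    # does not wait for the main process to read data from disk.
    loader = DataLoader(dataset, batch_size=5, num_workers=5, shuffle=True)

    for features, targets in loader:
        pass  # the training step would consume the batch here
```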

### Outputting files during training

The Lightning model returned by `create_model` needs to write its output files to the current working directory.
