From 879f52c69ac722c67daa6e1b42677c2389da174d Mon Sep 17 00:00:00 2001 From: SeanNaren Date: Tue, 9 Nov 2021 15:55:53 +0000 Subject: [PATCH 1/9] Try to infer logging batch size, else rollback --- .../plugins/training_type/deepspeed.py | 23 ++++++++++--------- tests/plugins/test_deepspeed_plugin.py | 10 ++++---- 2 files changed, 18 insertions(+), 15 deletions(-) diff --git a/pytorch_lightning/plugins/training_type/deepspeed.py b/pytorch_lightning/plugins/training_type/deepspeed.py index 2464a8ba4eeca..448e658002dc2 100644 --- a/pytorch_lightning/plugins/training_type/deepspeed.py +++ b/pytorch_lightning/plugins/training_type/deepspeed.py @@ -462,9 +462,7 @@ def init_deepspeed(self): if self.zero_stage_3 and self.partition_module: # Ensure the entire model has been moved to the appropriate device dtype = torch.float16 if self.precision in (16, "mixed") else torch.float32 - deepspeed.zero.Init( - module=model, remote_device=self.remote_device, pin_memory=True, config=self.config, dtype=dtype - ) + deepspeed.zero.Init(module=model, pin_memory=True, config=self.config, dtype=dtype) if self.lightning_module.trainer and self.lightning_module.trainer.training: self._initialize_deepspeed_train(model) @@ -618,11 +616,6 @@ def _format_batch_size_and_grad_accum_config(self): ) self.config["gradient_accumulation_steps"] = self.lightning_module.trainer.accumulate_grad_batches if "train_micro_batch_size_per_gpu" not in self.config: - rank_zero_warn( - "Inferring the batch size for internal deepspeed logging from the `train_dataloader()`. " - "If you require skipping this, please pass " - "`Trainer(strategy=DeepSpeedPlugin(logging_batch_size_per_gpu=batch_size))`" - ) batch_size = self._auto_select_batch_size() self.config["train_micro_batch_size_per_gpu"] = batch_size if "gradient_clipping" not in self.config: @@ -634,9 +627,17 @@ def _auto_select_batch_size(self): batch_size = 1 train_dl_source = self.lightning_module.trainer._data_connector._train_dataloader_source if train_dl_source.is_defined(): - train_dataloader = train_dl_source.dataloader() - if hasattr(train_dataloader, "batch_sampler"): - batch_size = train_dataloader.batch_sampler.batch_size + try: + train_dataloader = train_dl_source.dataloader() + if hasattr(train_dataloader, "batch_sampler"): + batch_size = train_dataloader.batch_sampler.batch_size + except Exception: + if deepspeed.utils.logging.logger.level < logging.WARN: + rank_zero_warn( + "Tried to Infer the batch size for internal deepspeed logging from the `train_dataloader()`. 
" + "To ensure DeepSpeed logging remains correct, please manually pass the plugin with the" + "batch size, `Trainer(strategy=DeepSpeedPlugin(logging_batch_size_per_gpu=batch_size))`" + ) return batch_size def _format_precision_config(self): diff --git a/tests/plugins/test_deepspeed_plugin.py b/tests/plugins/test_deepspeed_plugin.py index 25f02a4c1eab5..9f93e0e701aa2 100644 --- a/tests/plugins/test_deepspeed_plugin.py +++ b/tests/plugins/test_deepspeed_plugin.py @@ -1,5 +1,6 @@ import contextlib import json +import logging import os from typing import Any, Dict, Optional from unittest import mock @@ -887,9 +888,9 @@ def test_deepspeed_warn_train_dataloader_called(tmpdir): trainer.fit(model) -@RunIf(min_gpus=1, deepspeed=True, special=True) +@RunIf(min_gpus=1, deepspeed=True, special=False) def test_deepspeed_setup_train_dataloader(tmpdir): - """Test DeepSpeed works when setup is required to call, and the user passes the batch size manually.""" + """Test DeepSpeed works when setup is required to call in the DataModule.""" class TestSetupIsCalledDataModule(LightningDataModule): def __init__(self): @@ -914,12 +915,13 @@ def test_dataloader(self): model = BoringModel() trainer = Trainer( default_root_dir=tmpdir, - strategy=DeepSpeedPlugin(logging_batch_size_per_gpu=32), + strategy=DeepSpeedPlugin(logging_level=logging.INFO), gpus=1, fast_dev_run=True, ) dm = TestSetupIsCalledDataModule() - trainer.fit(model, datamodule=dm) + with pytest.warns(UserWarning, match="Tried to Infer the batch size for internal deepspeed logging"): + trainer.fit(model, datamodule=dm) trainer.test(model, datamodule=dm) From 661c4c14f13e14e441adc37515a45b7cdd9a2512 Mon Sep 17 00:00:00 2001 From: SeanNaren Date: Tue, 9 Nov 2021 15:58:46 +0000 Subject: [PATCH 2/9] Add CHANGELOG.md --- CHANGELOG.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index a786af1c164e7..59667590441c4 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -28,7 +28,7 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). 
- Raise exception in `init_dist_connection()` when torch distibuted is not available ([#10418](https://github.com/PyTorchLightning/pytorch-lightning/issues/10418)) -- +- Do not fail if batch size could not be inferred for logging when using DeepSpeed ([#10438](https://github.com/PyTorchLightning/pytorch-lightning/issues/10438)) - From b3c31065975fdab783d0f5fbdb3c0ae4c36dc37d Mon Sep 17 00:00:00 2001 From: SeanNaren Date: Tue, 9 Nov 2021 16:34:41 +0000 Subject: [PATCH 3/9] Woops --- pytorch_lightning/plugins/training_type/deepspeed.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/pytorch_lightning/plugins/training_type/deepspeed.py b/pytorch_lightning/plugins/training_type/deepspeed.py index 448e658002dc2..de3e46e444882 100644 --- a/pytorch_lightning/plugins/training_type/deepspeed.py +++ b/pytorch_lightning/plugins/training_type/deepspeed.py @@ -462,7 +462,9 @@ def init_deepspeed(self): if self.zero_stage_3 and self.partition_module: # Ensure the entire model has been moved to the appropriate device dtype = torch.float16 if self.precision in (16, "mixed") else torch.float32 - deepspeed.zero.Init(module=model, pin_memory=True, config=self.config, dtype=dtype) + deepspeed.zero.Init( + module=model, remote_device=self.remote_device, pin_memory=True, config=self.config, dtype=dtype + ) if self.lightning_module.trainer and self.lightning_module.trainer.training: self._initialize_deepspeed_train(model) From 05bc2473970e4c9b16997f99ef21a3d6e53d7210 Mon Sep 17 00:00:00 2001 From: Sean Naren Date: Tue, 9 Nov 2021 22:01:12 +0000 Subject: [PATCH 4/9] Update pytorch_lightning/plugins/training_type/deepspeed.py MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Adrian Wälchli --- pytorch_lightning/plugins/training_type/deepspeed.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pytorch_lightning/plugins/training_type/deepspeed.py b/pytorch_lightning/plugins/training_type/deepspeed.py index de3e46e444882..47843e34c37d0 100644 --- a/pytorch_lightning/plugins/training_type/deepspeed.py +++ b/pytorch_lightning/plugins/training_type/deepspeed.py @@ -637,8 +637,8 @@ def _auto_select_batch_size(self): if deepspeed.utils.logging.logger.level < logging.WARN: rank_zero_warn( "Tried to Infer the batch size for internal deepspeed logging from the `train_dataloader()`. " - "To ensure DeepSpeed logging remains correct, please manually pass the plugin with the" - "batch size, `Trainer(strategy=DeepSpeedPlugin(logging_batch_size_per_gpu=batch_size))`" + "To ensure DeepSpeed logging remains correct, please manually pass the plugin with the " + "batch size, `Trainer(strategy=DeepSpeedPlugin(logging_batch_size_per_gpu=batch_size))`." 
) return batch_size From 06f5bde2efdb3726d6b40de406da57c077e93552 Mon Sep 17 00:00:00 2001 From: Carlos Mocholi Date: Wed, 10 Nov 2021 05:02:33 +0100 Subject: [PATCH 5/9] Configure our logger level --- pytorch_lightning/plugins/training_type/deepspeed.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/pytorch_lightning/plugins/training_type/deepspeed.py b/pytorch_lightning/plugins/training_type/deepspeed.py index 47843e34c37d0..b525bea95da96 100644 --- a/pytorch_lightning/plugins/training_type/deepspeed.py +++ b/pytorch_lightning/plugins/training_type/deepspeed.py @@ -312,6 +312,7 @@ def __init__( ) self._config_initialized = False deepspeed.utils.logging.logger.setLevel(logging_level) + pl._logger.setLevel(logging_level) self.remote_device = remote_device self.load_full_weights = load_full_weights @@ -634,12 +635,11 @@ def _auto_select_batch_size(self): if hasattr(train_dataloader, "batch_sampler"): batch_size = train_dataloader.batch_sampler.batch_size except Exception: - if deepspeed.utils.logging.logger.level < logging.WARN: - rank_zero_warn( - "Tried to Infer the batch size for internal deepspeed logging from the `train_dataloader()`. " - "To ensure DeepSpeed logging remains correct, please manually pass the plugin with the " - "batch size, `Trainer(strategy=DeepSpeedPlugin(logging_batch_size_per_gpu=batch_size))`." - ) + rank_zero_warn( + "Tried to Infer the batch size for internal deepspeed logging from the `train_dataloader()`. " + "To ensure DeepSpeed logging remains correct, please manually pass the plugin with the " + "batch size, `Trainer(strategy=DeepSpeedPlugin(logging_batch_size_per_gpu=batch_size))`." + ) return batch_size def _format_precision_config(self): From 01e638b8dfd080d662a65e650b3f5adde19d3b71 Mon Sep 17 00:00:00 2001 From: Carlos Mocholi Date: Wed, 10 Nov 2021 05:05:57 +0100 Subject: [PATCH 6/9] Add comment --- pytorch_lightning/plugins/training_type/deepspeed.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/pytorch_lightning/plugins/training_type/deepspeed.py b/pytorch_lightning/plugins/training_type/deepspeed.py index b525bea95da96..3a5e0dfdc9091 100644 --- a/pytorch_lightning/plugins/training_type/deepspeed.py +++ b/pytorch_lightning/plugins/training_type/deepspeed.py @@ -634,6 +634,8 @@ def _auto_select_batch_size(self): train_dataloader = train_dl_source.dataloader() if hasattr(train_dataloader, "batch_sampler"): batch_size = train_dataloader.batch_sampler.batch_size + # broad exception on purpose as `source.dataloader()` will fail if the dataloader requires `setup` + # to have been called before except Exception: rank_zero_warn( "Tried to Infer the batch size for internal deepspeed logging from the `train_dataloader()`. 
" From bbe27b75c3d71b48111a43ca5a9c1668dff3a88b Mon Sep 17 00:00:00 2001 From: SeanNaren Date: Mon, 15 Nov 2021 15:31:32 +0000 Subject: [PATCH 7/9] Revert "Configure our logger level" This reverts commit 06f5bde2 --- pytorch_lightning/plugins/training_type/deepspeed.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/pytorch_lightning/plugins/training_type/deepspeed.py b/pytorch_lightning/plugins/training_type/deepspeed.py index 3a5e0dfdc9091..ab6c7ef223962 100644 --- a/pytorch_lightning/plugins/training_type/deepspeed.py +++ b/pytorch_lightning/plugins/training_type/deepspeed.py @@ -312,7 +312,6 @@ def __init__( ) self._config_initialized = False deepspeed.utils.logging.logger.setLevel(logging_level) - pl._logger.setLevel(logging_level) self.remote_device = remote_device self.load_full_weights = load_full_weights @@ -637,11 +636,12 @@ def _auto_select_batch_size(self): # broad exception on purpose as `source.dataloader()` will fail if the dataloader requires `setup` # to have been called before except Exception: - rank_zero_warn( - "Tried to Infer the batch size for internal deepspeed logging from the `train_dataloader()`. " - "To ensure DeepSpeed logging remains correct, please manually pass the plugin with the " - "batch size, `Trainer(strategy=DeepSpeedPlugin(logging_batch_size_per_gpu=batch_size))`." - ) + if deepspeed.utils.logging.logger.level < logging.WARN: + rank_zero_warn( + "Tried to Infer the batch size for internal deepspeed logging from the `train_dataloader()`. " + "To ensure DeepSpeed logging remains correct, please manually pass the plugin with the " + "batch size, `Trainer(strategy=DeepSpeedPlugin(logging_batch_size_per_gpu=batch_size))`." + ) return batch_size def _format_precision_config(self): From f9e1a858ae0510582bd8a27377eba10f896cbed9 Mon Sep 17 00:00:00 2001 From: SeanNaren Date: Mon, 15 Nov 2021 15:53:56 +0000 Subject: [PATCH 8/9] Fix test --- pytorch_lightning/plugins/training_type/deepspeed.py | 6 +++--- tests/plugins/test_deepspeed_plugin.py | 6 +++--- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/pytorch_lightning/plugins/training_type/deepspeed.py b/pytorch_lightning/plugins/training_type/deepspeed.py index ab6c7ef223962..70bc3493675c5 100644 --- a/pytorch_lightning/plugins/training_type/deepspeed.py +++ b/pytorch_lightning/plugins/training_type/deepspeed.py @@ -636,9 +636,9 @@ def _auto_select_batch_size(self): # broad exception on purpose as `source.dataloader()` will fail if the dataloader requires `setup` # to have been called before except Exception: - if deepspeed.utils.logging.logger.level < logging.WARN: - rank_zero_warn( - "Tried to Infer the batch size for internal deepspeed logging from the `train_dataloader()`. " + if self.global_rank == 0: + deepspeed.utils.logging.logger.warning( + "Tried to infer the batch size for internal deepspeed logging from the `train_dataloader()`. " "To ensure DeepSpeed logging remains correct, please manually pass the plugin with the " "batch size, `Trainer(strategy=DeepSpeedPlugin(logging_batch_size_per_gpu=batch_size))`." 
) diff --git a/tests/plugins/test_deepspeed_plugin.py b/tests/plugins/test_deepspeed_plugin.py index 9f93e0e701aa2..bb8f530338133 100644 --- a/tests/plugins/test_deepspeed_plugin.py +++ b/tests/plugins/test_deepspeed_plugin.py @@ -888,7 +888,7 @@ def test_deepspeed_warn_train_dataloader_called(tmpdir): trainer.fit(model) -@RunIf(min_gpus=1, deepspeed=True, special=False) +@RunIf(min_gpus=1, deepspeed=True, special=True) def test_deepspeed_setup_train_dataloader(tmpdir): """Test DeepSpeed works when setup is required to call in the DataModule.""" @@ -920,9 +920,9 @@ def test_dataloader(self): fast_dev_run=True, ) dm = TestSetupIsCalledDataModule() - with pytest.warns(UserWarning, match="Tried to Infer the batch size for internal deepspeed logging"): + with mock.patch("deepspeed.utils.logging.logger.warning", autospec=True) as mock_object: trainer.fit(model, datamodule=dm) - trainer.test(model, datamodule=dm) + assert any("Tried to infer the batch size" in str(arg) for arg in mock_object.call_args_list) @mock.patch("torch.optim.lr_scheduler.StepLR.step", autospec=True) From c8ce0945adddf182ed8abf1d0d048e9422d14430 Mon Sep 17 00:00:00 2001 From: SeanNaren Date: Mon, 15 Nov 2021 16:48:30 +0000 Subject: [PATCH 9/9] Remove test --- tests/plugins/test_deepspeed_plugin.py | 15 --------------- 1 file changed, 15 deletions(-) diff --git a/tests/plugins/test_deepspeed_plugin.py b/tests/plugins/test_deepspeed_plugin.py index bb8f530338133..b35339487dac1 100644 --- a/tests/plugins/test_deepspeed_plugin.py +++ b/tests/plugins/test_deepspeed_plugin.py @@ -873,21 +873,6 @@ def training_step(self, batch, batch_idx): trainer.fit(model) -@RunIf(min_gpus=1, deepspeed=True, special=True) -def test_deepspeed_warn_train_dataloader_called(tmpdir): - """Test DeepSpeed warns when it calls ``lightning_module.train_dataloader`` internally for logging batch - size.""" - model = BoringModel() - trainer = Trainer( - default_root_dir=tmpdir, - strategy=DeepSpeedPlugin(), - gpus=1, - fast_dev_run=True, - ) - with pytest.warns(UserWarning, match="Inferring the batch size for internal deepspeed logging"): - trainer.fit(model) - - @RunIf(min_gpus=1, deepspeed=True, special=True) def test_deepspeed_setup_train_dataloader(tmpdir): """Test DeepSpeed works when setup is required to call in the DataModule."""
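
The user-facing takeaway of this series is the fallback path: when the training dataloader cannot be inspected (for example because it is only built in `setup()`), the logging batch size defaults to 1 and the message added here points users at `logging_batch_size_per_gpu`. Below is a minimal usage sketch of that recommendation, not part of the patches themselves: the import paths assume the PyTorch Lightning 1.5-era layout used in this PR, and the batch size of 32 is only an illustrative value.

import logging

from pytorch_lightning import Trainer
from pytorch_lightning.plugins import DeepSpeedPlugin

# Pass the batch size explicitly so DeepSpeed's internal logging never has to
# infer it from `train_dataloader()`; `logging_level` mirrors the plugin
# argument exercised in the tests above and controls DeepSpeed's logger level.
trainer = Trainer(
    strategy=DeepSpeedPlugin(logging_batch_size_per_gpu=32, logging_level=logging.INFO),
    gpus=1,
)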