From 879f52c69ac722c67daa6e1b42677c2389da174d Mon Sep 17 00:00:00 2001 From: SeanNaren Date: Tue, 9 Nov 2021 15:55:53 +0000 Subject: [PATCH 1/9] Try to infer logging batch size, else rollback --- .../plugins/training_type/deepspeed.py | 23 ++++++++++--------- tests/plugins/test_deepspeed_plugin.py | 10 ++++---- 2 files changed, 18 insertions(+), 15 deletions(-) diff --git a/pytorch_lightning/plugins/training_type/deepspeed.py b/pytorch_lightning/plugins/training_type/deepspeed.py index 2464a8ba4eeca..448e658002dc2 100644 --- a/pytorch_lightning/plugins/training_type/deepspeed.py +++ b/pytorch_lightning/plugins/training_type/deepspeed.py @@ -462,9 +462,7 @@ def init_deepspeed(self): if self.zero_stage_3 and self.partition_module: # Ensure the entire model has been moved to the appropriate device dtype = torch.float16 if self.precision in (16, "mixed") else torch.float32 - deepspeed.zero.Init( - module=model, remote_device=self.remote_device, pin_memory=True, config=self.config, dtype=dtype - ) + deepspeed.zero.Init(module=model, pin_memory=True, config=self.config, dtype=dtype) if self.lightning_module.trainer and self.lightning_module.trainer.training: self._initialize_deepspeed_train(model) @@ -618,11 +616,6 @@ def _format_batch_size_and_grad_accum_config(self): ) self.config["gradient_accumulation_steps"] = self.lightning_module.trainer.accumulate_grad_batches if "train_micro_batch_size_per_gpu" not in self.config: - rank_zero_warn( - "Inferring the batch size for internal deepspeed logging from the `train_dataloader()`. " - "If you require skipping this, please pass " - "`Trainer(strategy=DeepSpeedPlugin(logging_batch_size_per_gpu=batch_size))`" - ) batch_size = self._auto_select_batch_size() self.config["train_micro_batch_size_per_gpu"] = batch_size if "gradient_clipping" not in self.config: @@ -634,9 +627,17 @@ def _auto_select_batch_size(self): batch_size = 1 train_dl_source = self.lightning_module.trainer._data_connector._train_dataloader_source if train_dl_source.is_defined(): - train_dataloader = train_dl_source.dataloader() - if hasattr(train_dataloader, "batch_sampler"): - batch_size = train_dataloader.batch_sampler.batch_size + try: + train_dataloader = train_dl_source.dataloader() + if hasattr(train_dataloader, "batch_sampler"): + batch_size = train_dataloader.batch_sampler.batch_size + except Exception: + if deepspeed.utils.logging.logger.level < logging.WARN: + rank_zero_warn( + "Tried to Infer the batch size for internal deepspeed logging from the `train_dataloader()`. 
" + "To ensure DeepSpeed logging remains correct, please manually pass the plugin with the" + "batch size, `Trainer(strategy=DeepSpeedPlugin(logging_batch_size_per_gpu=batch_size))`" + ) return batch_size def _format_precision_config(self): diff --git a/tests/plugins/test_deepspeed_plugin.py b/tests/plugins/test_deepspeed_plugin.py index 25f02a4c1eab5..9f93e0e701aa2 100644 --- a/tests/plugins/test_deepspeed_plugin.py +++ b/tests/plugins/test_deepspeed_plugin.py @@ -1,5 +1,6 @@ import contextlib import json +import logging import os from typing import Any, Dict, Optional from unittest import mock @@ -887,9 +888,9 @@ def test_deepspeed_warn_train_dataloader_called(tmpdir): trainer.fit(model) -@RunIf(min_gpus=1, deepspeed=True, special=True) +@RunIf(min_gpus=1, deepspeed=True, special=False) def test_deepspeed_setup_train_dataloader(tmpdir): - """Test DeepSpeed works when setup is required to call, and the user passes the batch size manually.""" + """Test DeepSpeed works when setup is required to call in the DataModule.""" class TestSetupIsCalledDataModule(LightningDataModule): def __init__(self): @@ -914,12 +915,13 @@ def test_dataloader(self): model = BoringModel() trainer = Trainer( default_root_dir=tmpdir, - strategy=DeepSpeedPlugin(logging_batch_size_per_gpu=32), + strategy=DeepSpeedPlugin(logging_level=logging.INFO), gpus=1, fast_dev_run=True, ) dm = TestSetupIsCalledDataModule() - trainer.fit(model, datamodule=dm) + with pytest.warns(UserWarning, match="Tried to Infer the batch size for internal deepspeed logging"): + trainer.fit(model, datamodule=dm) trainer.test(model, datamodule=dm) From 661c4c14f13e14e441adc37515a45b7cdd9a2512 Mon Sep 17 00:00:00 2001 From: SeanNaren Date: Tue, 9 Nov 2021 15:58:46 +0000 Subject: [PATCH 2/9] Add CHANGELOG.md --- CHANGELOG.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index a786af1c164e7..59667590441c4 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -28,7 +28,7 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). 
- Raise exception in `init_dist_connection()` when torch distibuted is not available ([#10418](https://github.com/PyTorchLightning/pytorch-lightning/issues/10418)) -- +- Do not fail if batch size could not be inferred for logging when using DeepSpeed ([#10438](https://github.com/PyTorchLightning/pytorch-lightning/issues/10438)) - From b3c31065975fdab783d0f5fbdb3c0ae4c36dc37d Mon Sep 17 00:00:00 2001 From: SeanNaren Date: Tue, 9 Nov 2021 16:34:41 +0000 Subject: [PATCH 3/9] Woops --- pytorch_lightning/plugins/training_type/deepspeed.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/pytorch_lightning/plugins/training_type/deepspeed.py b/pytorch_lightning/plugins/training_type/deepspeed.py index 448e658002dc2..de3e46e444882 100644 --- a/pytorch_lightning/plugins/training_type/deepspeed.py +++ b/pytorch_lightning/plugins/training_type/deepspeed.py @@ -462,7 +462,9 @@ def init_deepspeed(self): if self.zero_stage_3 and self.partition_module: # Ensure the entire model has been moved to the appropriate device dtype = torch.float16 if self.precision in (16, "mixed") else torch.float32 - deepspeed.zero.Init(module=model, pin_memory=True, config=self.config, dtype=dtype) + deepspeed.zero.Init( + module=model, remote_device=self.remote_device, pin_memory=True, config=self.config, dtype=dtype + ) if self.lightning_module.trainer and self.lightning_module.trainer.training: self._initialize_deepspeed_train(model) From 05bc2473970e4c9b16997f99ef21a3d6e53d7210 Mon Sep 17 00:00:00 2001 From: Sean Naren Date: Tue, 9 Nov 2021 22:01:12 +0000 Subject: [PATCH 4/9] Update pytorch_lightning/plugins/training_type/deepspeed.py MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Adrian Wälchli --- pytorch_lightning/plugins/training_type/deepspeed.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pytorch_lightning/plugins/training_type/deepspeed.py b/pytorch_lightning/plugins/training_type/deepspeed.py index de3e46e444882..47843e34c37d0 100644 --- a/pytorch_lightning/plugins/training_type/deepspeed.py +++ b/pytorch_lightning/plugins/training_type/deepspeed.py @@ -637,8 +637,8 @@ def _auto_select_batch_size(self): if deepspeed.utils.logging.logger.level < logging.WARN: rank_zero_warn( "Tried to Infer the batch size for internal deepspeed logging from the `train_dataloader()`. " - "To ensure DeepSpeed logging remains correct, please manually pass the plugin with the" - "batch size, `Trainer(strategy=DeepSpeedPlugin(logging_batch_size_per_gpu=batch_size))`" + "To ensure DeepSpeed logging remains correct, please manually pass the plugin with the " + "batch size, `Trainer(strategy=DeepSpeedPlugin(logging_batch_size_per_gpu=batch_size))`." 
) return batch_size From 06f5bde2efdb3726d6b40de406da57c077e93552 Mon Sep 17 00:00:00 2001 From: Carlos Mocholi Date: Wed, 10 Nov 2021 05:02:33 +0100 Subject: [PATCH 5/9] Configure our logger level --- pytorch_lightning/plugins/training_type/deepspeed.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/pytorch_lightning/plugins/training_type/deepspeed.py b/pytorch_lightning/plugins/training_type/deepspeed.py index 47843e34c37d0..b525bea95da96 100644 --- a/pytorch_lightning/plugins/training_type/deepspeed.py +++ b/pytorch_lightning/plugins/training_type/deepspeed.py @@ -312,6 +312,7 @@ def __init__( ) self._config_initialized = False deepspeed.utils.logging.logger.setLevel(logging_level) + pl._logger.setLevel(logging_level) self.remote_device = remote_device self.load_full_weights = load_full_weights @@ -634,12 +635,11 @@ def _auto_select_batch_size(self): if hasattr(train_dataloader, "batch_sampler"): batch_size = train_dataloader.batch_sampler.batch_size except Exception: - if deepspeed.utils.logging.logger.level < logging.WARN: - rank_zero_warn( - "Tried to Infer the batch size for internal deepspeed logging from the `train_dataloader()`. " - "To ensure DeepSpeed logging remains correct, please manually pass the plugin with the " - "batch size, `Trainer(strategy=DeepSpeedPlugin(logging_batch_size_per_gpu=batch_size))`." - ) + rank_zero_warn( + "Tried to Infer the batch size for internal deepspeed logging from the `train_dataloader()`. " + "To ensure DeepSpeed logging remains correct, please manually pass the plugin with the " + "batch size, `Trainer(strategy=DeepSpeedPlugin(logging_batch_size_per_gpu=batch_size))`." + ) return batch_size def _format_precision_config(self): From 01e638b8dfd080d662a65e650b3f5adde19d3b71 Mon Sep 17 00:00:00 2001 From: Carlos Mocholi Date: Wed, 10 Nov 2021 05:05:57 +0100 Subject: [PATCH 6/9] Add comment --- pytorch_lightning/plugins/training_type/deepspeed.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/pytorch_lightning/plugins/training_type/deepspeed.py b/pytorch_lightning/plugins/training_type/deepspeed.py index b525bea95da96..3a5e0dfdc9091 100644 --- a/pytorch_lightning/plugins/training_type/deepspeed.py +++ b/pytorch_lightning/plugins/training_type/deepspeed.py @@ -634,6 +634,8 @@ def _auto_select_batch_size(self): train_dataloader = train_dl_source.dataloader() if hasattr(train_dataloader, "batch_sampler"): batch_size = train_dataloader.batch_sampler.batch_size + # broad exception on purpose as `source.dataloader()` will fail if the dataloader requires `setup` + # to have been called before except Exception: rank_zero_warn( "Tried to Infer the batch size for internal deepspeed logging from the `train_dataloader()`. 
" From bbe27b75c3d71b48111a43ca5a9c1668dff3a88b Mon Sep 17 00:00:00 2001 From: SeanNaren Date: Mon, 15 Nov 2021 15:31:32 +0000 Subject: [PATCH 7/9] Revert "Configure our logger level" This reverts commit 06f5bde2 --- pytorch_lightning/plugins/training_type/deepspeed.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/pytorch_lightning/plugins/training_type/deepspeed.py b/pytorch_lightning/plugins/training_type/deepspeed.py index 3a5e0dfdc9091..ab6c7ef223962 100644 --- a/pytorch_lightning/plugins/training_type/deepspeed.py +++ b/pytorch_lightning/plugins/training_type/deepspeed.py @@ -312,7 +312,6 @@ def __init__( ) self._config_initialized = False deepspeed.utils.logging.logger.setLevel(logging_level) - pl._logger.setLevel(logging_level) self.remote_device = remote_device self.load_full_weights = load_full_weights @@ -637,11 +636,12 @@ def _auto_select_batch_size(self): # broad exception on purpose as `source.dataloader()` will fail if the dataloader requires `setup` # to have been called before except Exception: - rank_zero_warn( - "Tried to Infer the batch size for internal deepspeed logging from the `train_dataloader()`. " - "To ensure DeepSpeed logging remains correct, please manually pass the plugin with the " - "batch size, `Trainer(strategy=DeepSpeedPlugin(logging_batch_size_per_gpu=batch_size))`." - ) + if deepspeed.utils.logging.logger.level < logging.WARN: + rank_zero_warn( + "Tried to Infer the batch size for internal deepspeed logging from the `train_dataloader()`. " + "To ensure DeepSpeed logging remains correct, please manually pass the plugin with the " + "batch size, `Trainer(strategy=DeepSpeedPlugin(logging_batch_size_per_gpu=batch_size))`." + ) return batch_size def _format_precision_config(self): From f9e1a858ae0510582bd8a27377eba10f896cbed9 Mon Sep 17 00:00:00 2001 From: SeanNaren Date: Mon, 15 Nov 2021 15:53:56 +0000 Subject: [PATCH 8/9] Fix test --- pytorch_lightning/plugins/training_type/deepspeed.py | 6 +++--- tests/plugins/test_deepspeed_plugin.py | 6 +++--- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/pytorch_lightning/plugins/training_type/deepspeed.py b/pytorch_lightning/plugins/training_type/deepspeed.py index ab6c7ef223962..70bc3493675c5 100644 --- a/pytorch_lightning/plugins/training_type/deepspeed.py +++ b/pytorch_lightning/plugins/training_type/deepspeed.py @@ -636,9 +636,9 @@ def _auto_select_batch_size(self): # broad exception on purpose as `source.dataloader()` will fail if the dataloader requires `setup` # to have been called before except Exception: - if deepspeed.utils.logging.logger.level < logging.WARN: - rank_zero_warn( - "Tried to Infer the batch size for internal deepspeed logging from the `train_dataloader()`. " + if self.global_rank == 0: + deepspeed.utils.logging.logger.warning( + "Tried to infer the batch size for internal deepspeed logging from the `train_dataloader()`. " "To ensure DeepSpeed logging remains correct, please manually pass the plugin with the " "batch size, `Trainer(strategy=DeepSpeedPlugin(logging_batch_size_per_gpu=batch_size))`." 
) diff --git a/tests/plugins/test_deepspeed_plugin.py b/tests/plugins/test_deepspeed_plugin.py index 9f93e0e701aa2..bb8f530338133 100644 --- a/tests/plugins/test_deepspeed_plugin.py +++ b/tests/plugins/test_deepspeed_plugin.py @@ -888,7 +888,7 @@ def test_deepspeed_warn_train_dataloader_called(tmpdir): trainer.fit(model) -@RunIf(min_gpus=1, deepspeed=True, special=False) +@RunIf(min_gpus=1, deepspeed=True, special=True) def test_deepspeed_setup_train_dataloader(tmpdir): """Test DeepSpeed works when setup is required to call in the DataModule.""" @@ -920,9 +920,9 @@ def test_dataloader(self): fast_dev_run=True, ) dm = TestSetupIsCalledDataModule() - with pytest.warns(UserWarning, match="Tried to Infer the batch size for internal deepspeed logging"): + with mock.patch("deepspeed.utils.logging.logger.warning", autospec=True) as mock_object: trainer.fit(model, datamodule=dm) - trainer.test(model, datamodule=dm) + assert any("Tried to infer the batch size" in str(arg) for arg in mock_object.call_args_list) @mock.patch("torch.optim.lr_scheduler.StepLR.step", autospec=True) From c8ce0945adddf182ed8abf1d0d048e9422d14430 Mon Sep 17 00:00:00 2001 From: SeanNaren Date: Mon, 15 Nov 2021 16:48:30 +0000 Subject: [PATCH 9/9] Remove test --- tests/plugins/test_deepspeed_plugin.py | 15 --------------- 1 file changed, 15 deletions(-) diff --git a/tests/plugins/test_deepspeed_plugin.py b/tests/plugins/test_deepspeed_plugin.py index bb8f530338133..b35339487dac1 100644 --- a/tests/plugins/test_deepspeed_plugin.py +++ b/tests/plugins/test_deepspeed_plugin.py @@ -873,21 +873,6 @@ def training_step(self, batch, batch_idx): trainer.fit(model) -@RunIf(min_gpus=1, deepspeed=True, special=True) -def test_deepspeed_warn_train_dataloader_called(tmpdir): - """Test DeepSpeed warns when it calls ``lightning_module.train_dataloader`` internally for logging batch - size.""" - model = BoringModel() - trainer = Trainer( - default_root_dir=tmpdir, - strategy=DeepSpeedPlugin(), - gpus=1, - fast_dev_run=True, - ) - with pytest.warns(UserWarning, match="Inferring the batch size for internal deepspeed logging"): - trainer.fit(model) - - @RunIf(min_gpus=1, deepspeed=True, special=True) def test_deepspeed_setup_train_dataloader(tmpdir): """Test DeepSpeed works when setup is required to call in the DataModule."""
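
The user-facing takeaway of this series is the fallback path: when the training dataloader cannot be inspected (for example because it is only built in `setup()`), the logging batch size defaults to 1 and the message added here points users at `logging_batch_size_per_gpu`. Below is a minimal usage sketch of that recommendation, not part of the patches themselves: the import paths assume the PyTorch Lightning 1.5-era layout used in this PR, and the batch size of 32 is only an illustrative value.

import logging

from pytorch_lightning import Trainer
from pytorch_lightning.plugins import DeepSpeedPlugin

# Pass the batch size explicitly so DeepSpeed's internal logging never has to
# infer it from `train_dataloader()`; `logging_level` mirrors the plugin
# argument exercised in the tests above and controls DeepSpeed's logger level.
trainer = Trainer(
    strategy=DeepSpeedPlugin(logging_batch_size_per_gpu=32, logging_level=logging.INFO),
    gpus=1,
)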