[DeepSpeed] Do not fail if batch size could not be inferred for logging #10438

Merged · 14 commits · Nov 16, 2021
2 changes: 1 addition & 1 deletion CHANGELOG.md
@@ -31,7 +31,7 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
- The `monitor` argument in the `EarlyStopping` callback is no longer optional ([#10328](https://github.com/PyTorchLightning/pytorch-lightning/pull/10328))


-
- Do not fail if batch size could not be inferred for logging when using DeepSpeed ([#10438](https://github.com/PyTorchLightning/pytorch-lightning/issues/10438))


-
19 changes: 11 additions & 8 deletions pytorch_lightning/plugins/training_type/deepspeed.py
@@ -312,6 +312,7 @@ def __init__(
)
self._config_initialized = False
deepspeed.utils.logging.logger.setLevel(logging_level)
pl._logger.setLevel(logging_level)

self.remote_device = remote_device
self.load_full_weights = load_full_weights
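For reference, the added `pl._logger.setLevel(logging_level)` call means the plugin's `logging_level` argument now controls Lightning's own package logger in addition to DeepSpeed's. A minimal construction sketch, mirroring the arguments used in the updated test further down (single-GPU assumed):

```python
import logging

from pytorch_lightning import Trainer
from pytorch_lightning.plugins import DeepSpeedPlugin

# One argument now sets the verbosity of both the DeepSpeed logger and the
# pytorch_lightning logger.
trainer = Trainer(strategy=DeepSpeedPlugin(logging_level=logging.INFO), gpus=1)
```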
@@ -618,11 +619,6 @@ def _format_batch_size_and_grad_accum_config(self):
)
self.config["gradient_accumulation_steps"] = self.lightning_module.trainer.accumulate_grad_batches
if "train_micro_batch_size_per_gpu" not in self.config:
rank_zero_warn(
"Inferring the batch size for internal deepspeed logging from the `train_dataloader()`. "
"If you require skipping this, please pass "
"`Trainer(strategy=DeepSpeedPlugin(logging_batch_size_per_gpu=batch_size))`"
)
batch_size = self._auto_select_batch_size()
self.config["train_micro_batch_size_per_gpu"] = batch_size
if "gradient_clipping" not in self.config:
@@ -634,9 +630,16 @@ def _auto_select_batch_size(self):
batch_size = 1
train_dl_source = self.lightning_module.trainer._data_connector._train_dataloader_source
if train_dl_source.is_defined():
train_dataloader = train_dl_source.dataloader()
if hasattr(train_dataloader, "batch_sampler"):
batch_size = train_dataloader.batch_sampler.batch_size
try:
train_dataloader = train_dl_source.dataloader()
if hasattr(train_dataloader, "batch_sampler"):
batch_size = train_dataloader.batch_sampler.batch_size
except Exception:
rank_zero_warn(
"Tried to Infer the batch size for internal deepspeed logging from the `train_dataloader()`. "
"To ensure DeepSpeed logging remains correct, please manually pass the plugin with the "
"batch size, `Trainer(strategy=DeepSpeedPlugin(logging_batch_size_per_gpu=batch_size))`."
)
return batch_size

def _format_precision_config(self):
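The fallback introduced here behaves roughly like the standalone sketch below. This is a simplified approximation for illustration (the function name is made up), not the plugin's actual method: it tries to read the batch size from the dataloader's batch sampler and, if anything goes wrong (for example when the dataloader cannot be built yet), falls back to a default of 1 with a warning instead of raising.

```python
import warnings


def infer_logging_batch_size(train_dataloader_fn, default=1):
    """Best-effort batch-size inference for DeepSpeed's internal logging.

    Simplified approximation of ``_auto_select_batch_size``: any failure while
    building or inspecting the dataloader falls back to ``default`` and warns
    instead of aborting the run.
    """
    try:
        train_dataloader = train_dataloader_fn()
        # DataLoaders created with a ``batch_size`` expose it on ``batch_sampler``.
        return train_dataloader.batch_sampler.batch_size
    except Exception:
        warnings.warn(
            "Tried to infer the batch size for internal deepspeed logging from the "
            "`train_dataloader()`. To ensure DeepSpeed logging remains correct, please "
            "manually pass the plugin with the batch size, "
            "`Trainer(strategy=DeepSpeedPlugin(logging_batch_size_per_gpu=batch_size))`."
        )
        return default
```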
10 changes: 6 additions & 4 deletions tests/plugins/test_deepspeed_plugin.py
@@ -1,5 +1,6 @@
import contextlib
import json
import logging
import os
from typing import Any, Dict, Optional
from unittest import mock
@@ -887,9 +888,9 @@ def test_deepspeed_warn_train_dataloader_called(tmpdir):
trainer.fit(model)


@RunIf(min_gpus=1, deepspeed=True, special=True)
@RunIf(min_gpus=1, deepspeed=True, special=False)
def test_deepspeed_setup_train_dataloader(tmpdir):
"""Test DeepSpeed works when setup is required to call, and the user passes the batch size manually."""
"""Test DeepSpeed works when setup is required to call in the DataModule."""

class TestSetupIsCalledDataModule(LightningDataModule):
def __init__(self):
@@ -914,12 +915,13 @@ def test_dataloader(self):
model = BoringModel()
trainer = Trainer(
default_root_dir=tmpdir,
strategy=DeepSpeedPlugin(logging_batch_size_per_gpu=32),
strategy=DeepSpeedPlugin(logging_level=logging.INFO),
gpus=1,
fast_dev_run=True,
)
dm = TestSetupIsCalledDataModule()
trainer.fit(model, datamodule=dm)
with pytest.warns(UserWarning, match="Tried to infer the batch size for internal deepspeed logging"):
trainer.fit(model, datamodule=dm)
trainer.test(model, datamodule=dm)


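If the warning is undesirable, the logging batch size can still be passed explicitly, as the earlier version of this test did; a minimal sketch along those lines:

```python
from pytorch_lightning import Trainer
from pytorch_lightning.plugins import DeepSpeedPlugin

# Supplying the batch size up front skips the dataloader inspection (and the warning).
trainer = Trainer(
    strategy=DeepSpeedPlugin(logging_batch_size_per_gpu=32),
    gpus=1,
    fast_dev_run=True,
)
```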