Merge branch 'master' into bugfix/horovodrun-detection

awaelchli committed Jul 30, 2021
2 parents 335bf50 + c99e2fe commit a1542b3
Showing 15 changed files with 91 additions and 119 deletions.
12 changes: 7 additions & 5 deletions CHANGELOG.md
@@ -58,28 +58,30 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
- Removed deprecated `metrics` ([#8586](https://github.com/PyTorchLightning/pytorch-lightning/pull/8586/))


- Removed the `outputs` argument in both the `LightningModule.on_train_epoch_end` and `Callback.on_train_epoch_end` hooks ([#8587](https://github.com/PyTorchLightning/pytorch-lightning/pull/8587))
- Removed the deprecated `outputs` argument in both the `LightningModule.on_train_epoch_end` and `Callback.on_train_epoch_end` hooks ([#8587](https://github.com/PyTorchLightning/pytorch-lightning/pull/8587))


-

- Delete the deprecated `TrainerLoggingMixin` class ([#8609](https://github.com/PyTorchLightning/pytorch-lightning/pull/8609))

-


-
- Removed the deprecated `optimizer_idx` from `training_step` as an accepted argument in manual optimization ([#8576](https://github.com/PyTorchLightning/pytorch-lightning/pull/8576))


### Fixed

- Fixed horovod auto-detection when horovod is not installed and the launcher is `mpirun` ([#8610](https://github.com/PyTorchLightning/pytorch-lightning/pull/8610))

### Fixed

-


-


- Fixed `trainer.fit_loop.split_idx` always returning `None` ([#8601](https://github.com/PyTorchLightning/pytorch-lightning/pull/8601))

-


5 changes: 4 additions & 1 deletion pytorch_lightning/callbacks/stochastic_weight_avg.py
@@ -195,7 +195,10 @@ def on_train_epoch_start(self, trainer: "pl.Trainer", pl_module: "pl.LightningMo
scheduler_cfg = trainer.lr_schedulers[0]
if scheduler_cfg["interval"] != "epoch" or scheduler_cfg["frequency"] != 1:
rank_zero_warn(f"SWA is currently only supported every epoch. Found {scheduler_cfg}")
rank_zero_info(f"Swapping scheduler {scheduler_cfg['scheduler']} for {self._swa_scheduler}")
rank_zero_info(
f"Swapping scheduler `{scheduler_cfg['scheduler'].__class__.__name__}`"
f" for `{self._swa_scheduler.__class__.__name__}`"
)
trainer.lr_schedulers[0] = default_scheduler_cfg
else:
trainer.lr_schedulers.append(default_scheduler_cfg)
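For reference, a minimal standalone sketch of the new log formatting above; the `StepLR`/`SWALR` pair is assumed from the updated test further down in this commit, not prescribed by the callback:

```python
import torch
from torch.optim import SGD
from torch.optim.lr_scheduler import StepLR
from torch.optim.swa_utils import SWALR

# build a throwaway optimizer plus the two schedulers involved in the swap
optimizer = SGD([torch.nn.Parameter(torch.zeros(1))], lr=0.1)
scheduler_cfg = {"scheduler": StepLR(optimizer, step_size=1)}
swa_scheduler = SWALR(optimizer, swa_lr=0.05)

# the message now names both scheduler classes instead of printing the raw objects
print(
    f"Swapping scheduler `{scheduler_cfg['scheduler'].__class__.__name__}`"
    f" for `{swa_scheduler.__class__.__name__}`"
)
# -> Swapping scheduler `StepLR` for `SWALR`
```
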
22 changes: 12 additions & 10 deletions pytorch_lightning/loops/batch/training_batch_loop.py
@@ -47,6 +47,7 @@ def __init__(self) -> None:
self.accumulated_loss: Optional[Tensor] = None
self.batch_outputs: Optional[List[List[STEP_OUTPUT]]] = None
self.running_loss: TensorRunningAccum = TensorRunningAccum(window_length=20)
# the current split index when the batch gets split into chunks in truncated backprop through time
self.split_idx: Optional[int] = None
self.optim_progress = OptimizationProgress()

@@ -159,19 +160,20 @@ def num_active_optimizers(self, batch_idx: Optional[int] = None) -> int:
return len(self.get_active_optimizers(batch_idx))

def _run_optimization(
self, batch_idx: int, split_batch: Any, opt_idx: int = 0, optimizer: Optional[torch.optim.Optimizer] = None
self,
batch_idx: int,
split_batch: Any,
opt_idx: Optional[int] = None,
optimizer: Optional[torch.optim.Optimizer] = None,
):
"""Runs closure (train step + backward) together with optimization if necessary.
Args:
batch_idx: the index of the current batch
split_batch: the current tbptt split of the whole batch
opt_idx: the index of the current optimizer
optimizer: the current optimizer
opt_idx: the index of the current optimizer or `None` in case of manual optimization
optimizer: the current optimizer or `None` in case of manual optimization
"""
# TODO(@awaelchli): In v1.5, when optimizer_idx gets removed from training_step in manual_optimization, change
# opt_idx=0 to opt_idx=None in the signature here

# toggle model params
self._run_optimization_start(opt_idx, optimizer)

@@ -624,10 +626,10 @@ def _build_kwargs(self, batch: Any, batch_idx: int, opt_idx: int, hiddens: Optio
has_opt_idx_in_train_step = is_param_in_hook_signature(training_step_fx, "optimizer_idx")
if has_opt_idx_in_train_step:
if not lightning_module.automatic_optimization:
self._warning_cache.deprecation(
"`training_step` hook signature has changed in v1.3."
" `optimizer_idx` argument has been removed in case of manual optimization. Support for"
" the old signature will be removed in v1.5"
raise ValueError(
"Your `LightningModule.training_step` signature contains an `optimizer_idx` argument but"
" in manual optimization optimizers must be handled by the user. Remove the optimizer_idx"
" argument or set `self.automatic_optimization = True`."
)
step_kwargs["optimizer_idx"] = opt_idx
elif not has_opt_idx_in_train_step and lightning_module.automatic_optimization:
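The deprecation above becomes a hard error: a `training_step` that accepts `optimizer_idx` under manual optimization now raises a `ValueError`. A hedged sketch of the expected shape (hypothetical toy module, not part of this commit):

```python
import torch
import pytorch_lightning as pl
from torch.utils.data import DataLoader

class ManualOptModel(pl.LightningModule):  # hypothetical example, for illustration only
    def __init__(self):
        super().__init__()
        self.automatic_optimization = False  # manual optimization
        self.layer = torch.nn.Linear(32, 2)

    # correct signature: no `optimizer_idx`; adding it would now raise the ValueError above
    def training_step(self, batch, batch_idx):
        opt_a, opt_b = self.optimizers()  # fetch optimizers explicitly instead
        loss = self.layer(batch).sum()
        opt_a.zero_grad()
        self.manual_backward(loss)
        opt_a.step()
        return loss

    def train_dataloader(self):
        return DataLoader(torch.randn(64, 32), batch_size=8)

    def configure_optimizers(self):
        return [torch.optim.SGD(self.parameters(), lr=1e-2),
                torch.optim.SGD(self.parameters(), lr=1e-2)]
```

With two optimizers configured, the module itself decides which one to step on each batch; the loop no longer passes an `opt_idx` in.
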
2 changes: 0 additions & 2 deletions pytorch_lightning/loops/epoch/training_epoch_loop.py
@@ -42,8 +42,6 @@ def __init__(self, min_steps: int, max_steps: int):
self.global_step: int = 0
# the total batch index across all epochs
self.total_batch_idx: int = 0
# the current split index when the batch gets split into chunks in truncated backprop through time
self.split_idx: Optional[int] = None
self.is_last_batch: Optional[bool] = None
self.batch_progress = Progress()
self.scheduler_progress = SchedulerProgress()
2 changes: 1 addition & 1 deletion pytorch_lightning/loops/fit_loop.py
@@ -74,7 +74,7 @@ def batch_idx(self) -> int:
@property
def split_idx(self) -> int:
"""Returns the index of the current batch split (within the current batch) for bptt"""
return self.epoch_loop.split_idx
return self.epoch_loop.batch_loop.split_idx

@property
def min_steps(self) -> int:
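In other words, the per-split counter now lives on the batch loop (see the new attribute in `training_batch_loop.py` above), and `FitLoop.split_idx` resolves through it. A rough sketch of how user code might read it, assuming a hypothetical callback:

```python
import pytorch_lightning as pl

class SplitIdxProbe(pl.Callback):  # hypothetical callback, for illustration only
    def on_train_batch_end(self, trainer, pl_module, outputs, batch, batch_idx, dataloader_idx):
        # resolves via trainer.fit_loop.epoch_loop.batch_loop.split_idx and is
        # no longer always None when truncated backprop through time splits the batch
        print("current tbptt split:", trainer.fit_loop.split_idx)
```
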
4 changes: 2 additions & 2 deletions pytorch_lightning/trainer/connectors/accelerator_connector.py
@@ -748,7 +748,7 @@ def set_distributed_mode(self, distributed_backend: Optional[str] = None):
self.distributed_backend = "ddp_spawn"

# special case with DDP on CPUs
if self.distributed_backend == "ddp_cpu":
if self.distributed_backend == DistributedType.DDP_CPU:
if _TPU_AVAILABLE:
raise MisconfigurationException(
"`accelerator='ddp_cpu'` is not supported on TPU machines. "
@@ -803,7 +803,7 @@ def set_distributed_mode(self, distributed_backend: Optional[str] = None):
self.num_processes = self.num_nodes

# Horovod is an extra case...
if self.distributed_backend == "horovod":
if self.distributed_backend == DistributedType.HOROVOD:
self._set_horovod_backend()

using_valid_distributed = self.use_ddp or self.use_ddp2
@@ -241,9 +241,9 @@ def on_batch_start(self) -> None:
self._epoch_end_reached = False

def epoch_end_reached(self):
self.trainer.logger_connector._epoch_end_reached = True
self.trainer.logger_connector._batch_idx = None
self.trainer.logger_connector._split_idx = None
self._epoch_end_reached = True
self._batch_idx = None
self._split_idx = None

def on_epoch_end(self) -> None:
assert self._epoch_end_reached
34 changes: 0 additions & 34 deletions pytorch_lightning/trainer/logging.py

This file was deleted.

2 changes: 0 additions & 2 deletions pytorch_lightning/trainer/trainer.py
@@ -58,7 +58,6 @@
from pytorch_lightning.trainer.connectors.training_trick_connector import TrainingTricksConnector
from pytorch_lightning.trainer.data_loading import TrainerDataLoadingMixin
from pytorch_lightning.trainer.deprecated_api import DeprecatedTrainerAttributes
from pytorch_lightning.trainer.logging import TrainerLoggingMixin
from pytorch_lightning.trainer.model_hooks import TrainerModelHooksMixin
from pytorch_lightning.trainer.optimizers import TrainerOptimizersMixin
from pytorch_lightning.trainer.properties import TrainerProperties
@@ -97,7 +96,6 @@ class Trainer(
TrainerCallbackHookMixin,
TrainerModelHooksMixin,
TrainerOptimizersMixin,
TrainerLoggingMixin,
TrainerTrainingTricksMixin,
TrainerDataLoadingMixin,
DeprecatedTrainerAttributes,
9 changes: 5 additions & 4 deletions pytorch_lightning/utilities/enums.py
@@ -40,7 +40,7 @@ def __hash__(self) -> int:
class AMPType(LightningEnum):
"""Type of Automatic Mixed Precission used for training.
>>> # you can math the type with string
>>> # you can match the type with string
>>> AMPType.APEX == 'apex'
True
"""
@@ -52,7 +52,7 @@ class AMPType(LightningEnum):
class DistributedType(LightningEnum):
"""Define type of ditributed computing.
>>> # you can math the type with string
>>> # you can match the type with string
>>> DistributedType.DDP == 'ddp'
True
>>> # which is case invariant
@@ -77,6 +77,7 @@ def is_interactive_compatible(self) -> bool:
DP = "dp"
DDP = "ddp"
DDP2 = "ddp2"
DDP_CPU = "ddp_cpu"
DDP_SPAWN = "ddp_spawn"
TPU_SPAWN = "tpu_spawn"
DEEPSPEED = "deepspeed"
@@ -87,11 +88,11 @@ def is_interactive_compatible(self) -> bool:


class DeviceType(LightningEnum):
"""Define Device type byt its nature - acceleatrors.
"""Define Device type by its nature - acceleatrors.
>>> DeviceType.CPU == DeviceType.from_str('cpu')
True
>>> # you can math the type with string
>>> # you can match the type with string
>>> DeviceType.GPU == 'GPU'
True
>>> # which is case invariant
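For context on the `accelerator_connector.py` change above, a small standalone sketch of why the raw strings can be swapped for enum members: `LightningEnum` values compare equal to their strings, case-insensitively, and this commit adds the `DDP_CPU` member used there.

```python
from pytorch_lightning.utilities.enums import DistributedType

# the new member compares equal to the raw string it replaces in accelerator_connector.py
assert DistributedType.DDP_CPU == "ddp_cpu"
assert DistributedType.HOROVOD == "horovod"

# comparison is case-invariant, as the doctests above show
assert DistributedType.DDP == "DDP"
```

Using the enum members keeps the comparisons typo-safe while remaining backwards compatible with code that still passes plain strings.
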
2 changes: 1 addition & 1 deletion tests/callbacks/test_stochastic_weight_avg.py
@@ -175,7 +175,7 @@ def test_swa_warns(tmpdir, caplog):
trainer = Trainer(default_root_dir=tmpdir, fast_dev_run=True, stochastic_weight_avg=True)
with caplog.at_level(level=logging.INFO), pytest.warns(UserWarning, match="SWA is currently only supported"):
trainer.fit(model)
assert "Swapping scheduler" in caplog.text
assert "Swapping scheduler `StepLR` for `SWALR`" in caplog.text


def test_swa_raises():
26 changes: 0 additions & 26 deletions tests/deprecated_api/test_remove_1-5.py
@@ -166,26 +166,6 @@ def test_v1_5_0_running_sanity_check():
assert not trainer.running_sanity_check


def test_old_training_step_signature_with_opt_idx_manual_opt(tmpdir):
class OldSignatureModel(BoringModel):
def __init__(self):
super().__init__()
self.automatic_optimization = False

def training_step(self, batch, batch_idx, optimizer_idx):
assert optimizer_idx == 0
return super().training_step(batch, batch_idx)

def configure_optimizers(self):
return [optim.SGD(self.parameters(), lr=1e-2), optim.SGD(self.parameters(), lr=1e-2)]

model = OldSignatureModel()
trainer = Trainer(default_root_dir=tmpdir, fast_dev_run=2)

with pytest.deprecated_call(match="`training_step` .* `optimizer_idx` .* manual .* will be removed in v1.5"):
trainer.fit(model)


def test_v1_5_0_model_checkpoint_period(tmpdir):
with no_warning_call(DeprecationWarning):
ModelCheckpoint(dirpath=tmpdir)
@@ -247,12 +227,6 @@ def test_epoch_end(self, outputs):
trainer.test(model)


def test_v1_5_0_trainer_logging_mixin(tmpdir):
trainer = Trainer(default_root_dir=tmpdir, max_epochs=1, checkpoint_callback=False, logger=False)
with pytest.deprecated_call(match="is deprecated in v1.3 and will be removed in v1.5"):
trainer.metrics_to_scalars({})


def test_v1_5_0_lighting_module_grad_norm(tmpdir):
model = BoringModel()
with pytest.deprecated_call(match="is deprecated in v1.3 and will be removed in v1.5"):
18 changes: 15 additions & 3 deletions tests/models/test_hooks.py
@@ -213,16 +213,22 @@ def get_members(cls):

class HookedCallback(Callback):
def __init__(self, called):
def call(hook, *args, **kwargs):
def call(hook, fn, *args, **kwargs):
out = fn(*args, **kwargs)
d = {"name": f"Callback.{hook}"}
if args:
d["args"] = args
if kwargs:
d["kwargs"] = kwargs
called.append(d)
return out

for h in get_members(Callback):
setattr(self, h, partial(call, h))
attr = getattr(self, h)
setattr(self, h, partial(call, h, attr))

def on_save_checkpoint(*args, **kwargs):
return {"foo": True}


class HookedModel(BoringModel):
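Loosely, the updated helper above wraps each `Callback` hook so the wrapper both records the call and delegates to the original bound method, preserving return values such as the dict from `on_save_checkpoint`. A standalone sketch of the pattern (hypothetical class, not the test itself):

```python
from functools import partial

class Recorder:  # hypothetical stand-in for the HookedCallback pattern
    def __init__(self, called):
        def call(hook, fn, *args, **kwargs):
            out = fn(*args, **kwargs)  # delegate to the original bound method
            entry = {"name": hook}
            if args:
                entry["args"] = args
            called.append(entry)
            return out  # keep the original return value

        for h in ("setup", "on_save_checkpoint"):
            attr = getattr(self, h)  # original bound method
            setattr(self, h, partial(call, h, attr))

    def setup(self, stage):
        return None

    def on_save_checkpoint(self, *args, **kwargs):
        return {"foo": True}  # mirrors the sentinel returned in the test above

called = []
rec = Recorder(called)
assert rec.on_save_checkpoint() == {"foo": True}  # return value survives the wrapper
assert rec.setup("fit") is None
assert [e["name"] for e in called] == ["on_save_checkpoint", "setup"]
```
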
@@ -555,7 +561,12 @@ def test_trainer_model_hook_system_fit_no_val_and_resume(tmpdir):
# initial training to get a checkpoint
model = BoringModel()
trainer = Trainer(
default_root_dir=tmpdir, max_steps=1, limit_val_batches=0, progress_bar_refresh_rate=0, weights_summary=None
default_root_dir=tmpdir,
max_steps=1,
limit_val_batches=0,
progress_bar_refresh_rate=0,
weights_summary=None,
callbacks=[HookedCallback([])],
)
trainer.fit(model)
best_model_path = trainer.checkpoint_callback.best_model_path
@@ -611,6 +622,7 @@ def test_trainer_model_hook_system_fit_no_val_and_resume(tmpdir):
},
),
),
dict(name="Callback.on_load_checkpoint", args=(trainer, model, {"foo": True})),
dict(name="configure_sharded_model"),
dict(name="Callback.on_configure_sharded_model", args=(trainer, model)),
dict(name="configure_optimizers"),