[Horovod] Fix Reduce for Horovod #6585

Closed
wants to merge 12 commits
2 changes: 1 addition & 1 deletion pytorch_lightning/plugins/training_type/horovod.py
@@ -150,7 +150,7 @@ def reduce(self, tensor, group: Optional[Any] = None, reduce_op: Optional[Union[

         if reduce_op in (None, "avg", "mean"):
             reduce_op = hvd.Average
-        elif reduce_op == "sum":
+        elif reduce_op in ("sum", ReduceOp.SUM):
             reduce_op = hvd.Sum
         else:
             raise ValueError(f"unrecognized `reduce_op`: {reduce_op}")
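For context: the Horovod plugin maps Lightning's `reduce_op` argument onto a Horovod op, and before this change the `ReduceOp.SUM` enum (Lightning's `ReduceOp` is the `torch.distributed.ReduceOp` enum, re-exported) fell through to the `ValueError` branch because only the string "sum" was matched. Below is a minimal sketch of the corrected mapping as a standalone helper, assuming Horovod is installed; `to_hvd_op` is a hypothetical name, the real logic lives inside the plugin's `reduce` method shown above:

    import horovod.torch as hvd
    from torch.distributed import ReduceOp

    def to_hvd_op(reduce_op):
        # Map Lightning's reduce_op (a string, a ReduceOp enum, or None) to a Horovod op.
        if reduce_op in (None, "avg", "mean"):
            return hvd.Average
        # The fix: accept the ReduceOp.SUM enum alongside the "sum" string.
        if reduce_op in ("sum", ReduceOp.SUM):
            return hvd.Sum
        raise ValueError(f"unrecognized `reduce_op`: {reduce_op}")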
18 changes: 10 additions & 8 deletions tests/checkpointing/test_checkpoint_callback_frequency.py
@@ -17,7 +17,8 @@
 import pytest
 import torch
 
-from pytorch_lightning import callbacks, seed_everything, Trainer
+from pytorch_lightning import seed_everything, Trainer
+from pytorch_lightning.callbacks import ModelCheckpoint
 from tests.helpers import BoringModel
 from tests.helpers.runif import RunIf
 
@@ -93,7 +94,7 @@ def training_step(self, batch, batch_idx):

     model = TestModel()
     trainer = Trainer(
-        callbacks=[callbacks.ModelCheckpoint(dirpath=tmpdir, monitor='my_loss', save_top_k=k)],
+        callbacks=[ModelCheckpoint(dirpath=tmpdir, monitor='my_loss', save_top_k=k)],
         default_root_dir=tmpdir,
         max_epochs=epochs,
         weights_summary=None,
@@ -107,8 +108,9 @@ def training_step(self, batch, batch_idx):

 @mock.patch('torch.save')
 @RunIf(special=True, min_gpus=2)
+@pytest.mark.parametrize("accelerator", ["ddp", pytest.param("horovod", marks=RunIf(horovod=True, skip_windows=True))])
 @pytest.mark.parametrize(['k', 'epochs', 'val_check_interval', 'expected'], [(1, 1, 1.0, 1), (2, 2, 0.3, 5)])
-def test_top_k_ddp(save_mock, tmpdir, k, epochs, val_check_interval, expected):
+def test_top_k_distributed(save_mock, tmpdir, accelerator, k, epochs, val_check_interval, expected):
 
     class TestModel(BoringModel):
 
@@ -117,16 +119,16 @@ def training_step(self, batch, batch_idx):
             self.log('my_loss', batch_idx * (1 + local_rank), on_epoch=True)
             return super().training_step(batch, batch_idx)
 
-        def training_epoch_end(self, outputs) -> None:
+        def training_epoch_end(self, outputs):
             data = str(self.global_rank)
-            obj = [[data], (data, ), set(data)]
+            obj = [[data], (data, ), {data}]
             out = self.trainer.training_type_plugin.broadcast(obj)
-            assert obj == [[str(self.global_rank)], (str(self.global_rank), ), set(str(self.global_rank))]
-            assert out == [['0'], ('0', ), set('0')]
+            assert obj == [[str(self.global_rank)], (str(self.global_rank), ), {str(self.global_rank)}]
+            assert out == [['0'], ('0', ), {'0'}]
 
     model = TestModel()
     trainer = Trainer(
-        callbacks=[callbacks.ModelCheckpoint(dirpath=tmpdir, monitor='my_loss_step', save_top_k=k, mode="max")],
+        callbacks=[ModelCheckpoint(dirpath=tmpdir, monitor='my_loss_step', save_top_k=k, mode="max")],
         default_root_dir=tmpdir,
         max_epochs=epochs,
         weights_summary=None,
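One detail in the broadcast assertions above: `set(data)` iterates the string, while the literal `{data}` wraps it as a single element. The two happen to agree for the single-character ranks of this 2-GPU test, but only the literal matches the intent. A quick plain-Python illustration (the rank value is hypothetical):

    data = "10"                      # e.g. str(global_rank) on a hypothetical rank 10
    assert set(data) == {"1", "0"}   # set(...) iterates: the string's characters
    assert {data} == {"10"}          # set literal: the one-element set the test means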
2 changes: 1 addition & 1 deletion tests/special_tests.sh
@@ -38,5 +38,5 @@ python ${DEFAULTS} tests/test_profiler.py::test_pytorch_profiler_trainer_ddp
 python ${DEFAULTS} tests/models/test_hooks.py::test_transfer_batch_hook_ddp
 python ${DEFAULTS} tests/trainer/test_data_loading.py::test_replace_distrubuted_sampler_custom_dataloader_custom_batch_sampler
 python ${DEFAULTS} tests/trainer/optimization/test_manual_optimization.py::test_step_with_optimizer_closure_with_different_frequencies_ddp_with_toggle_model
-python ${DEFAULTS} tests/checkpointing/test_checkpoint_callback_frequency.py::test_top_k_ddp
+python ${DEFAULTS} tests/checkpointing/test_checkpoint_callback_frequency.py::test_top_k_distributed
 nvprof --profile-from-start off -o trace_name.prof -- python ${DEFAULTS} tests/test_profiler.py::test_pytorch_profiler_nested_emit_nvtx