From 2505ccc411f3eae536f2334eb25e47a74cfdbc9d Mon Sep 17 00:00:00 2001
From: Amog Kamsetty
Date: Thu, 18 Mar 2021 11:38:59 -0700
Subject: [PATCH 1/9] bug fix

---
 pytorch_lightning/plugins/training_type/horovod.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pytorch_lightning/plugins/training_type/horovod.py b/pytorch_lightning/plugins/training_type/horovod.py
index 9f1bafe309f89..7745c6a31d8fd 100644
--- a/pytorch_lightning/plugins/training_type/horovod.py
+++ b/pytorch_lightning/plugins/training_type/horovod.py
@@ -150,7 +150,7 @@ def reduce(self, tensor, group: Optional[Any] = None, reduce_op: Optional[Union[
 
         if reduce_op in (None, "avg", "mean"):
             reduce_op = hvd.Average
-        elif reduce_op == "sum":
+        elif reduce_op == "sum" or reduce_op == ReduceOp.SUM:
             reduce_op = hvd.Sum
         else:
             raise ValueError(f"unrecognized `reduce_op`: {reduce_op}")

From 6ccfd68f7956c83ec52bc4fb7c157783845a3b1c Mon Sep 17 00:00:00 2001
From: Amog Kamsetty
Date: Fri, 19 Mar 2021 15:21:58 -0700
Subject: [PATCH 2/9] Update pytorch_lightning/plugins/training_type/horovod.py

Co-authored-by: Jirka Borovec
---
 pytorch_lightning/plugins/training_type/horovod.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pytorch_lightning/plugins/training_type/horovod.py b/pytorch_lightning/plugins/training_type/horovod.py
index 7745c6a31d8fd..43f526202d207 100644
--- a/pytorch_lightning/plugins/training_type/horovod.py
+++ b/pytorch_lightning/plugins/training_type/horovod.py
@@ -150,7 +150,7 @@ def reduce(self, tensor, group: Optional[Any] = None, reduce_op: Optional[Union[
 
         if reduce_op in (None, "avg", "mean"):
             reduce_op = hvd.Average
-        elif reduce_op == "sum" or reduce_op == ReduceOp.SUM:
+        elif reduce_op in ("sum", ReduceOp.SUM):
             reduce_op = hvd.Sum
         else:
             raise ValueError(f"unrecognized `reduce_op`: {reduce_op}")
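
For context on the two patches above: callers inside Lightning hand `reduce` the `torch.distributed.ReduceOp.SUM` enum member rather than the string "sum", so the old `elif` fell through to the `ValueError` branch. A standalone sketch of the corrected dispatch follows; `map_reduce_op` is an illustrative name, not the plugin's API, and the sketch assumes `horovod` and `torch` are importable:

    import horovod.torch as hvd
    from torch.distributed import ReduceOp

    def map_reduce_op(reduce_op):
        # Mirror of the plugin's branch logic: normalize strings *and* the
        # torch.distributed.ReduceOp enum member onto Horovod's op constants.
        if reduce_op in (None, "avg", "mean"):
            return hvd.Average
        if reduce_op in ("sum", ReduceOp.SUM):
            return hvd.Sum
        raise ValueError(f"unrecognized `reduce_op`: {reduce_op}")

    assert map_reduce_op(ReduceOp.SUM) == hvd.Sum  # raised ValueError before PATCH 1
    assert map_reduce_op("mean") == hvd.Average

Membership tests with `in` compare by `==`, and comparing a `ReduceOp` member against a string is simply `False`, which is why folding both spellings into one tuple (PATCH 2) is safe.
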
From e9d9c0817890a7a97e67b61fb81fb42561224ec3 Mon Sep 17 00:00:00 2001
From: Amog Kamsetty
Date: Fri, 19 Mar 2021 15:32:49 -0700
Subject: [PATCH 3/9] try add test

---
 .../plugins/training_type/horovod.py          |  2 +-
 .../test_checkpoint_callback_frequency.py     |  2 +-
 tests/models/test_horovod.py                  | 43 ++++++++++++++++++-
 3 files changed, 44 insertions(+), 3 deletions(-)

diff --git a/pytorch_lightning/plugins/training_type/horovod.py b/pytorch_lightning/plugins/training_type/horovod.py
index 7745c6a31d8fd..9f1bafe309f89 100644
--- a/pytorch_lightning/plugins/training_type/horovod.py
+++ b/pytorch_lightning/plugins/training_type/horovod.py
@@ -150,7 +150,7 @@ def reduce(self, tensor, group: Optional[Any] = None, reduce_op: Optional[Union[
 
         if reduce_op in (None, "avg", "mean"):
             reduce_op = hvd.Average
-        elif reduce_op == "sum" or reduce_op == ReduceOp.SUM:
+        elif reduce_op == "sum":
             reduce_op = hvd.Sum
         else:
             raise ValueError(f"unrecognized `reduce_op`: {reduce_op}")
diff --git a/tests/checkpointing/test_checkpoint_callback_frequency.py b/tests/checkpointing/test_checkpoint_callback_frequency.py
index 7926bc46dd290..6fa99b46c7bdb 100644
--- a/tests/checkpointing/test_checkpoint_callback_frequency.py
+++ b/tests/checkpointing/test_checkpoint_callback_frequency.py
@@ -117,7 +117,7 @@ def training_step(self, batch, batch_idx):
             self.log('my_loss', batch_idx * (1 + local_rank), on_epoch=True)
             return super().training_step(batch, batch_idx)
 
-        def training_epoch_end(self, outputs) -> None:
+        def training_epoch_end(self, outputsƒ) -> None:
             data = str(self.global_rank)
             obj = [[data], (data, ), set(data)]
             out = self.trainer.training_type_plugin.broadcast(obj)
diff --git a/tests/models/test_horovod.py b/tests/models/test_horovod.py
index 3c8c9b0f36041..31c9274fa16bf 100644
--- a/tests/models/test_horovod.py
+++ b/tests/models/test_horovod.py
@@ -16,6 +16,7 @@
 import shlex
 import subprocess
 import sys
+from unittest import mock
 from unittest.mock import patch
 
 import numpy as np
@@ -26,7 +27,7 @@
 
 import tests.helpers.pipelines as tpipes
 import tests.helpers.utils as tutils
-from pytorch_lightning import Trainer
+from pytorch_lightning import Trainer, callbacks
 from pytorch_lightning.accelerators import CPUAccelerator
 from pytorch_lightning.metrics.classification.accuracy import Accuracy
 from pytorch_lightning.trainer.states import TrainerState
@@ -376,3 +377,43 @@ def configure_optimizers(self):
 
     # Called every 3 steps, meaning for 1 epoch of 11 batches, it is called 3 times with gamma=0.1
     assert pytest.approx(init_lr * 0.1) == adjusted_lr2
+
+@mock.patch('torch.save')
+@RunIf(special=True, horovod=True, skip_windows=True, min_gpus=2)
+@pytest.mark.parametrize(['k', 'epochs', 'val_check_interval', 'expected'], [(1, 1, 1.0, 1), (2, 2, 0.3, 5)])
+def test_top_k_horovod(save_mock, tmpdir, k, epochs, val_check_interval,
+                       expected):
+    """This test mirrors test_checkpoint_callback_frequency::test_top_k_ddp"""
+
+    class TestModel(BoringModel):
+
+        def training_step(self, batch, batch_idx):
+            local_rank = int(os.getenv("LOCAL_RANK"))
+            self.log('my_loss', batch_idx * (1 + local_rank), on_epoch=True)
+            return super().training_step(batch, batch_idx)
+
+        def training_epoch_end(self, outputs) -> None:
+            data = str(self.global_rank)
+            obj = [[data], (data, ), set(data)]
+            out = self.trainer.training_type_plugin.broadcast(obj)
+            assert obj == [[str(self.global_rank)], (str(self.global_rank), ), set(str(self.global_rank))]
+            assert out == [['0'], ('0', ), set('0')]
+
+    model = TestModel()
+    trainer = Trainer(
+        callbacks=[callbacks.ModelCheckpoint(dirpath=tmpdir, monitor='my_loss_step', save_top_k=k, mode="max")],
+        default_root_dir=tmpdir,
+        max_epochs=epochs,
+        weights_summary=None,
+        val_check_interval=val_check_interval,
+        accelerator="horovod",
+        gpus=2,
+        limit_train_batches=64,
+        limit_val_batches=32,
+    )
+    if os.getenv("LOCAL_RANK") == "0":
+        with pytest.raises(UserWarning, match="The value associated to the key my_loss_epoch: [15.5, 31.0]"):
+            trainer.fit(model)
+        assert save_mock.call_count == expected
+    else:
+        trainer.fit(model)
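
The `training_epoch_end` assertions in the new test pin down the plugin's `broadcast` contract: each rank passes in its own object and must receive rank 0's copy back, with the local object left unmutated. Reduced to a toy, single-process sketch (`fake_broadcast` is a hypothetical stand-in, not the plugin API):

    def fake_broadcast(obj, src_obj):
        # Stand-in for trainer.training_type_plugin.broadcast(obj, src=0):
        # whichever rank calls it, the return value is rank 0's payload.
        return src_obj

    rank = 1                               # pretend this process is rank 1
    data = str(rank)
    obj = [[data], (data, ), {data}]       # each rank builds its own object
    out = fake_broadcast(obj, src_obj=[['0'], ('0', ), {'0'}])
    assert obj == [['1'], ('1', ), {'1'}]  # the local object is untouched
    assert out == [['0'], ('0', ), {'0'}]  # every rank receives rank 0's copy
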
From d1790cd7efa61c38ea426410c4e6b6ac7cc0e216 Mon Sep 17 00:00:00 2001
From: Jirka Borovec
Date: Thu, 25 Mar 2021 23:12:48 +0100
Subject: [PATCH 4/9] Apply suggestions from code review
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Co-authored-by: Carlos Mocholí
---
 tests/checkpointing/test_checkpoint_callback_frequency.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/checkpointing/test_checkpoint_callback_frequency.py b/tests/checkpointing/test_checkpoint_callback_frequency.py
index 6fa99b46c7bdb..7926bc46dd290 100644
--- a/tests/checkpointing/test_checkpoint_callback_frequency.py
+++ b/tests/checkpointing/test_checkpoint_callback_frequency.py
@@ -117,7 +117,7 @@ def training_step(self, batch, batch_idx):
             self.log('my_loss', batch_idx * (1 + local_rank), on_epoch=True)
             return super().training_step(batch, batch_idx)
 
-        def training_epoch_end(self, outputsƒ) -> None:
+        def training_epoch_end(self, outputs) -> None:
             data = str(self.global_rank)
             obj = [[data], (data, ), set(data)]
             out = self.trainer.training_type_plugin.broadcast(obj)

From 6b9c62fdcef695e3561749a2a3718f011eb67d34 Mon Sep 17 00:00:00 2001
From: Carlos Mocholi
Date: Thu, 25 Mar 2021 23:38:34 +0100
Subject: [PATCH 5/9] Reuse code

---
 .../test_checkpoint_callback_frequency.py     | 18 ++++----
 tests/models/test_horovod.py                  | 43 +------------------
 tests/special_tests.sh                        |  2 +-
 3 files changed, 12 insertions(+), 51 deletions(-)

diff --git a/tests/checkpointing/test_checkpoint_callback_frequency.py b/tests/checkpointing/test_checkpoint_callback_frequency.py
index 7926bc46dd290..3c3fc1fd4ab50 100644
--- a/tests/checkpointing/test_checkpoint_callback_frequency.py
+++ b/tests/checkpointing/test_checkpoint_callback_frequency.py
@@ -17,7 +17,8 @@
 import pytest
 import torch
 
-from pytorch_lightning import callbacks, seed_everything, Trainer
+from pytorch_lightning import seed_everything, Trainer
+from pytorch_lightning.callbacks import ModelCheckpoint
 from tests.helpers import BoringModel
 from tests.helpers.runif import RunIf
 
@@ -93,7 +94,7 @@ def training_step(self, batch, batch_idx):
 
     model = TestModel()
     trainer = Trainer(
-        callbacks=[callbacks.ModelCheckpoint(dirpath=tmpdir, monitor='my_loss', save_top_k=k)],
+        callbacks=[ModelCheckpoint(dirpath=tmpdir, monitor='my_loss', save_top_k=k)],
         default_root_dir=tmpdir,
         max_epochs=epochs,
         weights_summary=None,
@@ -107,8 +108,9 @@ def training_step(self, batch, batch_idx):
 
 @mock.patch('torch.save')
 @RunIf(special=True, min_gpus=2)
+@pytest.mark.parametrize("accelerator", ["ddp", pytest.param("horovod", marks=RunIf(horovod=True, skip_windows=True))])
 @pytest.mark.parametrize(['k', 'epochs', 'val_check_interval', 'expected'], [(1, 1, 1.0, 1), (2, 2, 0.3, 5)])
-def test_top_k_ddp(save_mock, tmpdir, k, epochs, val_check_interval, expected):
+def test_top_k(save_mock, tmpdir, k, epochs, val_check_interval, expected):
 
     class TestModel(BoringModel):
 
@@ -117,16 +119,16 @@ def training_step(self, batch, batch_idx):
             self.log('my_loss', batch_idx * (1 + local_rank), on_epoch=True)
             return super().training_step(batch, batch_idx)
 
-        def training_epoch_end(self, outputs) -> None:
+        def training_epoch_end(self, outputs):
             data = str(self.global_rank)
-            obj = [[data], (data, ), set(data)]
+            obj = [[data], (data, ), {data}]
             out = self.trainer.training_type_plugin.broadcast(obj)
-            assert obj == [[str(self.global_rank)], (str(self.global_rank), ), set(str(self.global_rank))]
-            assert out == [['0'], ('0', ), set('0')]
+            assert obj == [[str(self.global_rank)], (str(self.global_rank), ), {str(self.global_rank)}]
+            assert out == [['0'], ('0', ), {'0'}]
 
     model = TestModel()
     trainer = Trainer(
-        callbacks=[callbacks.ModelCheckpoint(dirpath=tmpdir, monitor='my_loss_step', save_top_k=k, mode="max")],
+        callbacks=[ModelCheckpoint(dirpath=tmpdir, monitor='my_loss_step', save_top_k=k, mode="max")],
         default_root_dir=tmpdir,
         max_epochs=epochs,
         weights_summary=None,
diff --git a/tests/models/test_horovod.py b/tests/models/test_horovod.py
index 31c9274fa16bf..3c8c9b0f36041 100644
--- a/tests/models/test_horovod.py
+++ b/tests/models/test_horovod.py
@@ -16,7 +16,6 @@
 import shlex
 import subprocess
 import sys
-from unittest import mock
 from unittest.mock import patch
 
 import numpy as np
@@ -27,7 +26,7 @@
 
 import tests.helpers.pipelines as tpipes
 import tests.helpers.utils as tutils
-from pytorch_lightning import Trainer, callbacks
+from pytorch_lightning import Trainer
 from pytorch_lightning.accelerators import CPUAccelerator
 from pytorch_lightning.metrics.classification.accuracy import Accuracy
 from pytorch_lightning.trainer.states import TrainerState
@@ -377,43 +376,3 @@ def configure_optimizers(self):
 
     # Called every 3 steps, meaning for 1 epoch of 11 batches, it is called 3 times with gamma=0.1
     assert pytest.approx(init_lr * 0.1) == adjusted_lr2
-
-@mock.patch('torch.save')
-@RunIf(special=True, horovod=True, skip_windows=True, min_gpus=2)
-@pytest.mark.parametrize(['k', 'epochs', 'val_check_interval', 'expected'], [(1, 1, 1.0, 1), (2, 2, 0.3, 5)])
-def test_top_k_horovod(save_mock, tmpdir, k, epochs, val_check_interval,
-                       expected):
-    """This test mirrors test_checkpoint_callback_frequency::test_top_k_ddp"""
-
-    class TestModel(BoringModel):
-
-        def training_step(self, batch, batch_idx):
-            local_rank = int(os.getenv("LOCAL_RANK"))
-            self.log('my_loss', batch_idx * (1 + local_rank), on_epoch=True)
-            return super().training_step(batch, batch_idx)
-
-        def training_epoch_end(self, outputs) -> None:
-            data = str(self.global_rank)
-            obj = [[data], (data, ), set(data)]
-            out = self.trainer.training_type_plugin.broadcast(obj)
-            assert obj == [[str(self.global_rank)], (str(self.global_rank), ), set(str(self.global_rank))]
-            assert out == [['0'], ('0', ), set('0')]
-
-    model = TestModel()
-    trainer = Trainer(
-        callbacks=[callbacks.ModelCheckpoint(dirpath=tmpdir, monitor='my_loss_step', save_top_k=k, mode="max")],
-        default_root_dir=tmpdir,
-        max_epochs=epochs,
-        weights_summary=None,
-        val_check_interval=val_check_interval,
-        accelerator="horovod",
-        gpus=2,
-        limit_train_batches=64,
-        limit_val_batches=32,
-    )
-    if os.getenv("LOCAL_RANK") == "0":
-        with pytest.raises(UserWarning, match="The value associated to the key my_loss_epoch: [15.5, 31.0]"):
-            trainer.fit(model)
-        assert save_mock.call_count == expected
-    else:
-        trainer.fit(model)
diff --git a/tests/special_tests.sh b/tests/special_tests.sh
index 43658721e9226..20576b1e74d79 100644
--- a/tests/special_tests.sh
+++ b/tests/special_tests.sh
@@ -32,5 +32,5 @@ python ${DEFAULTS} tests/trainer/test_trainer.py::test_pytorch_profiler_trainer_
 python ${DEFAULTS} tests/models/test_hooks.py::test_transfer_batch_hook_ddp
 python ${DEFAULTS} tests/trainer/test_data_loading.py::test_replace_distrubuted_sampler_custom_dataloader_custom_batch_sampler
 python ${DEFAULTS} tests/trainer/optimization/test_manual_optimization.py::test_step_with_optimizer_closure_with_different_frequencies_ddp_with_toggle_model
-python ${DEFAULTS} tests/checkpointing/test_checkpoint_callback_frequency.py::test_top_k_ddp
+python ${DEFAULTS} tests/checkpointing/test_checkpoint_callback_frequency.py::test_top_k
 nvprof --profile-from-start off -o trace_name.prof -- python ${DEFAULTS} tests/trainer/test_trainer.py::test_pytorch_profiler_nested_emit_nvtx
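
PATCH 5's consolidation leans on pytest's per-parameter marks: `pytest.param(..., marks=...)` lets the Horovod variant carry its own skip conditions while sharing one test body with DDP. A self-contained sketch of the pattern, with a plain `skipif` standing in for Lightning's `RunIf` helper (the `horovodrun` lookup is illustrative, not what `RunIf` actually checks):

    import shutil

    import pytest

    @pytest.mark.parametrize(
        "accelerator",
        [
            "ddp",
            pytest.param(
                "horovod",
                # stands in for RunIf(horovod=True, skip_windows=True)
                marks=pytest.mark.skipif(shutil.which("horovodrun") is None, reason="needs horovodrun"),
            ),
        ],
    )
    def test_accelerator_choice(accelerator):
        # one body serves both backends; the horovod case skips itself when unavailable
        assert accelerator in ("ddp", "horovod")
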
From 1b0507446a0320c0cfc9d31865d215ea9cb6e87d Mon Sep 17 00:00:00 2001
From: Carlos Mocholi
Date: Thu, 25 Mar 2021 23:41:14 +0100
Subject: [PATCH 6/9] Avoid name conflict

---
 tests/checkpointing/test_checkpoint_callback_frequency.py | 2 +-
 tests/special_tests.sh                                    | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/tests/checkpointing/test_checkpoint_callback_frequency.py b/tests/checkpointing/test_checkpoint_callback_frequency.py
index 3c3fc1fd4ab50..47f8a4972e0f3 100644
--- a/tests/checkpointing/test_checkpoint_callback_frequency.py
+++ b/tests/checkpointing/test_checkpoint_callback_frequency.py
@@ -110,7 +110,7 @@ def training_step(self, batch, batch_idx):
 @RunIf(special=True, min_gpus=2)
 @pytest.mark.parametrize("accelerator", ["ddp", pytest.param("horovod", marks=RunIf(horovod=True, skip_windows=True))])
 @pytest.mark.parametrize(['k', 'epochs', 'val_check_interval', 'expected'], [(1, 1, 1.0, 1), (2, 2, 0.3, 5)])
-def test_top_k(save_mock, tmpdir, k, epochs, val_check_interval, expected):
+def test_top_k_distributed(save_mock, tmpdir, k, epochs, val_check_interval, expected):
 
     class TestModel(BoringModel):
 
diff --git a/tests/special_tests.sh b/tests/special_tests.sh
index c174ceae56931..7c117d10adede 100644
--- a/tests/special_tests.sh
+++ b/tests/special_tests.sh
@@ -38,5 +38,5 @@ python ${DEFAULTS} tests/test_profiler.py::test_pytorch_profiler_trainer_ddp
 python ${DEFAULTS} tests/models/test_hooks.py::test_transfer_batch_hook_ddp
 python ${DEFAULTS} tests/trainer/test_data_loading.py::test_replace_distrubuted_sampler_custom_dataloader_custom_batch_sampler
 python ${DEFAULTS} tests/trainer/optimization/test_manual_optimization.py::test_step_with_optimizer_closure_with_different_frequencies_ddp_with_toggle_model
-python ${DEFAULTS} tests/checkpointing/test_checkpoint_callback_frequency.py::test_top_k
+python ${DEFAULTS} tests/checkpointing/test_checkpoint_callback_frequency.py::test_top_k_distributed
 nvprof --profile-from-start off -o trace_name.prof -- python ${DEFAULTS} tests/test_profiler.py::test_pytorch_profiler_nested_emit_nvtx

From 29fc2e8b522140f5af48d42754a3b3cfcd10c8df Mon Sep 17 00:00:00 2001
From: Carlos Mocholi
Date: Thu, 25 Mar 2021 23:50:39 +0100
Subject: [PATCH 7/9] Forgot argument

---
 tests/checkpointing/test_checkpoint_callback_frequency.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tests/checkpointing/test_checkpoint_callback_frequency.py b/tests/checkpointing/test_checkpoint_callback_frequency.py
index 47f8a4972e0f3..fe24fe8d55299 100644
--- a/tests/checkpointing/test_checkpoint_callback_frequency.py
+++ b/tests/checkpointing/test_checkpoint_callback_frequency.py
@@ -110,7 +110,7 @@ def training_step(self, batch, batch_idx):
 @RunIf(special=True, min_gpus=2)
 @pytest.mark.parametrize("accelerator", ["ddp", pytest.param("horovod", marks=RunIf(horovod=True, skip_windows=True))])
 @pytest.mark.parametrize(['k', 'epochs', 'val_check_interval', 'expected'], [(1, 1, 1.0, 1), (2, 2, 0.3, 5)])
-def test_top_k_distributed(save_mock, tmpdir, k, epochs, val_check_interval, expected):
+def test_top_k_distributed(save_mock, tmpdir, accelerator, k, epochs, val_check_interval, expected):
 
     class TestModel(BoringModel):
 
@@ -133,7 +133,7 @@ def training_epoch_end(self, outputs):
         max_epochs=epochs,
         weights_summary=None,
         val_check_interval=val_check_interval,
-        accelerator="ddp",
+        accelerator=accelerator,
         gpus=2,
         limit_train_batches=64,
         limit_val_batches=32,
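
PATCH 7 repairs a common parametrization slip: a name listed in `@pytest.mark.parametrize` must appear in the test signature (otherwise pytest fails collection with "function uses no argument 'accelerator'"), and the value must also be threaded through rather than hardcoded, or only one backend is ever exercised. A toy illustration (`test_threads_param` is a hypothetical test, not part of the suite):

    import pytest

    @pytest.mark.parametrize("accelerator", ["ddp", "horovod"])
    def test_threads_param(accelerator):        # omitting `accelerator` here is a collection error
        config = dict(accelerator=accelerator)  # hardcoding "ddp" here would test only one backend
        assert config["accelerator"] in ("ddp", "horovod")
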
From cae141933a87d072277b6a79a884f715f7a0e94b Mon Sep 17 00:00:00 2001
From: Carlos Mocholi
Date: Fri, 26 Mar 2021 00:46:45 +0100
Subject: [PATCH 8/9] Minor changes

---
 tests/models/test_horovod.py | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/tests/models/test_horovod.py b/tests/models/test_horovod.py
index 3c8c9b0f36041..16df1ce6ba934 100644
--- a/tests/models/test_horovod.py
+++ b/tests/models/test_horovod.py
@@ -49,9 +49,9 @@ def _run_horovod(trainer_options, on_gpu=False):
     # for Horovod, we interpret `gpus` to be set per worker
     trainer_options.update(gpus=1 if on_gpu else None)
     tutils.reset_seed()
-    # todo: Find why coverage breaks CI.
-    # append = '-a' if '.coverage' in os.listdir(_PROJECT_ROOT) else ''  # noqa E265
-    # str(num_processes), sys.executable, '-m', 'coverage', 'run', '--source', 'pytorch_lightning', append,  # noqa E265
+    # TODO: find why coverage breaks CI
+    # append = '-a' if '.coverage' in os.listdir(_PROJECT_ROOT) else ''
+    # str(num_processes), sys.executable, '-m', 'coverage', 'run', '--source', 'pytorch_lightning', append,
     cmdline = [
         'horovodrun', '-np',
         str(num_processes), sys.executable, TEST_SCRIPT, '--trainer-options',
@@ -235,7 +235,7 @@ def get_optimizer_params(optimizer):
     assert get_model_params(model.discriminator) == get_optimizer_params(trainer.optimizers[1])
 
 
-@pytest.mark.skipif(reason="CI agent.jobstatus=Succeeded: Permission denied")
+@pytest.mark.skip(reason="TODO: CI agent.jobstatus=Succeeded: Permission denied")
 @RunIf(skip_windows=True, horovod=True)
 def test_result_reduce_horovod(tmpdir):
     """Make sure result logging works with Horovod.
@@ -285,7 +285,7 @@ def training_epoch_end(self, outputs) -> None:
 
     horovod.run(hvd_test_fn, np=2)
 
-@pytest.mark.skipif(reason="CI agent.jobstatus=Succeeded: Permission denied")
+@pytest.mark.skip(reason="TODO: CI agent.jobstatus=Succeeded: Permission denied")
 @RunIf(skip_windows=True, horovod=True, num_gpus=2)
 def test_accuracy_metric_horovod():
     num_batches = 10
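
The `skipif` to `skip` change in PATCH 8 is substantive, not cosmetic: `pytest.mark.skipif` skips only when a positional condition evaluates true, so with nothing but a `reason` keyword there is nothing to evaluate and the test silently keeps running, while `pytest.mark.skip` is unconditional. Side by side (toy tests with illustrative names):

    import pytest

    @pytest.mark.skipif(reason="no condition given")    # nothing to evaluate: the test still runs
    def test_with_old_marker():
        assert True

    @pytest.mark.skip(reason="TODO: flaky on this CI")  # unconditional: reliably skipped
    def test_with_new_marker():
        assert True
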
From d22297db16c34a241f938f8c5d900e4d967bd53b Mon Sep 17 00:00:00 2001
From: Carlos Mocholi
Date: Mon, 19 Apr 2021 12:27:43 +0200
Subject: [PATCH 9/9] Bad merge

---
 tests/models/test_horovod.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/tests/models/test_horovod.py b/tests/models/test_horovod.py
index bc3eb10a57d82..04004a7dcd647 100644
--- a/tests/models/test_horovod.py
+++ b/tests/models/test_horovod.py
@@ -327,7 +327,9 @@ def training_epoch_end(self, outputs) -> None:
 
 
 # todo: need to be fixed :]
+@pytest.mark.skip(reason="TODO: CI agent.jobstatus=Succeeded: Permission denied")
 @RunIf(skip_windows=True, horovod=True, num_gpus=2)
+def test_accuracy_metric_horovod():
     num_batches = 10
     batch_size = 16
     threshold = 0.5