From 3014c1c71388204451024827fb8cef5885e04fe4 Mon Sep 17 00:00:00 2001 From: thomas chaton Date: Wed, 3 Mar 2021 15:52:08 +0000 Subject: [PATCH 01/12] fix --- pytorch_lightning/plugins/training_type/dp.py | 4 ++++ tests/accelerators/test_dp.py | 23 +++++++++++++++++++ 2 files changed, 27 insertions(+) diff --git a/pytorch_lightning/plugins/training_type/dp.py b/pytorch_lightning/plugins/training_type/dp.py index c2b16303e5d4e..9ff7909a5a209 100644 --- a/pytorch_lightning/plugins/training_type/dp.py +++ b/pytorch_lightning/plugins/training_type/dp.py @@ -19,6 +19,7 @@ from pytorch_lightning.core.step_result import Result from pytorch_lightning.overrides.data_parallel import LightningParallelModule from pytorch_lightning.plugins.training_type.parallel import ParallelPlugin +from pytorch_lightning.utilities.apply_func import apply_to_collection class DataParallelPlugin(ParallelPlugin): @@ -49,6 +50,9 @@ def reduce(self, tensor, *args, **kwargs): elif isinstance(tensor, torch.Tensor): tensor = tensor.mean() + elif isinstance(tensor, dict): + tensor = apply_to_collection(tensor, torch.Tensor, torch.mean) + return tensor @property diff --git a/tests/accelerators/test_dp.py b/tests/accelerators/test_dp.py index 8aeb687f1c927..35de70c21a607 100644 --- a/tests/accelerators/test_dp.py +++ b/tests/accelerators/test_dp.py @@ -123,3 +123,26 @@ def test_dp_test(tmpdir): new_weights = model.layer_0.weight.clone().detach().cpu() assert torch.all(torch.eq(old_weights, new_weights)) + + +@RunIf(min_gpus=2) +def test_dp_training_step_dict(tmpdir): + """ + This test verify dp properly reduce dictionaries + """ + class TestModel(BoringModel): + + def training_step(self, batch, batch_idx): + return super().training_step(batch, batch_idx) + + model = TestModel() + model.training_step_end = None + trainer = pl.Trainer( + default_root_dir=tmpdir, + max_epochs=1, + limit_train_batches=2, + limit_val_batches=0, + gpus=2, + plugins='dp', + ) + trainer.fit(model) From 
a07687f49f5066a8cabb25a9a87218c6fb6df515 Mon Sep 17 00:00:00 2001 From: thomas chaton Date: Wed, 3 Mar 2021 15:52:08 +0000 Subject: [PATCH 02/12] fix --- pytorch_lightning/plugins/training_type/dp.py | 4 ++++ tests/accelerators/test_dp.py | 23 +++++++++++++++++++ 2 files changed, 27 insertions(+) diff --git a/pytorch_lightning/plugins/training_type/dp.py b/pytorch_lightning/plugins/training_type/dp.py index c2b16303e5d4e..9ff7909a5a209 100644 --- a/pytorch_lightning/plugins/training_type/dp.py +++ b/pytorch_lightning/plugins/training_type/dp.py @@ -19,6 +19,7 @@ from pytorch_lightning.core.step_result import Result from pytorch_lightning.overrides.data_parallel import LightningParallelModule from pytorch_lightning.plugins.training_type.parallel import ParallelPlugin +from pytorch_lightning.utilities.apply_func import apply_to_collection class DataParallelPlugin(ParallelPlugin): @@ -49,6 +50,9 @@ def reduce(self, tensor, *args, **kwargs): elif isinstance(tensor, torch.Tensor): tensor = tensor.mean() + elif isinstance(tensor, dict): + tensor = apply_to_collection(tensor, torch.Tensor, torch.mean) + return tensor @property diff --git a/tests/accelerators/test_dp.py b/tests/accelerators/test_dp.py index 8aeb687f1c927..35de70c21a607 100644 --- a/tests/accelerators/test_dp.py +++ b/tests/accelerators/test_dp.py @@ -123,3 +123,26 @@ def test_dp_test(tmpdir): new_weights = model.layer_0.weight.clone().detach().cpu() assert torch.all(torch.eq(old_weights, new_weights)) + + +@RunIf(min_gpus=2) +def test_dp_training_step_dict(tmpdir): + """ + This test verify dp properly reduce dictionaries + """ + class TestModel(BoringModel): + + def training_step(self, batch, batch_idx): + return super().training_step(batch, batch_idx) + + model = TestModel() + model.training_step_end = None + trainer = pl.Trainer( + default_root_dir=tmpdir, + max_epochs=1, + limit_train_batches=2, + limit_val_batches=0, + gpus=2, + plugins='dp', + ) + trainer.fit(model) From 
dd46e5b3225fc6b617a666c2ff21f2e4cdf4f905 Mon Sep 17 00:00:00 2001 From: tchaton Date: Wed, 3 Mar 2021 15:55:04 +0000 Subject: [PATCH 03/12] update --- pytorch_lightning/accelerators/tpu.py | 6 ++++-- tests/accelerators/test_dp.py | 3 ++- tests/callbacks/test_pruning.py | 4 ++-- 3 files changed, 8 insertions(+), 5 deletions(-) diff --git a/pytorch_lightning/accelerators/tpu.py b/pytorch_lightning/accelerators/tpu.py index c36f7287f3d8e..97e960523895c 100644 --- a/pytorch_lightning/accelerators/tpu.py +++ b/pytorch_lightning/accelerators/tpu.py @@ -1,4 +1,4 @@ -from typing import Any, Callable, Optional, TYPE_CHECKING +from typing import Any, Callable, Mapping, Optional, TYPE_CHECKING import torch from torch.optim import Optimizer @@ -31,7 +31,9 @@ def setup(self, trainer: 'Trainer', model: 'LightningModule') -> None: raise MisconfigurationException("TPUs only support a single tpu core or tpu spawn training.") return super().setup(trainer, model) - def run_optimizer_step(self, optimizer: Optimizer, optimizer_idx: int, lambda_closure: Callable, **kwargs): + def run_optimizer_step( + self, optimizer: Optimizer, optimizer_idx: int, lambda_closure: Callable, **kwargs + ) -> None: xm.optimizer_step(optimizer, barrier=False, optimizer_args={'closure': lambda_closure, **kwargs}) def all_gather(self, tensor: torch.Tensor, group: Optional[Any] = None, sync_grads: bool = False) -> torch.Tensor: diff --git a/tests/accelerators/test_dp.py b/tests/accelerators/test_dp.py index 35de70c21a607..fe40921b5e051 100644 --- a/tests/accelerators/test_dp.py +++ b/tests/accelerators/test_dp.py @@ -128,8 +128,9 @@ def test_dp_test(tmpdir): @RunIf(min_gpus=2) def test_dp_training_step_dict(tmpdir): """ - This test verify dp properly reduce dictionaries + This test verify dp properly reduce dictionaries """ + class TestModel(BoringModel): def training_step(self, batch, batch_idx): diff --git a/tests/callbacks/test_pruning.py b/tests/callbacks/test_pruning.py index 23b2fcbb52235..0e63fc29d49b1 
100644 --- a/tests/callbacks/test_pruning.py +++ b/tests/callbacks/test_pruning.py @@ -11,7 +11,6 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -import os from collections import OrderedDict from logging import INFO @@ -22,7 +21,7 @@ from torch.nn import Sequential from pytorch_lightning import seed_everything, Trainer -from pytorch_lightning.callbacks import ModelPruning, ModelCheckpoint +from pytorch_lightning.callbacks import ModelCheckpoint, ModelPruning from pytorch_lightning.utilities.exceptions import MisconfigurationException from tests.helpers import BoringModel from tests.helpers.runif import RunIf @@ -274,6 +273,7 @@ def test_permanent_when_model_is_saved_multiple_times(tmpdir, caplog): seed_everything(0) class TestPruning(ModelPruning): + def on_save_checkpoint(self, trainer, pl_module, checkpoint): super().on_save_checkpoint(trainer, pl_module, checkpoint) assert "layer.mlp_3.weight_orig" not in checkpoint["state_dict"] From c4f75ce1d46484982244ceeef220e190e9ae0da7 Mon Sep 17 00:00:00 2001 From: tchaton Date: Wed, 3 Mar 2021 15:55:34 +0000 Subject: [PATCH 04/12] update --- pytorch_lightning/accelerators/tpu.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pytorch_lightning/accelerators/tpu.py b/pytorch_lightning/accelerators/tpu.py index 97e960523895c..2d7304e42b3c8 100644 --- a/pytorch_lightning/accelerators/tpu.py +++ b/pytorch_lightning/accelerators/tpu.py @@ -1,4 +1,4 @@ -from typing import Any, Callable, Mapping, Optional, TYPE_CHECKING +from typing import Any, Callable, Optional, TYPE_CHECKING import torch from torch.optim import Optimizer From 04c0f7390ffad9a5887f85da65185e9d6a28a006 Mon Sep 17 00:00:00 2001 From: tchaton Date: Wed, 3 Mar 2021 15:59:02 +0000 Subject: [PATCH 05/12] add changelog --- CHANGELOG.md | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git 
a/CHANGELOG.md b/CHANGELOG.md index 27e5f4be2d04a..d7690d59567e9 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,7 +5,15 @@ All notable changes to this project will be documented in this file. The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). -## [UnReleased] - 2021-MM-DD +## [1.2.3] - 2021-MM-DD + +### Fixed + + +- Fixed DP reduction with collection ([#6324](https://github.com/PyTorchLightning/pytorch-lightning/pull/6324)) + + +## [1.2.2] - 2021-03-03 ### Added @@ -101,6 +109,8 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). - Fixed error thrown when using valid distributed mode in multi node ([#6297](https://github.com/PyTorchLightning/pytorch-lightning/pull/6297) +- Added `checkpoint` parameter to callback's `on_save_checkpoint` hook ([#6072](https://github.com/PyTorchLightning/pytorch-lightning/pull/6072)) + ## [1.2.1] - 2021-02-23 ### Fixed From 56acaf171d1b1dca2a2ac853073af3519ef63e52 Mon Sep 17 00:00:00 2001 From: tchaton Date: Wed, 3 Mar 2021 17:08:29 +0000 Subject: [PATCH 06/12] update changelog --- CHANGELOG.md | 13 +++---------- 1 file changed, 3 insertions(+), 10 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index d7690d59567e9..db1f3970e0e6f 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,15 +5,7 @@ All notable changes to this project will be documented in this file. The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). -## [1.2.3] - 2021-MM-DD - -### Fixed - - -- Fixed DP reduction with collection ([#6324](https://github.com/PyTorchLightning/pytorch-lightning/pull/6324)) - - -## [1.2.2] - 2021-03-03 +## [UnReleased] - 2021-MM-DD ### Added @@ -109,7 +101,8 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). 
- Fixed error thrown when using valid distributed mode in multi node ([#6297](https://github.com/PyTorchLightning/pytorch-lightning/pull/6297) -- Added `checkpoint` parameter to callback's `on_save_checkpoint` hook ([#6072](https://github.com/PyTorchLightning/pytorch-lightning/pull/6072)) +- Fixed DP reduction with collection ([#6324](https://github.com/PyTorchLightning/pytorch-lightning/pull/6324)) + ## [1.2.1] - 2021-02-23 From 6b3d47ec0a6f19fc6f31e8654456975c9825b62a Mon Sep 17 00:00:00 2001 From: thomas chaton Date: Wed, 3 Mar 2021 17:06:54 +0000 Subject: [PATCH 07/12] Update tests/accelerators/test_dp.py MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Carlos Mocholí --- tests/accelerators/test_dp.py | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/tests/accelerators/test_dp.py b/tests/accelerators/test_dp.py index fe40921b5e051..a094cfd380cc6 100644 --- a/tests/accelerators/test_dp.py +++ b/tests/accelerators/test_dp.py @@ -130,13 +130,7 @@ def test_dp_training_step_dict(tmpdir): """ This test verify dp properly reduce dictionaries """ - - class TestModel(BoringModel): - - def training_step(self, batch, batch_idx): - return super().training_step(batch, batch_idx) - - model = TestModel() + model = BoringModel() model.training_step_end = None trainer = pl.Trainer( default_root_dir=tmpdir, From 9b1e45f58bc0510b845fe3f98f67e8d9ef087f91 Mon Sep 17 00:00:00 2001 From: thomas chaton Date: Thu, 4 Mar 2021 12:57:44 +0000 Subject: [PATCH 08/12] Update tests/accelerators/test_dp.py Co-authored-by: Kaushik B <45285388+kaushikb11@users.noreply.github.com> --- tests/accelerators/test_dp.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/accelerators/test_dp.py b/tests/accelerators/test_dp.py index a094cfd380cc6..1d79ff8c59eba 100644 --- a/tests/accelerators/test_dp.py +++ b/tests/accelerators/test_dp.py @@ -138,6 +138,6 @@ def test_dp_training_step_dict(tmpdir): 
limit_train_batches=2, limit_val_batches=0, gpus=2, - plugins='dp', + accelerator='dp', ) trainer.fit(model) From 8070c12303c90f536504d39bace655403750ecf6 Mon Sep 17 00:00:00 2001 From: thomas chaton Date: Thu, 4 Mar 2021 22:44:01 +0000 Subject: [PATCH 09/12] resolve reduce --- pytorch_lightning/plugins/training_type/dp.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/pytorch_lightning/plugins/training_type/dp.py b/pytorch_lightning/plugins/training_type/dp.py index 9ff7909a5a209..a3a551573109a 100644 --- a/pytorch_lightning/plugins/training_type/dp.py +++ b/pytorch_lightning/plugins/training_type/dp.py @@ -47,11 +47,11 @@ def reduce(self, tensor, *args, **kwargs): if isinstance(tensor, Result): tensor.dp_reduce() - elif isinstance(tensor, torch.Tensor): - tensor = tensor.mean() + else: + def _reduce(tensor: torch.Tensor): + return tensor.float().mean() - elif isinstance(tensor, dict): - tensor = apply_to_collection(tensor, torch.Tensor, torch.mean) + tensor = apply_to_collection(tensor, torch.Tensor, _reduce) return tensor From b9b836c36804899e4748cbb8dc8ca20ca8490b5a Mon Sep 17 00:00:00 2001 From: thomas chaton Date: Thu, 4 Mar 2021 22:45:55 +0000 Subject: [PATCH 10/12] update --- pytorch_lightning/plugins/training_type/dp.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pytorch_lightning/plugins/training_type/dp.py b/pytorch_lightning/plugins/training_type/dp.py index a3a551573109a..076fafd778f7b 100644 --- a/pytorch_lightning/plugins/training_type/dp.py +++ b/pytorch_lightning/plugins/training_type/dp.py @@ -49,7 +49,8 @@ def reduce(self, tensor, *args, **kwargs): else: def _reduce(tensor: torch.Tensor): - return tensor.float().mean() + dtype_tensor = tensor.dtype + return tensor.float().mean().type(dtype_tensor) tensor = apply_to_collection(tensor, torch.Tensor, _reduce) From af0a85469dc7823494ffce5ba63757da091c0587 Mon Sep 17 00:00:00 2001 From: thomas chaton Date: Thu, 4 Mar 2021 22:47:41 +0000 Subject: 
[PATCH 11/12] resolve merge --- pytorch_lightning/plugins/training_type/dp.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/pytorch_lightning/plugins/training_type/dp.py b/pytorch_lightning/plugins/training_type/dp.py index e91a2730d3e91..076fafd778f7b 100644 --- a/pytorch_lightning/plugins/training_type/dp.py +++ b/pytorch_lightning/plugins/training_type/dp.py @@ -54,9 +54,6 @@ def _reduce(tensor: torch.Tensor): tensor = apply_to_collection(tensor, torch.Tensor, _reduce) - elif isinstance(tensor, dict): - tensor = apply_to_collection(tensor, torch.Tensor, torch.mean) - return tensor @property From b79156e0feec73f0da861ded63c6e037c6756e0e Mon Sep 17 00:00:00 2001 From: tchaton Date: Thu, 4 Mar 2021 22:49:24 +0000 Subject: [PATCH 12/12] update --- pytorch_lightning/plugins/training_type/dp.py | 3 ++- tests/accelerators/test_dp.py | 8 ++------ 2 files changed, 4 insertions(+), 7 deletions(-) diff --git a/pytorch_lightning/plugins/training_type/dp.py b/pytorch_lightning/plugins/training_type/dp.py index 076fafd778f7b..af8cfa7755974 100644 --- a/pytorch_lightning/plugins/training_type/dp.py +++ b/pytorch_lightning/plugins/training_type/dp.py @@ -48,8 +48,9 @@ def reduce(self, tensor, *args, **kwargs): tensor.dp_reduce() else: + def _reduce(tensor: torch.Tensor): - dtype_tensor = tensor.dtype + dtype_tensor = tensor.dtype return tensor.float().mean().type(dtype_tensor) tensor = apply_to_collection(tensor, torch.Tensor, _reduce) diff --git a/tests/accelerators/test_dp.py b/tests/accelerators/test_dp.py index f8608b184b95c..4736c6788c208 100644 --- a/tests/accelerators/test_dp.py +++ b/tests/accelerators/test_dp.py @@ -128,14 +128,10 @@ def test_dp_test(tmpdir): @RunIf(min_gpus=2) def test_dp_training_step_dict(tmpdir): """ - This test verify dp properly reduce dictionaries + This test verify dp properly reduce dictionaries """ - class TestModel(BoringModel): - def training_step(self, batch, batch_idx): - return super().training_step(batch, batch_idx) - - model = 
TestModel() + model = BoringModel() model.training_step_end = None trainer = pl.Trainer( default_root_dir=tmpdir,