Bugfix/all gather #5221

Merged on Jan 9, 2021 (37 commits)

Changes from 30 commits

Commits (37)
74e508e  resolve bug (Dec 21, 2020)
a7cdde8  add tests (tchaton, Dec 21, 2020)
34f5d34  add tests (tchaton, Dec 21, 2020)
b850968  resolve flake8 (tchaton, Dec 21, 2020)
0007566  update (Dec 22, 2020)
44d7c57  Merge branch 'bugfix/all_gather' of https://github.com/PyTorchLightni… (Dec 22, 2020)
9205af6  update (tchaton, Dec 22, 2020)
b7c5df9  remove globals (Dec 28, 2020)
a4b740e  Merge branch 'bugfix/all_gather' of https://github.com/PyTorchLightni… (Dec 28, 2020)
c42d4cf  typo (Dec 28, 2020)
a109502  Update pytorch_lightning/utilities/distributed.py (tchaton, Dec 28, 2020)
03b53d8  update (tchaton, Dec 28, 2020)
ee299a6  update (tchaton, Jan 4, 2021)
710c2ef  add suport int, float (Jan 4, 2021)
053c004  update (tchaton, Jan 4, 2021)
c34101c  Merge branch 'release/1.2-dev' into bugfix/all_gather (tchaton, Jan 4, 2021)
04924ae  resolve pep8 (tchaton, Jan 4, 2021)
cccce28  Update pytorch_lightning/core/lightning.py (tchaton, Jan 4, 2021)
f764832  Update tests/utilities/test_all_gather_grad.py (tchaton, Jan 4, 2021)
2df387b  update doc (tchaton, Jan 5, 2021)
8a906d3  Merge branch 'release/1.2-dev' into bugfix/all_gather (tchaton, Jan 6, 2021)
0ad2f3a  add bool and np.ndarray (tchaton, Jan 7, 2021)
ce32675  Merge branch 'bugfix/all_gather' of https://github.com/PyTorchLightni… (tchaton, Jan 7, 2021)
00813ef  resolve conflicts (tchaton, Jan 7, 2021)
832bb98  resolve conflicts (tchaton, Jan 7, 2021)
56e1e35  Merge branch 'release/1.2-dev' into bugfix/all_gather (tchaton, Jan 7, 2021)
a00a51c  resolve pep8 (tchaton, Jan 7, 2021)
1d27fd1  Merge branch 'bugfix/all_gather' of https://github.com/PyTorchLightni… (tchaton, Jan 7, 2021)
96b3d98  add changelog (tchaton, Jan 7, 2021)
e826acd  Merge branch 'release/1.2-dev' into bugfix/all_gather (tchaton, Jan 7, 2021)
5112f4b  Update pytorch_lightning/core/lightning.py (tchaton, Jan 8, 2021)
1f90f49  Merge branch 'release/1.2-dev' into bugfix/all_gather (tchaton, Jan 8, 2021)
da92045  Merge branch 'release/1.2-dev' into bugfix/all_gather (tchaton, Jan 8, 2021)
6e9141e  update (tchaton, Jan 9, 2021)
448040e  Merge branch 'release/1.2-dev' into bugfix/all_gather (tchaton, Jan 9, 2021)
46126a2  resolve bug (tchaton, Jan 9, 2021)
0df1a98  resolve flake8 (tchaton, Jan 9, 2021)

3 changes: 3 additions & 0 deletions CHANGELOG.md
@@ -30,6 +30,9 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
- Added `R2Score` metric ([#5241](https://github.com/PyTorchLightning/pytorch-lightning/pull/5241))


- Accelerator `all_gather` supports collection ([#5221](https://github.com/PyTorchLightning/pytorch-lightning/pull/5221))


### Changed

- `stat_scores` metric now calculates stat scores over all classes and gains new parameters, in line with the new `StatScores` metric ([#4839](https://github.com/PyTorchLightning/pytorch-lightning/pull/4839))
33 changes: 25 additions & 8 deletions pytorch_lightning/core/lightning.py
@@ -14,15 +14,16 @@

"""nn.Module with additional great features."""

from abc import ABC
from argparse import Namespace
import collections
import copy
from functools import partial
import inspect
import os
from pathlib import Path
import re
import tempfile
from abc import ABC
from argparse import Namespace
from pathlib import Path
from typing import Any, Callable, Dict, List, Optional, Tuple, Union

import torch
@@ -35,10 +36,12 @@
from pytorch_lightning.core.hooks import CheckpointHooks, DataHooks, ModelHooks
from pytorch_lightning.core.memory import ModelSummary
from pytorch_lightning.core.optimizer import LightningOptimizer
from pytorch_lightning.core.saving import ALLOWED_CONFIG_TYPES, PRIMITIVE_TYPES, ModelIO
from pytorch_lightning.core.saving import ALLOWED_CONFIG_TYPES, ModelIO, PRIMITIVE_TYPES
from pytorch_lightning.core.step_result import Result
from pytorch_lightning.utilities import rank_zero_warn
from pytorch_lightning.utilities.apply_func import apply_to_collection, convert_to_tensors
from pytorch_lightning.utilities.device_dtype_mixin import DeviceDtypeModuleMixin
from pytorch_lightning.utilities.distributed import all_gather_ddp_if_available
from pytorch_lightning.utilities.exceptions import MisconfigurationException
from pytorch_lightning.utilities.parsing import AttributeDict, collect_init_args, get_init_args

@@ -364,7 +367,12 @@ def __auto_choose_log_on_epoch(self, on_epoch):

return on_epoch

def all_gather(self, tensor: Union[torch.Tensor], group: Optional[Any] = None, sync_grads: bool = False):
def all_gather(
self,
data: Union[torch.Tensor, Dict, List, Tuple],
group: Optional[Any] = None,
sync_grads: bool = False,
):
r"""
Allows users to call ``self.all_gather()`` from the LightningModule, thus making
the ```all_gather``` operation accelerator agnostic.
@@ -373,14 +381,23 @@ def all_gather(self, tensor: Union[torch.Tensor], group: Optional[Any] = None, s
distributed processes

Args:
tensor: tensor of shape (batch, ...)
tensor: int, float, tensor of shape (batch, ...), or a collection of
int, float, tensor of shape (batch, ...)
group: the process group to gather results from. Defaults to all processes (world)
sync_grads: flag that allows users to synchronize gradients for all_gather op

Return:
A tensor of shape (world_size, batch, ...)
A tensor of shape (world_size, batch, ...), or if the input was a collection
the output will also be a collection with tensors of this shape.
"""
return self.trainer.accelerator_backend.all_gather(tensor, group=group, sync_grads=sync_grads)
if self.trainer.accelerator_backend is not None:
all_gather = self.trainer.accelerator_backend.all_gather
else:
all_gather = all_gather_ddp_if_available

data = convert_to_tensors(data, device=self.device)
all_gather = partial(all_gather, group=group, sync_grads=sync_grads)
return apply_to_collection(data, torch.Tensor, all_gather)

def forward(self, *args, **kwargs):
r"""
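For orientation, here is a minimal sketch (not part of the diff) of how the collection-aware `all_gather` is meant to be called from a LightningModule hook; `GatherExample` and the dict keys are illustrative, and the rest of the module is assumed to be defined as usual:

```python
import torch
from pytorch_lightning import LightningModule


class GatherExample(LightningModule):
    # Illustrative hook only; training_step, optimizers, etc. are assumed elsewhere.
    def training_epoch_end(self, outputs) -> None:
        losses = torch.stack([out["loss"] for out in outputs])
        # With this change, `self.all_gather` accepts ints, floats, np.ndarrays and
        # nested collections of them, not only a single tensor.
        gathered = self.all_gather({"losses": losses, "epoch": self.current_epoch})
        # Under DDP each gathered leaf gains a leading world_size dimension, e.g.
        # gathered["losses"].shape == (world_size, *losses.shape).
```

The output mirrors the input structure, with each leaf shaped `(world_size, batch, ...)` as described in the docstring above.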
41 changes: 32 additions & 9 deletions pytorch_lightning/utilities/apply_func.py
@@ -15,10 +15,13 @@
from abc import ABC
from collections.abc import Mapping, Sequence
from copy import copy
from typing import Any, Callable, Union, Optional
from functools import partial
from typing import Any, Callable, Optional, Union

import numpy as np
import torch

from pytorch_lightning.utilities.exceptions import MisconfigurationException
from pytorch_lightning.utilities.imports import _TORCHTEXT_AVAILABLE

if _TORCHTEXT_AVAILABLE:
@@ -27,11 +30,18 @@
Batch = type(None)


CONVERSION_DTYPES = [
(bool, torch.bool),
(int, torch.int),
(float, torch.float),
(np.ndarray, None),
]

Review comment (Contributor): alternatively you could define a callable (torch.from_numpy) here instead of specifically checking for ndarray below.


def apply_to_collection(data: Any, dtype: Union[type, tuple], function: Callable, *args,
wrong_dtype: Optional[Union[type, tuple]] = None, **kwargs) -> Any:
"""
Recursively applies a function to all elements of a certain dtype.

Args:
data: the collection to apply the function to
dtype: the given function will be applied to all elements of this dtype
@@ -40,10 +50,8 @@ def apply_to_collection(data: Any, dtype: Union[type, tuple], function: Callable
wrong_dtype: the given function won't be applied if this type is specified and the given collections is of
the :attr:`wrong_type` even if it is of type :attr`dtype`
**kwargs: keyword arguments (will be forwarded to calls of ``function``)

Returns:
the resulting collection

"""
elem_type = type(data)

@@ -67,9 +75,7 @@ def apply_to_collection(data: Any, dtype: Union[type, tuple], function: Callable
class TransferableDataType(ABC):
"""
A custom type for data that can be moved to a torch device via `.to(...)`.

Example:

>>> isinstance(dict, TransferableDataType)
False
>>> isinstance(torch.rand(2, 3), TransferableDataType)
@@ -96,15 +102,12 @@ def move_data_to_device(batch: Any, device: torch.device):
"""
Transfers a collection of data to the given device. Any object that defines a method
``to(device)`` will be moved and all other objects in the collection will be left untouched.

Args:
batch: A tensor or collection of tensors or anything that has a method `.to(...)`.
See :func:`apply_to_collection` for a list of supported collection types.
device: The device to which the data should be moved

Return:
the same collection but with all contained tensors residing on the new device.

See Also:
- :meth:`torch.Tensor.to`
- :class:`torch.device`
@@ -128,3 +131,23 @@ def batch_to(data):

dtype = (TransferableDataType, Batch) if _TORCHTEXT_AVAILABLE else TransferableDataType
return apply_to_collection(batch, dtype=dtype, function=batch_to)


def to_dtype_tensor(value, dtype:torch.dtype = None, device: torch.device = None):
if device is None:
raise MisconfigurationException(
"device (torch.device) should be provided."
)
if isinstance(value, np.ndarray):
return torch.from_numpy(value).to(device)
return torch.tensor(value, dtype=dtype, device=device)


def convert_to_tensors(data, device: torch.device = None):
if device is None:
raise MisconfigurationException(
"device (torch.device) should be provided."
)
for src_dtype, dst_dtype in CONVERSION_DTYPES:
data = apply_to_collection(data, src_dtype, partial(to_dtype_tensor, dtype=dst_dtype, device=device))
return data
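As a reading aid, a small sketch (not part of the diff) of how these helpers behave, together with the callable-based table the reviewer suggests above; the `batch` contents and the `CONVERTERS` name are illustrative:

```python
import numpy as np
import torch

from pytorch_lightning.utilities.apply_func import apply_to_collection, convert_to_tensors

device = torch.device("cpu")
batch = {"flag": True, "step": 3, "lr": 1e-3, "ids": np.array([1, 2, 3], dtype=np.int64)}

# convert_to_tensors maps bool -> torch.bool, int -> torch.int, float -> torch.float,
# and np.ndarray via torch.from_numpy (keeping the numpy dtype).
tensors = convert_to_tensors(batch, device=device)
assert tensors["flag"].dtype == torch.bool
assert tensors["step"].dtype == torch.int
assert tensors["lr"].dtype == torch.float
assert tensors["ids"].dtype == torch.int64

# The reviewer's alternative: one converter callable per source type instead of
# special-casing np.ndarray inside to_dtype_tensor (a sketch, not the merged code).
# Note bool must come before int, since bool is a subclass of int in Python.
CONVERTERS = [
    (bool, lambda v: torch.tensor(v, dtype=torch.bool, device=device)),
    (int, lambda v: torch.tensor(v, dtype=torch.int, device=device)),
    (float, lambda v: torch.tensor(v, dtype=torch.float, device=device)),
    (np.ndarray, lambda v: torch.from_numpy(v).to(device)),
]
converted = batch
for src_type, convert in CONVERTERS:
    converted = apply_to_collection(converted, src_type, convert)
assert converted["ids"].dtype == torch.int64
```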
2 changes: 1 addition & 1 deletion pytorch_lightning/utilities/distributed.py
@@ -22,7 +22,7 @@
from pytorch_lightning import _logger as log

if torch.distributed.is_available():
from torch.distributed import ReduceOp, group
from torch.distributed import group, ReduceOp
else:
class ReduceOp:
SUM = None
2 changes: 2 additions & 0 deletions tests/special_tests.sh
@@ -20,4 +20,6 @@ python ${DEFAULTS} tests/plugins/test_rpc_plugin.py::test_rpc_function_calls_ddp
python ${DEFAULTS} tests/plugins/test_ddp_sequential_plugin.py::test_ddp_sequential_plugin_ddp_rpc_manual
python ${DEFAULTS} tests/plugins/test_ddp_sequential_plugin.py::test_ddp_sequential_plugin_ddp_rpc_manual_amp
python ${DEFAULTS} tests/plugins/test_ddp_sequential_plugin.py::test_ddp_sequential_plugin_ddp_rpc_automatic
python ${DEFAULTS} tests/utilities/test_all_gather_grad.py::test_all_gather_properly_works
# python ${DEFAULTS} tests/plugins/test_ddp_sequential_plugin.py::test_ddp_sequential_plugin_ddp_rpc_with_wrong_balance
python ${DEFAULTS} tests/trainer/logging_tests/test_train_loop_logging_1_0.py::test_logging_sync_dist_true_ddp
55 changes: 54 additions & 1 deletion tests/utilities/test_all_gather_grad.py
@@ -1,9 +1,13 @@
import os
import pytest
import sys

import pytest
import torch
import numpy as np

from pytorch_lightning import Trainer, seed_everything
from pytorch_lightning.utilities import AllGatherGrad
from tests.base.boring_model import BoringModel


def setup_ddp(rank, world_size):
@@ -41,3 +45,52 @@ def _test_all_gather_ddp(rank, world_size):
def test_all_gather_ddp():
world_size = 3
torch.multiprocessing.spawn(_test_all_gather_ddp, args=(world_size,), nprocs=world_size)


@pytest.mark.skipif(sys.platform == "win32", reason="DDP not available on windows")
@pytest.mark.skipif(torch.cuda.device_count() < 2, reason="test requires multi-GPU machine")
@pytest.mark.skipif(not os.getenv("PL_RUNNING_SPECIAL_TESTS", '0') == '1',
reason="test should be run outside of pytest")
def test_all_gather_collection(tmpdir):

class TestModel(BoringModel):

training_epoch_end_called = False

def training_epoch_end(self, outputs) -> None:
self.training_epoch_end_called = True
losses = torch.stack([x["loss"] for x in outputs])
gathered_loss = self.all_gather({
"losses_np_ndarray": np.array([1, 2, 3]),
"losses_bool": [True, False],
"losses_float": [0., 1., 2.],
"losses_int": [0, 1, 2],
"losses": losses,
"losses_list": [losses, losses]
})
assert gathered_loss["losses_np_ndarray"][0].dtype == torch.int64
assert gathered_loss["losses_bool"][0].dtype == torch.bool
assert gathered_loss["losses_float"][0].dtype == torch.float
assert gathered_loss["losses_int"][0].dtype == torch.int
assert gathered_loss["losses_list"][0].numel() == 2 * len(losses)
assert gathered_loss["losses"].numel() == 2 * len(losses)

seed_everything(42)

model = TestModel()

limit_train_batches = 8
trainer = Trainer(
default_root_dir=tmpdir,
limit_train_batches=limit_train_batches,
limit_val_batches=2,
max_epochs=1,
log_every_n_steps=1,
accumulate_grad_batches=2,
enable_pl_optimizer=True,
gpus=2,
accelerator="ddp",
)

trainer.fit(model)
assert model.training_epoch_end_called
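Finally, a sketch (not part of the diff) of the pattern the new `LightningModule.all_gather` uses internally: convert the collection to tensors, then apply a gather function over it. The `fake_all_gather` stand-in simulates a world size of 2 so the snippet runs on a single process without DDP:

```python
import numpy as np
import torch

from pytorch_lightning.utilities.apply_func import apply_to_collection, convert_to_tensors


def fake_all_gather(tensor: torch.Tensor) -> torch.Tensor:
    # Stand-in for the real gather op: prepend a world_size dimension (here 2).
    return torch.stack([tensor, tensor])


data = {"loss": torch.rand(4), "step": 7, "mask": np.array([1, 0, 1], dtype=np.int64)}
tensors = convert_to_tensors(data, device=torch.device("cpu"))
gathered = apply_to_collection(tensors, torch.Tensor, fake_all_gather)

assert gathered["loss"].shape == (2, 4)  # (world_size, batch)
assert gathered["step"].shape == (2,)    # scalars gain only the world_size dimension
assert gathered["mask"].dtype == torch.int64
```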