Weekly Patch Release v.1.2.8 [full merge, no squash] #6983

Merged 22 commits on Apr 14, 2021

Commits
4ab5579
Fix EarlyStopping logic when min_epochs not met (#6705)
awaelchli Apr 6, 2021
bc25c23
CI: fixture for global rank variable reset (#6839)
awaelchli Apr 6, 2021
77fbfd9
[fix] Better support for rank_zero_only setting for SLURM and torchel…
ananthsub Apr 7, 2021
9799cfd
Fix csv extension check (#6436)
5n7-sk Apr 8, 2021
593ae70
Remove hardcoding of rank_zero_only.rank in accelerator connector (#6…
ananthsub Apr 8, 2021
9b6374f
Fix finetuning complex models correctly unfreezes. (#6880)
scart97 Apr 8, 2021
1769e28
Fix DDP_SPAWN compatibility with bug_report_model.py (#6892)
ethanwharris Apr 8, 2021
6f7cf59
TPUSpawn + IterableDataset error message (#6875)
ethanwharris Apr 8, 2021
78a6ad5
[Fix] Ensure we set the eval/train flag correctly on accelerator mode…
Apr 8, 2021
94eaef6
fix exception raising (#6901)
kandluis Apr 8, 2021
f895e9f
fix gpus default for Trainer.add_argparse_args (#6898)
awaelchli Apr 9, 2021
8245540
Fix TPU Spawn gather (#6896)
kaushikb11 Apr 9, 2021
d78d232
Fix ShardedDataParallel has no attribute require_backward_grad_sync (…
awaelchli Apr 10, 2021
5d347e9
Brew update to fix mac tests (#6970)
Apr 12, 2021
8028386
[CI] Drop brew update (#6985)
Apr 13, 2021
25b4075
Fix Checkpoint issue when using Horovod distributed backend (PyTorchL…
liob Apr 13, 2021
c348bfc
update docker base on PT 1.7 (#6931)
Borda Apr 13, 2021
3e77551
Add imports
Apr 13, 2021
782145e
Fix sync_dist for tpus (#6950)
kaushikb11 Apr 13, 2021
45b36c1
remove outdated test (removed on master)
awaelchli Apr 13, 2021
b8f7f56
Clean up environment access in plugins (#6941)
awaelchli Apr 13, 2021
6be2b03
v1.2.8 & Update CHANGELOG.md
Apr 14, 2021
26 changes: 26 additions & 0 deletions CHANGELOG.md
@@ -4,6 +4,32 @@ All notable changes to this project will be documented in this file.

The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).


## [1.2.8] - 2021-04-14

### Added

- Added TPUSpawn + IterableDataset error message ([#6875](https://github.com/PyTorchLightning/pytorch-lightning/pull/6875))

### Fixed

- Fixed process rank not being available right away after `Trainer` instantiation ([#6941](https://github.com/PyTorchLightning/pytorch-lightning/pull/6941))
- Fixed `sync_dist` for tpus ([#6950](https://github.com/PyTorchLightning/pytorch-lightning/pull/6950))
- Fixed `AttributeError` for `require_backward_grad_sync` when running manual optimization with sharded plugin ([#6915](https://github.com/PyTorchLightning/pytorch-lightning/pull/6915))
- Fixed `--gpus` default for parser returned by `Trainer.add_argparse_args` ([#6898](https://github.com/PyTorchLightning/pytorch-lightning/pull/6898))
- Fixed TPU Spawn all gather ([#6896](https://github.com/PyTorchLightning/pytorch-lightning/pull/6896))
- Fixed `EarlyStopping` logic when `min_epochs` or `min_steps` requirement is not met ([#6705](https://github.com/PyTorchLightning/pytorch-lightning/pull/6705))
- Fixed csv extension check ([#6436](https://github.com/PyTorchLightning/pytorch-lightning/pull/6436))
- Fixed checkpoint issue when using Horovod distributed backend ([#6958](https://github.com/PyTorchLightning/pytorch-lightning/pull/6958))
- Fixed tensorboard exception raising ([#6901](https://github.com/PyTorchLightning/pytorch-lightning/pull/6901))
- Fixed setting the eval/train flag correctly on accelerator model ([#6983](https://github.com/PyTorchLightning/pytorch-lightning/pull/6983))
- Fixed DDP_SPAWN compatibility with bug_report_model.py ([#6892](https://github.com/PyTorchLightning/pytorch-lightning/pull/6892))
- Fixed bug where `BaseFinetuning.flatten_modules()` was duplicating leaf node parameters ([#6879](https://github.com/PyTorchLightning/pytorch-lightning/pull/6879))
- Set better defaults for `rank_zero_only.rank` when training is launched with SLURM and torchelastic:
* Support SLURM and torchelastic global rank environment variables ([#5715](https://github.com/PyTorchLightning/pytorch-lightning/pull/5715))
* Remove hardcoding of local rank in accelerator connector ([#6878](https://github.com/PyTorchLightning/pytorch-lightning/pull/6878))


## [1.2.7] - 2021-04-06

### Fixed
48 changes: 7 additions & 41 deletions dockers/nvidia/Dockerfile
@@ -12,52 +12,17 @@
# See the License for the specific language governing permissions and
# limitations under the License.

FROM nvcr.io/nvidia/cuda:11.1.1-runtime-ubuntu20.04
# https://docs.nvidia.com/deeplearning/frameworks/pytorch-release-notes/rel_21-03.html#rel_21-03
FROM nvcr.io/nvidia/pytorch:20.12-py3

MAINTAINER PyTorchLightning <https://github.com/PyTorchLightning>

ARG LIGHTNING_VERSION=""

SHELL ["/bin/bash", "-c"]
# https://techoverflow.net/2019/05/18/how-to-fix-configuring-tzdata-interactive-input-when-building-docker-images/
ENV \
DEBIAN_FRONTEND=noninteractive \
TZ=Europe/Prague \
PATH="$PATH:/root/.local/bin" \
CUDA_TOOLKIT_ROOT_DIR="/usr/local/cuda" \
MKL_THREADING_LAYER=GNU

RUN apt-get update -qq && \
apt-get install -y --no-install-recommends \
build-essential \
python3 \
python3-distutils \
python3-dev \
pkg-config \
cmake \
git \
wget \
unzip \
ca-certificates \
&& \

# Cleaning
apt-get autoremove -y && \
apt-get clean && \
rm -rf /root/.cache && \
rm -rf /var/lib/apt/lists/* && \

# Setup PIP
update-alternatives --install /usr/bin/python python /usr/bin/python3 1 && \
wget https://bootstrap.pypa.io/get-pip.py --progress=bar:force:noscroll --no-check-certificate && \
python get-pip.py && \
rm get-pip.py && \
pip --version

COPY ./ /home/pytorch-lightning/
COPY ./ /workspace/pytorch-lightning/

RUN \
cd /home && \
cd /workspace && \
mv pytorch-lightning/notebooks . && \
mv pytorch-lightning/pl_examples . && \
# replace by specific version if asked
@@ -71,9 +36,10 @@ RUN \

# Installations
python -c "fname = './pytorch-lightning/requirements/extra.txt' ; lines = [line for line in open(fname).readlines() if not line.startswith('horovod')] ; open(fname, 'w').writelines(lines)" && \
pip install -r ./pytorch-lightning/requirements/extra.txt -U --no-cache-dir && \
pip install -r ./pytorch-lightning/requirements/examples.txt -U --no-cache-dir && \
pip install -r ./pytorch-lightning/requirements/extra.txt --no-cache-dir --upgrade-strategy only-if-needed && \
pip install -r ./pytorch-lightning/requirements/examples.txt --no-cache-dir --upgrade-strategy only-if-needed && \
pip install ./pytorch-lightning --no-cache-dir && \
pip install "Pillow>=8.1" "torchtext>=0.9.0" ipython[all] --no-cache-dir --upgrade-strategy only-if-needed && \
rm -rf pytorch-lightning

RUN python --version && \
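For readability, the python -c one-liner in the Dockerfile above, which strips the horovod requirement from extra.txt before installation, is equivalent to the following ordinary Python (same behaviour, nothing added):

# Rewrite extra.txt in place, dropping any requirement line that starts with 'horovod'.
fname = './pytorch-lightning/requirements/extra.txt'
lines = [line for line in open(fname).readlines() if not line.startswith('horovod')]
open(fname, 'w').writelines(lines)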
9 changes: 5 additions & 4 deletions pl_examples/bug_report_model.py
@@ -126,12 +126,13 @@ def configure_optimizers(self):
# args = parser.parse_args(opt)


def test_run():
class TestModel(BoringModel):

def on_train_epoch_start(self) -> None:
print('override any method to prove your bug')

class TestModel(BoringModel):

def on_train_epoch_start(self) -> None:
print('override any method to prove your bug')
def test_run():

# fake data
train_data = torch.utils.data.DataLoader(RandomDataset(32, 64))
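Context for the bug_report_model.py change above: ddp_spawn uses spawn-based multiprocessing, which pickles objects to ship them to worker processes, and a class defined inside a function body cannot be pickled by reference. Hoisting TestModel to module scope avoids that. A minimal sketch of the failure mode, independent of Lightning (the TopLevel and make_nested names are made up for illustration):

import pickle

class TopLevel:
    """Defined at module scope, so pickle can find it by qualified name."""

def make_nested():
    class Nested:
        """Defined inside a function; its qualified name contains '<locals>'."""
    return Nested()

pickle.dumps(TopLevel())  # works
try:
    pickle.dumps(make_nested())
except (pickle.PicklingError, AttributeError) as err:
    print(f"nested class cannot be pickled: {err}")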
46 changes: 27 additions & 19 deletions pytorch_lightning/accelerators/tpu.py
@@ -1,6 +1,18 @@
from typing import Any, Callable, Optional, Union
# Copyright The PyTorch Lightning team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from typing import Any, Callable, TYPE_CHECKING, Union

import torch
from torch.optim import Optimizer

from pytorch_lightning.accelerators.accelerator import Accelerator
@@ -16,10 +28,19 @@

xla_clip_grad_norm_ = clip_grad_norm_

if TYPE_CHECKING:
from pytorch_lightning.core.lightning import LightningModule
from pytorch_lightning.trainer.trainer import Trainer


class TPUAccelerator(Accelerator):

def setup(self, trainer, model):
def setup(self, trainer: 'Trainer', model: 'LightningModule') -> None:
"""
Raises:
MisconfigurationException:
If AMP is used with TPU, or if TPUs are not using a single TPU core or TPU spawn training.
"""
if isinstance(self.precision_plugin, MixedPrecisionPlugin):
raise MisconfigurationException(
"amp + tpu is not supported. "
@@ -30,24 +51,11 @@ def setup(self, trainer, model):
raise MisconfigurationException("TPUs only support a single tpu core or tpu spawn training.")
return super().setup(trainer, model)

def run_optimizer_step(self, optimizer: Optimizer, optimizer_idx: int, lambda_closure: Callable, **kwargs):
def run_optimizer_step(
self, optimizer: Optimizer, optimizer_idx: int, lambda_closure: Callable, **kwargs: Any
) -> None:
xm.optimizer_step(optimizer, barrier=False, optimizer_args={'closure': lambda_closure, **kwargs})

def all_gather(self, tensor: Union[torch.Tensor], group: Optional[Any] = None, sync_grads: bool = False):
"""
Function to gather a tensor from several distributed processes
Args:
tensor: tensor of shape (batch, ...)
group: not available with TPUs
sync_grads: not available with TPUs
Return:
A tensor of shape (world_size, batch, ...)
"""
# todo: Add support for backward with all_gather
if isinstance(self.training_type_plugin, TPUSpawnPlugin) and self.training_type_plugin.is_distributed:
return xm.all_gather(tensor).view(-1, *tensor.shape)
return tensor

def clip_gradients(self, optimizer: Optimizer, clip_val: Union[float, int], norm_type: float = 2.0):

model = self.lightning_module
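The TYPE_CHECKING block added to tpu.py is the usual pattern for annotating with types that would otherwise cause a circular import: the import only runs under a static type checker, and the quoted annotations are forward references that are never evaluated at runtime. A generic sketch of the pattern (the module and class names here are hypothetical, not the Lightning ones):

from typing import TYPE_CHECKING

if TYPE_CHECKING:
    # Seen only by type checkers such as mypy; never executed at runtime,
    # so it cannot introduce a circular import.
    from myproject.trainer import Trainer

class MyAccelerator:
    def setup(self, trainer: 'Trainer') -> None:
        # The quoted annotation is a forward reference, so 'Trainer' does not
        # need to be importable in this module at runtime.
        self._trainer = trainer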
8 changes: 2 additions & 6 deletions pytorch_lightning/callbacks/finetuning.py
@@ -22,7 +22,6 @@
import torch
from torch.nn import Module
from torch.nn.modules.batchnorm import _BatchNorm
from torch.nn.modules.container import Container, ModuleDict, ModuleList, Sequential
from torch.optim.optimizer import Optimizer

from pytorch_lightning.callbacks.base import Callback
@@ -102,11 +101,8 @@ def flatten_modules(modules: Union[Module, Iterable[Union[Module, Iterable]]]) -
else:
_modules = modules.modules()

return list(
filter(
lambda m: not isinstance(m, (Container, Sequential, ModuleDict, ModuleList, LightningModule)), _modules
)
)
# Leaf nodes in the graph have no children, so we use that to filter
return [m for m in _modules if not list(m.children())]

@staticmethod
def filter_params(
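The rewritten filter in flatten_modules keeps only leaf modules (modules with no children) instead of blacklisting known container types, which is what prevented leaf parameters from being returned twice through an unlisted container. A standalone sketch of the same idea, assuming plain torch.nn modules:

import torch.nn as nn

model = nn.Sequential(
    nn.Linear(4, 8),
    nn.Sequential(nn.Linear(8, 8), nn.ReLU()),  # a nested container
)

# Keep only leaf modules, i.e. anything without children. Containers such as
# nn.Sequential are skipped, so their parameters are not visited twice.
leaves = [m for m in model.modules() if not list(m.children())]
print(leaves)  # the two Linear layers and the ReLU, no containers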
10 changes: 5 additions & 5 deletions pytorch_lightning/core/hooks.py
@@ -150,13 +150,13 @@ def on_validation_model_eval(self) -> None:
"""
Sets the model to eval during the val loop
"""
self.eval()
self.trainer.model.eval()

def on_validation_model_train(self) -> None:
"""
Sets the model to train during the val loop
"""
self.train()
self.trainer.model.train()

def on_validation_batch_start(self, batch: Any, batch_idx: int, dataloader_idx: int) -> None:
"""
@@ -208,19 +208,19 @@ def on_test_model_train(self) -> None:
"""
Sets the model to train during the test loop
"""
self.train()
self.trainer.model.train()

def on_test_model_eval(self) -> None:
"""
Sets the model to eval during the test loop
"""
self.eval()
self.trainer.model.eval()

def on_predict_model_eval(self) -> None:
"""
Sets the model to eval during the predict loop
"""
self.eval()
self.trainer.model.eval()

def on_epoch_start(self) -> None:
"""
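The switch from self.eval() to self.trainer.model.eval() in the hooks above matters because nn.Module.eval() and .train() only propagate downward into child modules: flipping the inner LightningModule leaves any wrapper placed around it (for example a distributed wrapper) in its previous mode, while flipping the outermost module reaches everything. A minimal sketch of that asymmetry, using nn.Sequential as a stand-in for the wrapper:

import torch.nn as nn

inner = nn.Linear(2, 2)
wrapper = nn.Sequential(inner)  # stand-in for a DDP/sharded wrapper module

inner.eval()
print(inner.training, wrapper.training)   # False True  -> the wrapper flag is untouched

wrapper.train()                           # reset both to training mode
wrapper.eval()                            # flipping the outermost module...
print(inner.training, wrapper.training)   # False False -> ...recurses into children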
2 changes: 1 addition & 1 deletion pytorch_lightning/core/saving.py
@@ -136,7 +136,7 @@ def load_from_checkpoint(

if hparams_file is not None:
extension = hparams_file.split('.')[-1]
if extension.lower() in ('csv'):
if extension.lower() == 'csv':
hparams = load_hparams_from_tags_csv(hparams_file)
elif extension.lower() in ('yml', 'yaml'):
hparams = load_hparams_from_yaml(hparams_file)
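The one-character fix in load_from_checkpoint addresses a common Python gotcha: ('csv') is not a one-element tuple but just the string 'csv', so in performs substring matching rather than membership testing. A quick illustration:

# ('csv') is simply the string 'csv'; parentheses alone do not create a tuple.
print('cs' in ('csv'))    # True  -- substring match, not the intended check
print('cs' in ('csv',))   # False -- a real one-element tuple needs the trailing comma
print('cs' == 'csv')      # False -- the fix: direct equality against 'csv'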
5 changes: 3 additions & 2 deletions pytorch_lightning/core/step_result.py
@@ -22,7 +22,7 @@
from torch import Tensor
from torchmetrics import Metric

from pytorch_lightning.utilities.distributed import sync_ddp_if_available
from pytorch_lightning.utilities.distributed import sync_ddp_if_available, tpu_distributed


class Result(Dict):
@@ -139,10 +139,11 @@ def log(

# sync across workers when using distributed training
sync_fn = sync_fn or sync_ddp_if_available

if sync_dist and isinstance(value, (torch.Tensor, numbers.Number)):
is_dist_initialized = torch.distributed.is_available() and torch.distributed.is_initialized()
# TODO: Find a way to make the reduction only once, so we don't need to clone.
if is_dist_initialized and isinstance(value, torch.Tensor):
if (is_dist_initialized or tpu_distributed) and isinstance(value, torch.Tensor):
value = value.clone()
else:
value = torch.tensor(value, device=device, dtype=torch.float)
2 changes: 1 addition & 1 deletion pytorch_lightning/info.py
@@ -1,7 +1,7 @@
import time

_this_year = time.strftime("%Y")
__version__ = '1.2.7'
__version__ = '1.2.8'
__author__ = 'William Falcon et al.'
__author_email__ = 'waf2107@columbia.edu'
__license__ = 'Apache-2.0'
2 changes: 1 addition & 1 deletion pytorch_lightning/loggers/tensorboard.py
@@ -204,7 +204,7 @@ def log_metrics(self, metrics: Dict[str, float], step: Optional[int] = None) ->
# todo: specify the possible exception
except Exception as ex:
m = f'\n you tried to log {v} which is not currently supported. Try a dict or a scalar/tensor.'
type(ex)(ex.message + m)
raise ValueError(m) from ex

@rank_zero_only
def log_graph(self, model: LightningModule, input_array=None):
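The removed line constructed a new exception of the same type but never raised it, and Python 3 exceptions generally have no .message attribute, so it could not have worked as intended. The replacement raises a ValueError chained to the original error. A small sketch of the raise ... from ... pattern, with a stand-in for the logging call:

def log_scalar(value):
    try:
        float(value)  # stand-in for the underlying summary-writer call
    except Exception as ex:
        # 'from ex' attaches the original exception as __cause__, so the root
        # failure still shows up in the traceback.
        raise ValueError(
            f"you tried to log {value!r} which is not currently supported. "
            "Try a dict or a scalar/tensor."
        ) from ex

log_scalar({'a': 1})  # raises ValueError, chained to the original TypeError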
1 change: 1 addition & 0 deletions pytorch_lightning/plugins/environments/__init__.py
@@ -12,5 +12,6 @@
# See the License for the specific language governing permissions and
# limitations under the License.
from pytorch_lightning.plugins.environments.cluster_environment import ClusterEnvironment # noqa: F401
from pytorch_lightning.plugins.environments.lightning_environment import LightningEnvironment # noqa: F401
from pytorch_lightning.plugins.environments.slurm_environment import SLURMEnvironment # noqa: F401
from pytorch_lightning.plugins.environments.torchelastic_environment import TorchElasticEnvironment # noqa: F401
40 changes: 30 additions & 10 deletions pytorch_lightning/plugins/environments/cluster_environment.py
@@ -11,24 +11,44 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from abc import ABC, abstractmethod


class ClusterEnvironment:
class ClusterEnvironment(ABC):
""" Specification of a cluster environment. """

def __init__(self):
self._world_size = None
@abstractmethod
def creates_children(self) -> bool:
""" Whether the environment creates the subprocesses or not. """

def master_address(self):
pass
@abstractmethod
def master_address(self) -> str:
""" The master address through which all processes connect and communicate. """

def master_port(self):
pass
@abstractmethod
def master_port(self) -> int:
""" An open and configured port in the master node through which all processes communicate. """

@abstractmethod
def world_size(self) -> int:
return self._world_size
""" The number of processes across all devices and nodes. """

def local_rank(self) -> int:
@abstractmethod
def set_world_size(self, size: int) -> None:
pass

def node_rank(self) -> int:
@abstractmethod
def global_rank(self) -> int:
""" The rank (index) of the currently running process across all nodes and devices. """

@abstractmethod
def set_global_rank(self, rank: int) -> None:
pass

@abstractmethod
def local_rank(self) -> int:
""" The rank (index) of the currently running process inside of the current node. """

@abstractmethod
def node_rank(self) -> int:
""" The rank (index) of the node on which the current process runs. """