Commit 65ab8d9

Merge branch 'master' into bugfix/closure-result

tchaton authored Sep 10, 2021
2 parents cdf5edd + c963bf6 commit 65ab8d9
Showing 28 changed files with 414 additions and 317 deletions.
7 changes: 1 addition & 6 deletions .github/CODEOWNERS
@@ -21,10 +21,10 @@
# Packages
/pytorch_lightning/accelerators @williamfalcon @tchaton @SeanNaren @awaelchli @justusschock @kaushikb11
/pytorch_lightning/callbacks @williamfalcon @tchaton @carmocca @borda @kaushikb11
/pytorch_lightning/cluster_environments @borda @tchaton @SeanNaren @carmocca @kaushikb11
/pytorch_lightning/core @tchaton @SeanNaren @borda @carmocca @justusschock @kaushikb11
/pytorch_lightning/distributed @williamfalcon @tchaton @awaelchli @kaushikb11
/pytorch_lightning/loggers @tchaton @awaelchli @borda
/pytorch_lightning/loggers/wandb.py @borisdayma
/pytorch_lightning/loops @tchaton @awaelchli @justusschock @carmocca
/pytorch_lightning/overrides @tchaton @SeanNaren @borda
/pytorch_lightning/plugins @tchaton @SeanNaren @awaelchli @justusschock
@@ -38,11 +38,6 @@
/pytorch_lightning/trainer/connectors/logger_connector @tchaton @carmocca
/pytorch_lightning/trainer/progress.py @tchaton @awaelchli @carmocca

# Metrics
/pytorch_lightning/metrics/ @SkafteNicki @ananyahjha93 @justusschock
/tests/metrics/ @SkafteNicki @ananyahjha93 @justusschock
/docs/source/metrics.rst @SkafteNicki @ananyahjha93 @justusschock

# API
/pytorch_lightning/callbacks/base.py @williamfalcon @awaelchli @ananthsub @carmocca
/pytorch_lightning/core/datamodule.py @williamFalcon @awaelchli @ananthsub @carmocca
17 changes: 16 additions & 1 deletion CHANGELOG.md
@@ -59,6 +59,8 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
* Added Fault Tolerant Training to `DataFetcher` ([#8891](https://github.com/PyTorchLightning/pytorch-lightning/pull/8891))
* Replaced old prefetch iterator with new `DataFetcher` in training loop ([#8953](https://github.com/PyTorchLightning/pytorch-lightning/pull/8953))
* Added partial support for global random state fault-tolerance in map-style datasets ([#8950](https://github.com/PyTorchLightning/pytorch-lightning/pull/8950))
* Converted state to tuple explicitly when setting Python random state ([#9401](https://github.com/PyTorchLightning/pytorch-lightning/pull/9401))


- Checkpoint saving & loading extensibility:
* Added `CheckpointIO` to expose checkpoint IO from training type plugin ([#8743](https://github.com/PyTorchLightning/pytorch-lightning/pull/8743))
@@ -107,9 +109,13 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).

- Added `on_exception` callback hook ([#9183](https://github.com/PyTorchLightning/pytorch-lightning/pull/9183))


- Add a warning to deepspeed when inferring batch size ([#9221](https://github.com/PyTorchLightning/pytorch-lightning/pull/9221))


- Added `inference_mode` for evaluation and prediction ([#8813](https://github.com/PyTorchLightning/pytorch-lightning/pull/8813))


### Changed

- Parsing of the `gpus` Trainer argument has changed: `gpus="n"` (str) no longer selects the GPU index n and instead selects the first n devices. ([#8770](https://github.com/PyTorchLightning/pytorch-lightning/pull/8770))
@@ -173,6 +179,9 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
- Deprecated `DataModule` properties: `train_transforms`, `val_transforms`, `test_transforms`, `size`, `dims` ([#8851](https://github.com/PyTorchLightning/pytorch-lightning/pull/8851))


- Deprecated `LightningModule.get_progress_bar_dict` and `Trainer.progress_bar_dict` in favor of `pytorch_lightning.callbacks.progress.base.get_standard_metrics` and `ProgressBarBase.get_metrics` ([#8985](https://github.com/PyTorchLightning/pytorch-lightning/pull/8985))


- Deprecated `prepare_data_per_node` flag on Trainer and set it as a property of `DataHooks`, accessible in the `LightningModule` and `LightningDataModule` ([#8958](https://github.com/PyTorchLightning/pytorch-lightning/pull/8958))


@@ -289,7 +298,7 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
- Fixed `EarlyStopping` running on train epoch end when `check_val_every_n_epoch>1` is set ([#9156](https://github.com/PyTorchLightning/pytorch-lightning/pull/9156))


- Fixed an issue with logger outputs not being finalized correctly after prediction runs ([#8333](https://github.com/PyTorchLightning/pytorch-lightning/issues/8333))
- Fixed an issue with logger outputs not being finalized correctly after prediction runs ([#8685](https://github.com/PyTorchLightning/pytorch-lightning/pull/8685))


- Fixed the Apex and DeepSpeed plugin closure running after the `on_before_optimizer_step` hook ([#9288](https://github.com/PyTorchLightning/pytorch-lightning/issues/9288))
@@ -319,12 +328,18 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
- Fixed incorrect main progress bar indicator when resuming training mid-epoch ([#9310](https://github.com/PyTorchLightning/pytorch-lightning/pull/9310))


- Fixed logging of nan parameters ([#9364](https://github.com/PyTorchLightning/pytorch-lightning/pull/9364))


- Fixed `replace_sampler` missing the batch size under specific conditions ([#9367](https://github.com/PyTorchLightning/pytorch-lightning/pull/9367))


- Fixed bug where the training step output needed to be `deepcopy`-ed ([#9349](https://github.com/PyTorchLightning/pytorch-lightning/pull/9349))


- Fixed freeing data iterators in loop `on_run_end` ([#9386](https://github.com/PyTorchLightning/pytorch-lightning/pull/9386))


## [1.4.5] - 2021-08-31

- Fixed reduction using `self.log(sync_dict=True, reduce_fx={mean,max})` ([#9142](https://github.com/PyTorchLightning/pytorch-lightning/pull/9142))
4 changes: 2 additions & 2 deletions CITATION.cff
@@ -4,8 +4,8 @@ title: "PyTorch Lightning"
abstract: "The lightweight PyTorch wrapper for high-performance AI research. Scale your models, not the boilerplate."
date-released: 2019-03-30
authors:
- family-names: "William"
given-names: "Falcon"
- family-names: "Falcon"
given-names: "William"
- name: "The PyTorch Lightning team"
version: 1.4
doi: 10.5281/zenodo.3828935
6 changes: 0 additions & 6 deletions docs/source/common/lightning_module.rst
@@ -1242,12 +1242,6 @@ backward
.. automethod:: pytorch_lightning.core.lightning.LightningModule.backward
:noindex:

get_progress_bar_dict
~~~~~~~~~~~~~~~~~~~~~

.. automethod:: pytorch_lightning.core.lightning.LightningModule.get_progress_bar_dict
:noindex:

on_before_backward
~~~~~~~~~~~~~~~~~~

6 changes: 3 additions & 3 deletions docs/source/extensions/logging.rst
@@ -245,13 +245,13 @@ Modifying the progress bar

The progress bar by default already includes the training loss and version number of the experiment
if you are using a logger. These defaults can be customized by overriding the
:func:`~pytorch_lightning.core.lightning.LightningModule.get_progress_bar_dict` hook in your module.
:func:`~pytorch_lightning.callbacks.base.ProgressBarBase.get_metrics` hook in your module.

.. code-block:: python
def get_progress_bar_dict(self):
def get_metrics(self):
# don't show the version number
items = super().get_progress_bar_dict()
items = super().get_metrics()
items.pop("v_num", None)
return items
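
For context on the replacement hook, here is a minimal usage sketch that is not part of this commit: it assumes the override lives on a progress bar callback subclass (the class name `NoVersionProgressBar` is invented) and uses the `get_metrics(trainer, pl_module)` signature added to `ProgressBarBase` further down in this diff.

from pytorch_lightning import Trainer
from pytorch_lightning.callbacks import ProgressBar


class NoVersionProgressBar(ProgressBar):
    def get_metrics(self, trainer, pl_module):
        # start from the default metrics and drop the experiment version number
        items = super().get_metrics(trainer, pl_module)
        items.pop("v_num", None)
        return items


trainer = Trainer(callbacks=[NoVersionProgressBar()])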
37 changes: 4 additions & 33 deletions pytorch_lightning/accelerators/accelerator.py
@@ -173,15 +173,7 @@ def batch_to_device(self, batch: Any, device: Optional[torch.device] = None, dat
def training_step(self, step_kwargs: Dict[str, Union[Any, int]]) -> STEP_OUTPUT:
"""The actual training step.
Args:
step_kwargs: the arguments for the models training step. Can consist of the following:
- batch (:class:`~torch.Tensor` | (:class:`~torch.Tensor`, ...) | [:class:`~torch.Tensor`, ...]):
The output of your :class:`~torch.utils.data.DataLoader`. A tensor, tuple or list.
- batch_idx (int): Integer displaying index of this batch
- optimizer_idx (int): When using multiple optimizers, this argument will also be present.
- hiddens(:class:`~torch.Tensor`): Passed in if
:paramref:`~pytorch_lightning.core.lightning.LightningModule.truncated_bptt_steps` > 0.
See :meth:`~pytorch_lightning.core.lightning.LightningModule.training_step` for more details
"""
with self.precision_plugin.train_step_context():
return self.training_type_plugin.training_step(*step_kwargs.values())
@@ -192,44 +184,23 @@ def post_training_step(self) -> None:
def validation_step(self, step_kwargs: Dict[str, Union[Any, int]]) -> Optional[STEP_OUTPUT]:
"""The actual validation step.
Args:
step_kwargs: the arguments for the models validation step. Can consist of the following:
- batch (:class:`~torch.Tensor` | (:class:`~torch.Tensor`, ...) | [:class:`~torch.Tensor`, ...]):
The output of your :class:`~torch.utils.data.DataLoader`. A tensor, tuple or list.
- batch_idx (int): The index of this batch
- dataloader_idx (int): The index of the dataloader that produced this batch
(only if multiple val dataloaders used)
See :meth:`~pytorch_lightning.core.lightning.LightningModule.validation_step` for more details
"""
with self.precision_plugin.val_step_context():
return self.training_type_plugin.validation_step(*step_kwargs.values())

def test_step(self, step_kwargs: Dict[str, Union[Any, int]]) -> Optional[STEP_OUTPUT]:
"""The actual test step.
Args:
step_kwargs: the arguments for the models test step. Can consist of the following:
- batch (:class:`~torch.Tensor` | (:class:`~torch.Tensor`, ...) | [:class:`~torch.Tensor`, ...]):
The output of your :class:`~torch.utils.data.DataLoader`. A tensor, tuple or list.
- batch_idx (int): The index of this batch.
- dataloader_idx (int): The index of the dataloader that produced this batch
(only if multiple test dataloaders used).
See :meth:`~pytorch_lightning.core.lightning.LightningModule.test_step` for more details
"""
with self.precision_plugin.test_step_context():
return self.training_type_plugin.test_step(*step_kwargs.values())

def predict_step(self, step_kwargs: Dict[str, Union[Any, int]]) -> STEP_OUTPUT:
"""The actual predict step.
Args:
step_kwargs: the arguments for the models predict step. Can consist of the following:
- batch (:class:`~torch.Tensor` | (:class:`~torch.Tensor`, ...) | [:class:`~torch.Tensor`, ...]):
The output of your :class:`~torch.utils.data.DataLoader`. A tensor, tuple or list.
- batch_idx (int): The index of this batch.
- dataloader_idx (int): The index of the dataloader that produced this batch
(only if multiple predict dataloaders used).
See :meth:`~pytorch_lightning.core.lightning.LightningModule.predict_step` for more details
"""
with self.precision_plugin.predict_step_context():
return self.training_type_plugin.predict_step(*step_kwargs.values())
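
To make the `*step_kwargs.values()` pattern above concrete, here is a hedged, self-contained sketch that is not part of this commit; the `build_step_kwargs` helper and the `ToyTrainingTypePlugin` stub are invented for illustration only.

from collections import OrderedDict
from typing import Any, Dict, Optional, Union


def build_step_kwargs(
    batch: Any, batch_idx: int, optimizer_idx: Optional[int] = None, hiddens: Any = None
) -> Dict[str, Union[Any, int]]:
    # Insertion order matters: the accelerator forwards *step_kwargs.values()
    # positionally, so the keys must follow the LightningModule step signature.
    step_kwargs = OrderedDict(batch=batch, batch_idx=batch_idx)
    if optimizer_idx is not None:
        step_kwargs["optimizer_idx"] = optimizer_idx
    if hiddens is not None:
        step_kwargs["hiddens"] = hiddens
    return step_kwargs


class ToyTrainingTypePlugin:
    # Stand-in for a training type plugin; only the call pattern matters here.
    def training_step(self, batch, batch_idx, optimizer_idx=None):
        return {"loss": 0.0}


plugin = ToyTrainingTypePlugin()
kwargs = build_step_kwargs(batch=[1, 2, 3], batch_idx=0, optimizer_idx=0)
output = plugin.training_step(*kwargs.values())  # mirrors the accelerator calls above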
71 changes: 71 additions & 0 deletions pytorch_lightning/callbacks/progress/base.py
@@ -11,7 +11,11 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from typing import Dict, Union

import pytorch_lightning as pl
from pytorch_lightning.callbacks import Callback
from pytorch_lightning.utilities import rank_zero_warn


class ProgressBarBase(Callback):
@@ -177,3 +181,70 @@ def on_predict_epoch_start(self, trainer, pl_module):

def on_predict_batch_end(self, trainer, pl_module, outputs, batch, batch_idx, dataloader_idx):
self._predict_batch_idx += 1

def get_metrics(self, trainer: "pl.Trainer", pl_module: "pl.LightningModule") -> Dict[str, Union[int, str]]:
r"""
Combines progress bar metrics collected from the trainer with standard metrics from get_standard_metrics.
Implement this to override the items displayed in the progress bar.
Here is an example of how to override the defaults:
.. code-block:: python
def get_metrics(self, trainer, model):
# don't show the version number
items = super().get_metrics(trainer, model)
items.pop("v_num", None)
return items
Return:
Dictionary with the items to be displayed in the progress bar.
"""
standard_metrics = pl_module.get_progress_bar_dict()
pbar_metrics = trainer.progress_bar_metrics
duplicates = list(standard_metrics.keys() & pbar_metrics.keys())
if duplicates:
rank_zero_warn(
f"The progress bar already tracks a metric with the name(s) '{', '.join(duplicates)}' and"
f" `self.log('{duplicates[0]}', ..., prog_bar=True)` will overwrite this value. "
" If this is undesired, change the name or override `get_metrics()` in the progress bar callback.",
UserWarning,
)

return {**standard_metrics, **pbar_metrics}


def get_standard_metrics(trainer: "pl.Trainer", pl_module: "pl.LightningModule") -> Dict[str, Union[int, str]]:
r"""
Returns several standard metrics displayed in the progress bar, including the average loss value,
split index of BPTT (if used) and the version of the experiment when using a logger.
.. code-block::
Epoch 1: 4%|▎ | 40/1095 [00:03<01:37, 10.84it/s, loss=4.501, v_num=10]
Return:
Dictionary with the standard metrics to be displayed in the progress bar.
"""
# call .item() only once but store elements without graphs
running_train_loss = trainer.fit_loop.running_loss.mean()
avg_training_loss = None
if running_train_loss is not None:
avg_training_loss = running_train_loss.cpu().item()
elif pl_module.automatic_optimization:
avg_training_loss = float("NaN")

items_dict = {}
if avg_training_loss is not None:
items_dict["loss"] = f"{avg_training_loss:.3g}"

if pl_module.truncated_bptt_steps > 0:
items_dict["split_idx"] = trainer.fit_loop.split_idx

if trainer.logger is not None and trainer.logger.version is not None:
version = trainer.logger.version
# show last 4 places of long version strings
version = version[-4:] if isinstance(version, str) else version
items_dict["v_num"] = version

return items_dict
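
As a small illustration of the merge semantics in `get_metrics` above (the values below are invented): metrics logged with `prog_bar=True` take precedence over the standard entries, which is exactly the situation the `rank_zero_warn` call flags.

standard_metrics = {"loss": "0.523", "v_num": 10}  # as returned by get_standard_metrics
pbar_metrics = {"loss": 0.498, "acc": 0.91}  # as held in trainer.progress_bar_metrics

merged = {**standard_metrics, **pbar_metrics}
assert merged == {"loss": 0.498, "v_num": 10, "acc": 0.91}  # the logged "loss" wins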
13 changes: 10 additions & 3 deletions pytorch_lightning/callbacks/progress/rich_progress.py
@@ -46,8 +46,9 @@ def render(self, task) -> RenderableType:
class MetricsTextColumn(ProgressColumn):
"""A column containing text."""

def __init__(self, trainer, stage):
def __init__(self, trainer, pl_module, stage):
self._trainer = trainer
self._pl_module = pl_module
self._stage = stage
self._tasks = {}
self._current_task_id = 0
@@ -64,7 +65,13 @@ def render(self, task) -> Text:
if self._trainer.training and task.id != self._current_task_id:
return self._tasks[task.id]
_text = ""
for k, v in self._trainer.progress_bar_dict.items():
# TODO(@daniellepintz): make this code cleaner
progress_bar_callback = getattr(self._trainer, "progress_bar_callback", None)
if progress_bar_callback:
metrics = self._trainer.progress_bar_callback.get_metrics(self._trainer, self._pl_module)
else:
metrics = self._trainer.progress_bar_metrics
for k, v in metrics.items():
_text += f"{k}: {round(v, 3) if isinstance(v, float) else v} "
text = Text.from_markup(_text, style=None, justify="left")
return text
@@ -163,7 +170,7 @@ def setup(self, trainer, pl_module, stage):
"[",
CustomTimeColumn(),
ProcessingSpeedColumn(),
MetricsTextColumn(trainer, stage),
MetricsTextColumn(trainer, pl_module, stage),
"]",
console=self.console,
refresh_per_second=self.refresh_rate,
4 changes: 2 additions & 2 deletions pytorch_lightning/callbacks/progress/tqdm_progress.py
@@ -237,7 +237,7 @@ def on_train_batch_end(self, trainer, pl_module, outputs, batch, batch_idx, data
total_batches = convert_inf(total_batches)
if self._should_update(self.train_batch_idx, total_batches):
self._update_bar(self.main_progress_bar)
self.main_progress_bar.set_postfix(trainer.progress_bar_dict)
self.main_progress_bar.set_postfix(self.get_metrics(trainer, pl_module))

def on_validation_start(self, trainer, pl_module):
super().on_validation_start(trainer, pl_module)
@@ -257,7 +257,7 @@ def on_validation_batch_end(self, trainer, pl_module, outputs, batch, batch_idx,
def on_validation_end(self, trainer, pl_module):
super().on_validation_end(trainer, pl_module)
if self.main_progress_bar is not None:
self.main_progress_bar.set_postfix(trainer.progress_bar_dict)
self.main_progress_bar.set_postfix(self.get_metrics(trainer, pl_module))
self.val_progress_bar.close()

def on_train_end(self, trainer, pl_module):