feat(wandb): log models as artifacts #6231

Merged (60 commits, May 27, 2021)

The diff shown below reflects changes from 51 of the 60 commits.

Commits:
bfb8872  feat(wandb): log models as artifacts (borisdayma, Feb 27, 2021)
541b001  feat: add Logger.connect (borisdayma, Feb 28, 2021)
bbd8633  fix: circular ref with type checking (borisdayma, Mar 1, 2021)
3365261  feat(wandb): use connect method (borisdayma, Mar 1, 2021)
dfd7553  style: pep8 (borisdayma, Mar 1, 2021)
6950d3d  fix(configure_logger): logger can be bool (borisdayma, Mar 1, 2021)
f9cc20f  feat(connect): Trainer is not optional (borisdayma, Mar 1, 2021)
c518d71  feat(configure_logger): make trainer a proxy (borisdayma, Mar 3, 2021)
9b9aaa6  fix: unused import (borisdayma, Mar 3, 2021)
eb2080d  docs: more explicit doc (borisdayma, Mar 3, 2021)
7d98a99  doc: update docstring (borisdayma, Mar 3, 2021)
a6ad9aa  feat: ModelCheckpoint metadata (borisdayma, Mar 3, 2021)
444a4eb  Merge branch 'master' into feat_artifacts (borisdayma, Mar 3, 2021)
52b642f  feat: 1 checkpoint = 1 artifact (borisdayma, Mar 4, 2021)
765d081  feat: proxy typing + apply suggestions (borisdayma, Mar 4, 2021)
49f3688  Merge branch 'master' into feat_artifacts (borisdayma, Mar 4, 2021)
4a55e46  feat: don't log same model twice (borisdayma, Mar 4, 2021)
f16231c  fix: typo (borisdayma, Mar 4, 2021)
cbbf8ff  feat: log artifacts during training (borisdayma, Mar 4, 2021)
123cd88  fix: docs build (borisdayma, Mar 4, 2021)
0822d5d  feat: use proxy ref (borisdayma, Mar 4, 2021)
ee5b1d1  Merge branch 'master' into feat_artifacts (borisdayma, Mar 4, 2021)
947ab7a  fix: mypy (borisdayma, Mar 4, 2021)
03af2c3  fix: unused import (borisdayma, Mar 4, 2021)
743903c  fix: continuous logging logic (borisdayma, Mar 4, 2021)
363b3ac  fix: formatting (borisdayma, Mar 5, 2021)
7e331c1  docs: update log_model (borisdayma, Mar 5, 2021)
b438940  docs(wandb): improve log_model (borisdayma, Mar 5, 2021)
0dc78cc  feat(wandb): more explicit artifact name (borisdayma, Mar 5, 2021)
78cfc7c  feat(wandb): simplify artifact name (borisdayma, Mar 5, 2021)
eeed466  docs(wandb): improve documentation (borisdayma, Mar 7, 2021)
5227329  Merge branch 'master' (borisdayma, Mar 7, 2021)
cc0fcd6  test: after_save_checkpoint called (borisdayma, Mar 7, 2021)
a71603d  docs(wandb): fix typo (borisdayma, Mar 7, 2021)
ded7204  test(wandb): test log_model (borisdayma, Mar 7, 2021)
1b88a5e  feat(wandb): min version (borisdayma, Mar 7, 2021)
4f35813  test(wandb): fix directory creation (borisdayma, Mar 7, 2021)
876dbee  docs: update CHANGELOG (borisdayma, Mar 8, 2021)
ba1e937  test(wandb): fix variable not defined (borisdayma, Mar 8, 2021)
9593557  Merge branch 'master' into feat_artifacts (borisdayma, Mar 8, 2021)
fe98f4f  feat: after_save_checkpoint on rank 0 only (borisdayma, Mar 9, 2021)
4b38fc4  Merge branch 'master' into feat_artifacts (borisdayma, Mar 10, 2021)
b59fdf1  Merge branch 'master' into feat_artifacts (borisdayma, Mar 11, 2021)
13a730b  Merge branch 'master' into feat_artifacts (borisdayma, Mar 12, 2021)
aa904ce  feat: handle new args of ModelCheckpoint (borisdayma, Mar 12, 2021)
27c49eb  test(wandb): check correct metadata (borisdayma, Mar 12, 2021)
e0a9578  tests(wandb): unused fixture (borisdayma, Mar 14, 2021)
bbf4683  Merge branch 'master' of https://github.com/PyTorchLightning/pytorch-… (borisdayma, Mar 14, 2021)
58193e8  feat: logger.after_save_checkpoint always exists (borisdayma, Mar 14, 2021)
fda377f  test: wandb fixture required (borisdayma, Mar 14, 2021)
ce6c912  Merge branch 'master' of https://github.com/PyTorchLightning/pytorch-… (borisdayma, Apr 1, 2021)
5e39044  Merge branch 'master' of https://github.com/PyTorchLightning/pytorch-… (borisdayma, Apr 8, 2021)
62d5cae  Merge branch 'master' of https://github.com/PyTorchLightning/pytorch-… (borisdayma, May 14, 2021)
0b7bb39  [pre-commit.ci] auto fixes from pre-commit.com hooks (pre-commit-ci[bot], May 14, 2021)
c06fc8f  test(wandb): parameter unset (borisdayma, May 14, 2021)
0ca8310  Merge branch 'master' of https://github.com/PyTorchLightning/pytorch-… (borisdayma, May 26, 2021)
0ca6abb  formatting (awaelchli, May 27, 2021)
f6f8f61  typo fix (awaelchli, May 27, 2021)
1faa389  fix typo in docs (awaelchli, May 27, 2021)
e0f302f  Merge branch 'master' into feat_artifacts (awaelchli, May 27, 2021)

3 changes: 3 additions & 0 deletions CHANGELOG.md
@@ -93,6 +93,9 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
- Changed `trainer.evaluating` to return `True` if validating or testing ([#4945](https://github.com/PyTorchLightning/pytorch-lightning/pull/4945))


- Changed `WandbLogger(log_model={True/'all'})` to log models as artifacts ([#6231](https://github.com/PyTorchLightning/pytorch-lightning/pull/6231))


- Changed `setup()` and `teardown()` stage argument to take any of `{fit,validate,test,predict}` ([#6386](https://github.com/PyTorchLightning/pytorch-lightning/pull/6386))


13 changes: 9 additions & 4 deletions docs/source/common/loggers.rst
@@ -202,7 +202,7 @@ The :class:`~pytorch_lightning.loggers.TestTubeLogger` is available anywhere exc
Weights and Biases
==================

`Weights and Biases <https://www.wandb.com/>`_ is a third-party logger.
`Weights and Biases <https://docs.wandb.ai/integrations/lightning/>`_ is a third-party logger.
To use :class:`~pytorch_lightning.loggers.WandbLogger` as your logger do the following.
First, install the package:

@@ -215,9 +215,14 @@ Then configure the logger and pass it to the :class:`~pytorch_lightning.trainer.
.. code-block:: python

from pytorch_lightning.loggers import WandbLogger
wandb_logger = WandbLogger(offline=True)

# instrument experiment with W&B
wandb_logger = WandbLogger(project='MNIST', log_model='all')
trainer = Trainer(logger=wandb_logger)

# log gradients and model topology
WandbLogger.watch(model)

The :class:`~pytorch_lightning.loggers.WandbLogger` is available anywhere except ``__init__`` in your
:class:`~pytorch_lightning.core.lightning.LightningModule`.

@@ -226,8 +231,8 @@ The :class:`~pytorch_lightning.loggers.WandbLogger` is available anywhere except
class MyModule(LightningModule):
def any_lightning_module_function_or_hook(self):
some_img = fake_image()
self.logger.experiment.log({
"generated_images": [wandb.Image(some_img, caption="...")]
self.log({
"generated_images": [wandb.Image(some_img, caption="...")]
})

Review thread on this snippet:

Contributor: Hey guys... we need to standardize this for all loggers, not just wandb. Let's sync up on this to make sure these changes aren't just for a single logger.

Contributor Author (borisdayma): Let me know if I need to update this on the other loggers as well. I was just trying to take advantage of this PR to clean up the docs, since I often get asked when to use self.log, self.logger.experiment.log, or even self.logger[0].experiment.log (I typically suggest just trying to use self.log).

Contributor: self.log_images could be the API used to log every image-related artifact.
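To make the distinction discussed above concrete, here is a minimal sketch of the two logging paths (the metric value and image tensor are placeholders; the commit=False argument follows the note from the existing wandb.py docstring):

import torch
import wandb
from pytorch_lightning import LightningModule


class MyModule(LightningModule):
    def validation_step(self, batch, batch_idx):
        # scalar metrics: self.log routes through every attached Lightning logger
        self.log("val_loss", torch.tensor(0.0))  # placeholder value

        # rich media (images, audio, tables): use the W&B run object directly;
        # commit=False keeps the W&B step from advancing ahead of the trainer
        some_img = torch.rand(3, 32, 32)  # placeholder image
        self.logger.experiment.log(
            {"generated_images": [wandb.Image(some_img, caption="sample")]},
            commit=False,
        )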

.. seealso::
5 changes: 5 additions & 0 deletions pytorch_lightning/callbacks/model_checkpoint.py
@@ -23,6 +23,7 @@
import re
from copy import deepcopy
from pathlib import Path
from weakref import proxy
from typing import Any, Dict, Optional, Union

import numpy as np
@@ -282,6 +283,10 @@ def save_checkpoint(self, trainer, unused: Optional = None):
# Mode 3: save last checkpoints
self._save_last_checkpoint(trainer, monitor_candidates)

# notify loggers
if trainer.is_global_zero and trainer.logger:
trainer.logger.after_save_checkpoint(proxy(self))

def _should_skip_saving_checkpoint(self, trainer) -> bool:
from pytorch_lightning.trainer.states import TrainerState
return (
15 changes: 15 additions & 0 deletions pytorch_lightning/loggers/base.py
@@ -19,13 +19,15 @@
from abc import ABC, abstractmethod
from argparse import Namespace
from functools import wraps
from weakref import ReferenceType
from typing import Any, Callable, Dict, Iterable, List, Mapping, MutableMapping, Optional, Sequence, Tuple, Union

import numpy as np
import torch

from pytorch_lightning.core.lightning import LightningModule
from pytorch_lightning.utilities import rank_zero_only
from pytorch_lightning.callbacks.model_checkpoint import ModelCheckpoint


def rank_zero_experiment(fn: Callable) -> Callable:
@@ -71,6 +73,15 @@ def __init__(
self._agg_key_funcs = agg_key_funcs if agg_key_funcs else {}
self._agg_default_func = agg_default_func

def after_save_checkpoint(self, checkpoint_callback: 'ReferenceType[ModelCheckpoint]') -> None:
"""
Called after model checkpoint callback saves a new checkpoint

Args:
checkpoint_callback: the model checkpoint callback instance
"""
pass

def update_agg_funcs(
self,
agg_key_funcs: Optional[Mapping[str, Callable[[Sequence[float]], float]]] = None,
@@ -357,6 +368,10 @@ def __init__(self, logger_iterable: Iterable[LightningLoggerBase]):
def __getitem__(self, index: int) -> LightningLoggerBase:
return [logger for logger in self._logger_iterable][index]

def after_save_checkpoint(self, checkpoint_callback: 'ReferenceType[ModelCheckpoint]') -> None:
for logger in self._logger_iterable:
logger.after_save_checkpoint(checkpoint_callback)

def update_agg_funcs(
self,
agg_key_funcs: Optional[Mapping[str, Callable[[Sequence[float]], float]]] = None,
83 changes: 69 additions & 14 deletions pytorch_lightning/loggers/wandb.py
@@ -17,18 +17,24 @@
"""
import os
from argparse import Namespace
from pathlib import Path
from typing import Any, Dict, Optional, Union
from weakref import ReferenceType
import operator

import torch.nn as nn

from pytorch_lightning.loggers.base import LightningLoggerBase, rank_zero_experiment
from pytorch_lightning.utilities import _module_available, rank_zero_only
from pytorch_lightning.utilities.imports import _compare_version
from pytorch_lightning.utilities.exceptions import MisconfigurationException
from pytorch_lightning.utilities.warnings import WarningCache
from pytorch_lightning.callbacks.model_checkpoint import ModelCheckpoint

warning_cache = WarningCache()

_WANDB_AVAILABLE = _module_available("wandb")
_WANDB_GREATER_EQUAL_0_10_22 = _compare_version("wandb", operator.ge, "0.10.22")

try:
import wandb
@@ -40,7 +46,7 @@

class WandbLogger(LightningLoggerBase):
r"""
Log using `Weights and Biases <https://www.wandb.com/>`_.
Log using `Weights and Biases <https://docs.wandb.ai/integrations/lightning>`_.

Install it with pip:

@@ -56,7 +62,15 @@ class WandbLogger(LightningLoggerBase):
version: Same as id.
anonymous: Enables or explicitly disables anonymous logging.
project: The name of the project to which this run will belong.
log_model: Save checkpoints in wandb dir to upload on W&B servers.
log_model: Log checkpoints created by :class:`~pytorch_lightning.callbacks.model_checkpoint.ModelCheckpoint`
as W&B artifacts.

* if ``log_model == 'all'``, checkpoints are logged during training.
* if ``log_model == True``, checkpoints are logged at the end of training, except when
:paramref:`~pytorch_lightning.callbacks.model_checkpoint.ModelCheckpoint.save_top_k` ``== -1``
which also logs every checkpoint during training.
* if ``log_model == False`` (default), no checkpoint is logged.

prefix: A string to put at the beginning of metric keys.
experiment: WandB experiment object. Automatically set when creating a run.
\**kwargs: Additional arguments like `entity`, `group`, `tags`, etc. used by
@@ -72,15 +86,16 @@ class WandbLogger(LightningLoggerBase):

from pytorch_lightning.loggers import WandbLogger
from pytorch_lightning import Trainer
wandb_logger = WandbLogger()

# instrument experiment with W&B
wandb_logger = WandbLogger(project='MNIST', log_model='all')
trainer = Trainer(logger=wandb_logger)

Note: When logging manually through `wandb.log` or `trainer.logger.experiment.log`,
make sure to use `commit=False` so the logging step does not increase.
# log gradients and model topology
WandbLogger.watch(model)

See Also:
- `Tutorial <https://colab.research.google.com/drive/16d1uctGaw2y9KhGBlINNTsWpmlXdJwRW?usp=sharing>`__
on how to use W&B with PyTorch Lightning
- `Demo in Google Colab <http://wandb.me/lightning>`__ with model logging
- `W&B Documentation <https://docs.wandb.ai/integrations/lightning>`__

"""
@@ -115,6 +130,13 @@ def __init__(
'Hint: Set `offline=False` to log your model.'
)

if log_model and not _WANDB_GREATER_EQUAL_0_10_22:
warning_cache.warn(
f'Providing log_model={log_model} requires wandb version >= 0.10.22'
' for logging associated model metadata.\n'
'Hint: Upgrade with `pip install --upgrade wandb`.'
)

if sync_step is not None:
warning_cache.warn(
"`WandbLogger(sync_step=(True|False))` is deprecated in v1.2.1 and will be removed in v1.5."
@@ -132,6 +154,8 @@ def __init__(
self._prefix = prefix
self._experiment = experiment
self._kwargs = kwargs
self._logged_model_time = {}
self._checkpoint_callback = None

def __getstate__(self):
state = self.__dict__.copy()
@@ -168,10 +192,6 @@ def experiment(self) -> Run:
**self._kwargs
) if wandb.run is None else wandb.run

# save checkpoints in wandb dir to upload on W&B servers
if self._save_dir is None:
self._save_dir = self._experiment.dir

# define default x-axis (for latest wandb versions)
if getattr(self._experiment, "define_metric", None):
self._experiment.define_metric("trainer/global_step")
@@ -213,8 +233,43 @@ def version(self) -> Optional[str]:
# don't create an experiment if we don't have one
return self._experiment.id if self._experiment else self._id

def after_save_checkpoint(self, checkpoint_callback: 'ReferenceType[ModelCheckpoint]') -> None:
# log checkpoints as artifacts
if self._log_model == 'all' or self._log_model is True and checkpoint_callback.save_top_k == -1:
self._scan_and_log_checkpoints(checkpoint_callback)
elif self._log_model is True:
self._checkpoint_callback = checkpoint_callback

@rank_zero_only
def finalize(self, status: str) -> None:
# upload all checkpoints from saving dir
if self._log_model:
wandb.save(os.path.join(self.save_dir, "*.ckpt"))
# log checkpoints as artifacts
if self._checkpoint_callback:
self._scan_and_log_checkpoints(self._checkpoint_callback)

def _scan_and_log_checkpoints(self, checkpoint_callback: 'ReferenceType[ModelCheckpoint]') -> None:
# get checkpoints to be saved with associated score
checkpoints = {
checkpoint_callback.last_model_path: checkpoint_callback.current_score,
checkpoint_callback.best_model_path: checkpoint_callback.best_model_score,
**checkpoint_callback.best_k_models}
checkpoints = sorted([(Path(p).stat().st_mtime, p, s)
for p, s in checkpoints.items() if Path(p).is_file()])
checkpoints = [c for c in checkpoints
if c[1] not in self._logged_model_time.keys() or self._logged_model_time[c[1]] < c[0]]

# log iteratively all new checkpoints
for t, p, s in checkpoints:
metadata = {'score': s, 'original_filename': Path(p).name,
'ModelCheckpoint': {k: getattr(checkpoint_callback, k) for k in [
'monitor', 'mode', 'save_last', 'save_top_k', 'save_weights_only', '_every_n_train_steps',
'_every_n_val_epochs']
# ensure it does not break if `ModelCheckpoint` args change
if hasattr(checkpoint_callback, k)}} if _WANDB_GREATER_EQUAL_0_10_22 else None
artifact = wandb.Artifact(name=f"model-{self.experiment.id}", type="model", metadata=metadata)
artifact.add_file(p, name='model.ckpt')
self.experiment.log_artifact(
artifact,
aliases=["latest", "best"] if p == checkpoint_callback.best_model_path
else ["latest"])
# remember logged models - timestamp needed in case filename didn't change (last.ckpt or custom name)
self._logged_model_time[p] = t
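
For context, a typical setup exercising the new code path might look like the sketch below (project name and ModelCheckpoint arguments are arbitrary examples, and `model` stands for any LightningModule):

from pytorch_lightning import Trainer
from pytorch_lightning.callbacks import ModelCheckpoint
from pytorch_lightning.loggers import WandbLogger

# every save from this callback triggers logger.after_save_checkpoint(...)
checkpoint_callback = ModelCheckpoint(monitor="val_loss", save_top_k=2)

# log_model='all': checkpoints are uploaded as W&B artifacts during training
# log_model=True: checkpoints are uploaded once, when finalize() runs at the end
wandb_logger = WandbLogger(project="MNIST", log_model="all")

trainer = Trainer(logger=wandb_logger, callbacks=[checkpoint_callback])
# trainer.fit(model)  # each new checkpoint becomes a version of the 'model-<run id>' artifact
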
5 changes: 5 additions & 0 deletions tests/loggers/test_base.py
@@ -60,6 +60,7 @@ def __init__(self):
self.hparams_logged = None
self.metrics_logged = {}
self.finalized = False
self.after_save_checkpoint_called = False

@property
def experiment(self):
@@ -93,6 +94,9 @@ def name(self):
def version(self):
return "1"

def after_save_checkpoint(self, checkpoint_callback):
self.after_save_checkpoint_called = True


def test_custom_logger(tmpdir):

@@ -116,6 +120,7 @@ def training_step(self, batch, batch_idx):
assert trainer.state == TrainerState.FINISHED, f"Training failed with {trainer.state}"
assert logger.hparams_logged == model.hparams
assert logger.metrics_logged != {}
assert logger.after_save_checkpoint_called
assert logger.finalized_status == "success"


65 changes: 56 additions & 9 deletions tests/loggers/test_wandb.py
@@ -25,14 +25,8 @@
from tests.helpers import BoringModel


def get_warnings(recwarn):
warnings_text = '\n'.join(str(w.message) for w in recwarn.list)
recwarn.clear()
return warnings_text


@mock.patch('pytorch_lightning.loggers.wandb.wandb')
def test_wandb_logger_init(wandb, recwarn):
def test_wandb_logger_init(wandb):
"""Verify that basic functionality of wandb logger works.
Wandb doesn't work well with pytest so we have to mock it out here."""

@@ -127,10 +121,8 @@ def test_wandb_logger_dirs_creation(wandb, tmpdir):

# mock return values of experiment
wandb.run = None
wandb.init().step = 0
logger.experiment.id = '1'
logger.experiment.project_name.return_value = 'project'
logger.experiment.step = 0

for _ in range(2):
_ = logger.experiment
@@ -151,6 +143,61 @@ def test_wandb_logger_dirs_creation(wandb, tmpdir):
assert trainer.log_dir == logger.save_dir


@mock.patch('pytorch_lightning.loggers.wandb.wandb')
def test_wandb_log_model(wandb, tmpdir):
""" Test that the logger creates the folders and files in the right place. """

Review thread on this test:

Contributor: Can we add a test for restarting an experiment?

Contributor Author (borisdayma), Mar 9, 2021: I'm not sure I understand. If you mean reloading a model, it is not done by the integration but by the wandb API (see the demo), and it interacts with the servers (to download the artifact). The logic of uploading a file and re-downloading the same file would probably be more specific to wandb itself.

If you want a full test (without mocking wandb), I think there were issues in the past which led to mocking all loggers, but I could try to recreate one.

wandb.run = None
model = BoringModel()

# test log_model=True
logger = WandbLogger(log_model=True)
logger.experiment.id = '1'
logger.experiment.project_name.return_value = 'project'
trainer = Trainer(default_root_dir=tmpdir, logger=logger, max_epochs=2, limit_train_batches=3, limit_val_batches=3)
trainer.fit(model)
wandb.init().log_artifact.assert_called_once()

# test log_model='all'
wandb.init().log_artifact.reset_mock()
wandb.init.reset_mock()
logger = WandbLogger(log_model='all')
logger.experiment.id = '1'
logger.experiment.project_name.return_value = 'project'
trainer = Trainer(default_root_dir=tmpdir, logger=logger, max_epochs=2, limit_train_batches=3, limit_val_batches=3)
trainer.fit(model)
assert wandb.init().log_artifact.call_count == 2

# test log_model=False
wandb.init().log_artifact.reset_mock()
wandb.init.reset_mock()
logger = WandbLogger(log_model=False)
logger.experiment.id = '1'
logger.experiment.project_name.return_value = 'project'
trainer = Trainer(default_root_dir=tmpdir, logger=logger, max_epochs=2, limit_train_batches=3, limit_val_batches=3)
trainer.fit(model)
assert not wandb.init().log_artifact.called

# test correct metadata
import pytorch_lightning.loggers.wandb as pl_wandb
pl_wandb._WANDB_GREATER_EQUAL_0_10_22 = True
wandb.init().log_artifact.reset_mock()
wandb.init.reset_mock()
wandb.Artifact.reset_mock()
logger = pl_wandb.WandbLogger(log_model=True)
logger.experiment.id = '1'
logger.experiment.project_name.return_value = 'project'
trainer = Trainer(default_root_dir=tmpdir, logger=logger, max_epochs=2, limit_train_batches=3, limit_val_batches=3)
trainer.fit(model)
wandb.Artifact.assert_called_once_with(name='model-1', type='model',
metadata={'score': None, 'original_filename': 'epoch=1-step=5-v3.ckpt',
'ModelCheckpoint': {'monitor': None, 'mode': 'min',
'save_last': None, 'save_top_k': None,
'save_weights_only': False,
'_every_n_train_steps': 0,
'_every_n_val_epochs': 1}})


def test_wandb_sanitize_callable_params(tmpdir):
"""
Callback functions are not serializable. Therefore, we give them a chance to return