Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat(wandb): log models as artifacts #6231

Merged
merged 60 commits into master from feat_artifacts
May 27, 2021
Merged
Show file tree
Hide file tree
Changes from 14 commits
Commits
Show all changes
60 commits
Select commit Hold shift + click to select a range
bfb8872
feat(wandb): log models as artifacts
borisdayma Feb 27, 2021
541b001
feat: add Logger.connect
borisdayma Feb 28, 2021
bbd8633
fix: circular ref with type checking
borisdayma Mar 1, 2021
3365261
feat(wandb): use connect method
borisdayma Mar 1, 2021
dfd7553
style: pep8
borisdayma Mar 1, 2021
6950d3d
fix(configure_logger): logger can be bool
borisdayma Mar 1, 2021
f9cc20f
feat(connect): Trainer is not optional
borisdayma Mar 1, 2021
c518d71
feat(configure_logger): make trainer a proxy
borisdayma Mar 3, 2021
9b9aaa6
fix: unused import
borisdayma Mar 3, 2021
eb2080d
docs: more explicit doc
borisdayma Mar 3, 2021
7d98a99
doc: update docstring
borisdayma Mar 3, 2021
a6ad9aa
feat: ModelCheckpoint metadata
borisdayma Mar 3, 2021
444a4eb
Merge branch 'master' into feat_artifacts
borisdayma Mar 3, 2021
52b642f
feat: 1 checkpoint = 1 artifact
borisdayma Mar 4, 2021
765d081
feat: proxy typing + apply suggestions
borisdayma Mar 4, 2021
49f3688
Merge branch 'master' into feat_artifacts
borisdayma Mar 4, 2021
4a55e46
feat: don't log same model twice
borisdayma Mar 4, 2021
f16231c
fix: typo
borisdayma Mar 4, 2021
cbbf8ff
feat: log artifacts during training
borisdayma Mar 4, 2021
123cd88
fix: docs build
borisdayma Mar 4, 2021
0822d5d
feat: use proxy ref
borisdayma Mar 4, 2021
ee5b1d1
Merge branch 'master' into feat_artifacts
borisdayma Mar 4, 2021
947ab7a
fix: mypy
borisdayma Mar 4, 2021
03af2c3
fix: unused import
borisdayma Mar 4, 2021
743903c
fix: continuous logging logic
borisdayma Mar 4, 2021
363b3ac
fix: formatting
borisdayma Mar 5, 2021
7e331c1
docs: update log_model
borisdayma Mar 5, 2021
b438940
docs(wandb): improve log_model
borisdayma Mar 5, 2021
0dc78cc
feat(wandb): more explicit artifact name
borisdayma Mar 5, 2021
78cfc7c
feat(wandb): simplify artifact name
borisdayma Mar 5, 2021
eeed466
docs(wandb): improve documentation
borisdayma Mar 7, 2021
5227329
Merge branch 'master'
borisdayma Mar 7, 2021
cc0fcd6
test: after_save_checkpoint called
borisdayma Mar 7, 2021
a71603d
docs(wandb): fix typo
borisdayma Mar 7, 2021
ded7204
test(wandb): test log_model
borisdayma Mar 7, 2021
1b88a5e
feat(wandb): min version
borisdayma Mar 7, 2021
4f35813
test(wandb): fix directory creation
borisdayma Mar 7, 2021
876dbee
docs: update CHANGELOG
borisdayma Mar 8, 2021
ba1e937
test(wandb): fix variable not defined
borisdayma Mar 8, 2021
9593557
Merge branch 'master' into feat_artifacts
borisdayma Mar 8, 2021
fe98f4f
feat: after_save_checkpoint on rank 0 only
borisdayma Mar 9, 2021
4b38fc4
Merge branch 'master' into feat_artifacts
borisdayma Mar 10, 2021
b59fdf1
Merge branch 'master' into feat_artifacts
borisdayma Mar 11, 2021
13a730b
Merge branch 'master' into feat_artifacts
borisdayma Mar 12, 2021
aa904ce
feat: handle new args of ModelCheckpoint
borisdayma Mar 12, 2021
27c49eb
test(wandb): check correct metadata
borisdayma Mar 12, 2021
e0a9578
tests(wandb): unused fixture
borisdayma Mar 14, 2021
bbf4683
Merge branch 'master' of https://github.com/PyTorchLightning/pytorch-…
borisdayma Mar 14, 2021
58193e8
feat: logger.after_save_checkpoint always exists
borisdayma Mar 14, 2021
fda377f
test: wandb fixture required
borisdayma Mar 14, 2021
ce6c912
Merge branch 'master' of https://github.com/PyTorchLightning/pytorch-…
borisdayma Apr 1, 2021
5e39044
Merge branch 'master' of https://github.com/PyTorchLightning/pytorch-…
borisdayma Apr 8, 2021
62d5cae
Merge branch 'master' of https://github.com/PyTorchLightning/pytorch-…
borisdayma May 14, 2021
0b7bb39
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] May 14, 2021
c06fc8f
test(wandb): parameter unset
borisdayma May 14, 2021
0ca8310
Merge branch 'master' of https://github.com/PyTorchLightning/pytorch-…
borisdayma May 26, 2021
0ca6abb
formatting
awaelchli May 27, 2021
f6f8f61
typo fix
awaelchli May 27, 2021
1faa389
fix typo in docs
awaelchli May 27, 2021
e0f302f
Merge branch 'master' into feat_artifacts
awaelchli May 27, 2021
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
19 changes: 18 additions & 1 deletion pytorch_lightning/loggers/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,14 +19,18 @@
from abc import ABC, abstractmethod
from argparse import Namespace
from functools import wraps
from typing import Any, Callable, Dict, Iterable, List, Mapping, MutableMapping, Optional, Sequence, Tuple, Union
from typing import (Any, Callable, Dict, Iterable, List, Mapping, MutableMapping,
Optional, Sequence, Tuple, Union, TYPE_CHECKING)

import numpy as np
import torch

from pytorch_lightning.core.lightning import LightningModule
from pytorch_lightning.utilities import rank_zero_only

if TYPE_CHECKING:
from pytorch_lightning.trainer.trainer import Trainer


def rank_zero_experiment(fn: Callable) -> Callable:
""" Returns the real experiment on rank 0 and otherwise the DummyExperiment. """
Expand Down Expand Up @@ -71,6 +75,15 @@ def __init__(
self._agg_key_funcs = agg_key_funcs if agg_key_funcs else {}
self._agg_default_func = agg_default_func

def connect(self, trainer: 'Trainer') -> None:
    """
    Hook through which a trainer is handed to this logger.

    The base implementation ignores the trainer and does nothing.

    Args:
        trainer: the trainer instance to connect to
    """
    return None

def update_agg_funcs(
self,
agg_key_funcs: Optional[Mapping[str, Callable[[Sequence[float]], float]]] = None,
Expand Down Expand Up @@ -355,6 +368,10 @@ def __init__(self, logger_iterable: Iterable[LightningLoggerBase]):
def __getitem__(self, index: int) -> LightningLoggerBase:
    """Return the logger found at position ``index`` of the wrapped iterable."""
    # materialise the iterable first so that it can be indexed
    loggers = list(self._logger_iterable)
    return loggers[index]

def connect(self, trainer: 'Trainer') -> None:
    """
    Forward the trainer to every logger held by this collection.

    Args:
        trainer: the trainer instance passed on to each logger's ``connect``
    """
    for child_logger in self._logger_iterable:
        child_logger.connect(trainer)

def update_agg_funcs(
self,
agg_key_funcs: Optional[Mapping[str, Callable[[Sequence[float]], float]]] = None,
Expand Down
48 changes: 36 additions & 12 deletions pytorch_lightning/loggers/wandb.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,8 +16,10 @@
-------------------------
"""
import os
import re
from argparse import Namespace
from typing import Any, Dict, Optional, Union
from pathlib import Path
from typing import Any, Dict, Optional, Union, TYPE_CHECKING

import torch.nn as nn

Expand All @@ -26,6 +28,9 @@
from pytorch_lightning.utilities.exceptions import MisconfigurationException
from pytorch_lightning.utilities.warnings import WarningCache

if TYPE_CHECKING:
from pytorch_lightning.trainer.trainer import Trainer

warning_cache = WarningCache()

_WANDB_AVAILABLE = _module_available("wandb")
Expand Down Expand Up @@ -56,7 +61,7 @@ class WandbLogger(LightningLoggerBase):
version: Same as id.
anonymous: Enables or explicitly disables anonymous logging.
project: The name of the project to which this run will belong.
log_model: Save checkpoints in wandb dir to upload on W&B servers.
log_model: Save checkpoints as W&B artifacts.
prefix: A string to put at the beginning of metric keys.
experiment: WandB experiment object. Automatically set when creating a run.
\**kwargs: Additional arguments like `entity`, `group`, `tags`, etc. used by
Expand All @@ -75,9 +80,6 @@ class WandbLogger(LightningLoggerBase):
wandb_logger = WandbLogger()
trainer = Trainer(logger=wandb_logger)

Note: When logging manually through `wandb.log` or `trainer.logger.experiment.log`,
make sure to use `commit=False` so the logging step does not increase.

See Also:
- `Tutorial <https://colab.research.google.com/drive/16d1uctGaw2y9KhGBlINNTsWpmlXdJwRW?usp=sharing>`__
on how to use W&B with PyTorch Lightning
Expand Down Expand Up @@ -132,6 +134,7 @@ def __init__(
self._prefix = prefix
self._experiment = experiment
self._kwargs = kwargs
self._trainer = None

def __getstate__(self):
state = self.__dict__.copy()
Expand All @@ -142,6 +145,10 @@ def __getstate__(self):
state['_experiment'] = None
return state

def connect(self, trainer: 'Trainer') -> None:
    """
    Remember the trainer this logger is attached to.

    A ``None`` trainer is ignored, so a previously stored reference is left untouched.

    Args:
        trainer: the trainer instance to keep in ``self._trainer``
    """
    if trainer is None:
        return
    self._trainer = trainer

@property
@rank_zero_experiment
def experiment(self) -> Run:
Expand All @@ -168,10 +175,6 @@ def experiment(self) -> Run:
**self._kwargs
) if wandb.run is None else wandb.run

# save checkpoints in wandb dir to upload on W&B servers
if self._save_dir is None:
self._save_dir = self._experiment.dir

# define default x-axis (for latest wandb versions)
if getattr(self._experiment, "define_metric", None):
self._experiment.define_metric("trainer/global_step")
Expand Down Expand Up @@ -215,6 +218,27 @@ def version(self) -> Optional[str]:

@rank_zero_only
borisdayma marked this conversation as resolved.
Show resolved Hide resolved
def finalize(self, status: str) -> None:
    """
    Log the checkpoints known to the trainer's checkpoint callback as W&B model artifacts.

    Nothing is logged unless ``log_model`` is enabled, a trainer has been connected and
    that trainer has a checkpoint callback. Every checkpoint file that exists on disk is
    logged as its own artifact version, oldest file first, so the most recently written
    checkpoint is logged last. All versions get the ``latest`` alias; the callback's best
    checkpoint additionally gets ``best``.

    Args:
        status: final status of the run (not used here)
    """
    if not self._log_model or self._trainer is None:
        return
    # resolve the callback once instead of going through the trainer on every access
    checkpoint_callback = self._trainer.checkpoint_callback
    if checkpoint_callback is None:
        return

    # use run name and ensure it's a valid Artifact name
    artifact_name = re.sub(r"[^a-zA-Z0-9_\.\-]", "", self.experiment.name)
    best_model_path = checkpoint_callback.best_model_path

    # get checkpoints to be saved with associated score
    checkpoints = {
        checkpoint_callback.last_model_path: checkpoint_callback.current_score,
        best_model_path: checkpoint_callback.best_model_score,
        **checkpoint_callback.best_k_models,
    }
    # drop the empty-string key (presumably an unset checkpoint path)
    checkpoints.pop('', None)

    # order by modification time; paths are unique so ties never compare scores
    ordered_checkpoints = sorted(
        (Path(path).stat().st_mtime, path, score)
        for path, score in checkpoints.items()
        if Path(path).is_file()
    )

    # the callback configuration is the same for every checkpoint, so read it once
    checkpoint_config = {
        key: getattr(checkpoint_callback, key)
        for key in ('monitor', 'mode', 'save_last', 'save_top_k', 'save_weights_only', 'period')
    }

    # log iteratively all checkpoints
    for _, path, score in ordered_checkpoints:
        metadata = {
            'score': score,
            'original_filename': Path(path).name,
            'ModelCheckpoint': checkpoint_config,
        }
        artifact = wandb.Artifact(name=f"run-{artifact_name}", type="model", metadata=metadata)
        artifact.add_file(path, name='model.ckpt')
        aliases = ["latest", "best"] if path == best_model_path else ["latest"]
        self.experiment.log_artifact(artifact, aliases=aliases)
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
import os
from copy import deepcopy
from pprint import pprint
from weakref import proxy
from typing import Dict, Iterable, Optional, Union

import torch
Expand Down Expand Up @@ -143,6 +144,7 @@ def should_update_logs(self):
return should_log_every_n_steps or self.trainer.should_stop

def configure_logger(self, logger):
# connect logger to trainer
borisdayma marked this conversation as resolved.
Show resolved Hide resolved
if logger is True:
version = os.environ.get('PL_EXP_VERSION', self.trainer.slurm_job_id)

Expand All @@ -158,6 +160,10 @@ def configure_logger(self, logger):
else:
self.trainer.logger = logger

# connect trainer to logger
if hasattr(self.trainer.logger, 'connect'):
self.trainer.logger.connect(proxy(self.trainer))

def cache_training_step_metrics(self, opt_closure_result):
"""
This function is responsible to update
Expand Down