Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Integrate wandb #35

Merged
merged 5 commits into from
Apr 12, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
115 changes: 98 additions & 17 deletions theseus/base/callbacks/wandb_callbacks.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,17 @@
from typing import List, Dict
import os
import os.path as osp
from theseus.base.callbacks.base_callbacks import Callbacks
from theseus.utilities.loggers.observer import LoggerObserver
from theseus.utilities.loggers.wandb_logger import WandbLogger
from theseus.utilities.loggers.wandb_logger import WandbLogger, find_run_id
from datetime import datetime
from theseus.opt import Config
from copy import deepcopy

try:
import wandb as wandblogger
except ModuleNotFoundError:
pass

LOGGER = LoggerObserver.getLogger("main")

Expand All @@ -19,38 +27,111 @@ class WandbCallbacks(Callbacks):
project name of Wandb
resume: `bool`
whether to resume project

::Usage::
Register in the pipeline.yaml. For instance:

callbacks:
- name: WandbCallbacks
args:
username: kaylode
project_name: theseus

"""

def __init__(self,
             username: str,
             project_name: str,
             save_dir: str = None,
             resume: str = None,
             config_dict: Dict = None,
             **kwargs) -> None:
    """
    Create the wandb logger and subscribe it to the main logger.

    :param username: (str) wandb entity that owns the project
    :param project_name: (str) project name of Wandb
    :param save_dir: (str) run directory; its basename is used as the run
        name. When None, a timestamp is used as the run name instead
    :param resume: (str) path used to locate a previous run. The
        `wandb_id.txt` file is looked up two directory levels above it
    :param config_dict: (Dict) pipeline configuration, compared against the
        configuration stored with the previous run when resuming
    """
    super().__init__()

    self.username = username
    self.project_name = project_name
    self.resume = resume
    self.save_dir = save_dir
    self.config_dict = config_dict

    # Name the run after its directory; only fall back to a timestamp when
    # no directory was given (osp.basename(None) would raise TypeError).
    if self.save_dir is None:
        self.run_name = datetime.now().strftime('%Y-%m-%d_%H-%M-%S')
    else:
        self.run_name = osp.basename(self.save_dir)

    if self.resume is None:
        self.id = wandblogger.util.generate_id()
    else:
        try:
            # Get run id
            run_id = find_run_id(
                os.path.dirname(os.path.dirname(self.resume))
            )

            # Load the config from that run
            try:
                # `restore` hands back an open file; only its path is
                # needed, so close it right away.
                with wandblogger.restore(
                    'pipeline.yaml',
                    run_path=f"{self.username}/{self.project_name}/{run_id}"
                ) as restored_file:
                    old_config_path = restored_file.name
            except Exception as e:
                raise ValueError(
                    f"Failed to load run id={run_id}, due to pipeline.yaml is missing or run is not existed"
                ) from e

            # Check if the config remains the same, if not, create new run id
            old_config_dict = Config(old_config_path)
            tmp_config_dict = deepcopy(self.config_dict)
            # Strip off the global key because `resume` will always differ
            old_config_dict.pop('global', None)
            if tmp_config_dict is not None:
                tmp_config_dict.pop('global', None)

            if old_config_dict == tmp_config_dict:
                self.id = run_id
                LOGGER.text("Run configuration remains unchanged. Resuming wandb run...", LoggerObserver.SUCCESS)
            else:
                self.id = wandblogger.util.generate_id()
                LOGGER.text("Run configuration changes since the last run. Creating new wandb run...", LoggerObserver.WARN)
        except ValueError as e:
            LOGGER.text(f"Can not resume wandb due to '{e}'. Creating new wandb run...", LoggerObserver.WARN)
            self.id = wandblogger.util.generate_id()

    # All the logging itself is done in LoggerCallbacks. Here we only
    # register the wandb logger with the main logger.
    self.wandb_logger = WandbLogger(
        id=self.id,
        save_dir=self.save_dir,
        username=self.username,
        project_name=self.project_name,
        run_name=self.run_name,
        config_dict=self.config_dict
    )
    LOGGER.subscribe(self.wandb_logger)

def on_start(self, logs: Dict = None):
    """
    Before going to the main loop. Save the run id, upload the config
    files and start watching the model.

    :param logs: (Dict) unused
    """
    if self.save_dir is None:
        # Without a run directory there is nowhere to store the id file and
        # no config files to upload; osp.join(None, ...) would raise.
        LOGGER.text(
            "No save_dir given. Skip saving wandb run id and config files.",
            LoggerObserver.WARN)
    else:
        # Persist the run id so that a later run can find and resume it
        wandb_id_file = osp.join(self.save_dir, 'wandb_id.txt')
        with open(wandb_id_file, 'w') as f:
            f.write(self.id)

        # Save all config files
        self.wandb_logger.log_file(
            tag='configs',
            value=osp.join(self.save_dir, '*.yaml'))

    # Init logging model for debug
    self.wandb_logger.log_torch_module(
        tag='models',
        value=self.params['trainer'].model.model,
        log_freq=10)

def on_finish(self, logs: Dict = None):
    """
    After finish training. Upload every `*.pth` file found in the
    `checkpoints` folder of the run directory.

    :param logs: (Dict) unused
    """
    if self.save_dir is None:
        # No run directory means no checkpoints folder to upload from
        return

    base_folder = osp.join(self.save_dir, 'checkpoints')
    self.wandb_logger.log_file(
        tag='checkpoint',
        base_folder=self.save_dir,
        value=osp.join(base_folder, '*.pth'))

def on_val_epoch_end(self, logs: Dict = None):
    """
    On validation epoch end. Upload every `*.pth` file found in the
    `checkpoints` folder of the run directory.

    :param logs: (Dict) unused
    """
    if self.save_dir is None:
        # No run directory means no checkpoints folder to upload from
        return

    base_folder = osp.join(self.save_dir, 'checkpoints')
    self.wandb_logger.log_file(
        tag='checkpoint',
        base_folder=self.save_dir,
        value=osp.join(base_folder, '*.pth'))
52 changes: 52 additions & 0 deletions theseus/utilities/download.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,10 @@
import gdown
import os
import os.path as osp
import urllib.request as urlreq
from theseus.utilities.loggers.observer import LoggerObserver

LOGGER = LoggerObserver.getLogger('main')

def download_from_drive(id_or_url, output, md5=None, quiet=False, cache=True):
if id_or_url.startswith('http') or id_or_url.startswith('https'):
Expand All @@ -10,3 +16,49 @@ def download_from_drive(id_or_url, output, md5=None, quiet=False, cache=True):
return gdown.download(url, output, quiet=quiet)
else:
return gdown.cached_download(url, md5=md5, quiet=quiet)

def download_from_url(url, root=None, filename=None):
    """Download a file from a url and place it in root.

    If the target file already exists it is reused and nothing is
    downloaded.

    Args:
        url (str): URL to download file from
        root (str, optional): Directory to place downloaded file in.
            Defaults to './.cache'
        filename (str, optional): Name to save the file under. If None, use the basename of the URL

    Returns:
        str: path of the cached or downloaded file

    Raises:
        OSError: if the download fails (for https URLs, after the plain
            http retry has failed as well)
    """

    if root is None:
        root = './.cache'
    root = os.path.expanduser(root)
    if not filename:
        filename = os.path.basename(url)
    fpath = os.path.join(root, filename)

    if osp.isfile(fpath):
        LOGGER.text('Load cache from ' + fpath, level=LoggerObserver.INFO)
        return fpath

    os.makedirs(root, exist_ok=True)

    try:
        LOGGER.text('Downloading ' + url + ' to ' + fpath, level=LoggerObserver.DEBUG)
        urlreq.urlretrieve(url, fpath)
    except OSError:
        # urllib's URLError is a subclass of OSError and IOError is an alias
        # of it, so this covers both. (`urlreq.error` does not exist.)
        if not url.startswith('https'):
            # Nothing to fall back to: surface the failure instead of
            # returning a path to a file that was never written.
            raise
        # NOTE(review): this silently downgrades to unencrypted http.
        url = url.replace('https:', 'http:', 1)
        LOGGER.text(
            'Failed download. Trying https -> http instead.Downloading ' + url + ' to ' + fpath,
            level=LoggerObserver.DEBUG)
        urlreq.urlretrieve(url, fpath)

    return fpath


def download_from_wandb(filename, run_path, save_dir):
    """Restore a file from a wandb run into save_dir.

    Args:
        filename (str): name of the file to restore
        run_path (str): wandb run path passed through to `wandb.restore`
        save_dir (str): directory passed to `wandb.restore` as `root`

    Returns:
        str or None: local path of the restored file, or None if the
        restore failed (the failure is logged, not raised)
    """
    import wandb
    try:
        # `wandb.restore` hands back an open file; only its path is
        # needed, so close it instead of leaking the handle.
        with wandb.restore(
                filename, run_path=run_path, root=save_dir) as restored_file:
            return restored_file.name
    except Exception:
        # Best effort by design, but do not swallow KeyboardInterrupt or
        # SystemExit as a bare `except:` would.
        LOGGER.text("Failed to download from wandb.",
                    level=LoggerObserver.ERROR)
        return None
109 changes: 97 additions & 12 deletions theseus/utilities/loggers/wandb_logger.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
except ModuleNotFoundError:
pass

import os.path as osp
import torch
from theseus.utilities.loggers.observer import LoggerObserver, LoggerSubscriber
LOGGER = LoggerObserver.getLogger('main')
Expand All @@ -13,13 +14,17 @@ class WandbLogger(LoggerSubscriber):
Logger for wandb integration
:param log_dir: Path to save checkpoint
"""
def __init__(self, username:str, project_name:str, run_name:str, config_dict:Dict = None):
def __init__(self, id:str, username:str, project_name:str, run_name:str, save_dir:str = None, config_dict:Dict = None):
self.project_name = project_name
self.username = username
self.run_name = run_name
self.config_dict = config_dict
self.id = id
self.save_dir = save_dir

wandb_logger.init(
id = id,
dir = self.save_dir,
config=config_dict,
entity=username,
project=project_name,
Expand All @@ -35,6 +40,17 @@ def load_state_dict(self, path):
else:
return None

def log_file(self, tag, value, base_folder=None, **kwargs):
    """
    Upload file(s) to wandb
    :param tag: (str) tag, not used by this logger
    :param value: (str) path of the file to upload
    :param base_folder: (str) forwarded to wandb as `base_path`
    """
    save_options = {'base_path': base_folder}
    wandb_logger.save(value, **save_options)


def log_scalar(self, tag, value, step, **kwargs):
    """
    Write a scalar to wandb
    :param tag: (str) tag for log
    :param value: value to log
    :param step: (int) logging step, logged under the 'iterations' key
    """

    # Log once; the step travels as a regular 'iterations' metric rather
    # than through wandb's own `step` argument.
    wandb_logger.log({
        tag: value,
        'iterations': step
    })

def log_figure(self, tag, value, step, **kwargs):
    """
    Write a figure to wandb
    :param tag: (str) tag for log
    :param value: (image) image to log. torch.Tensor or matplotlib Figure
    :param step: (int) logging step, logged under the 'iterations' key
    """

    # Tensors have to be wrapped into a wandb Image first; anything else
    # is handed to wandb unchanged.
    if isinstance(value, torch.Tensor):
        value = wandb_logger.Image(value)

    wandb_logger.log({
        tag: value,
        'iterations': step
    })

def log_torch_module(self, tag, value, log_freq, **kwargs):
    """
    Ask wandb to watch the gradients of a torch module
    :param tag: (str) tag, not used by this logger
    :param value: (nn.Module) torch model
    :param log_freq: forwarded to `wandb.watch` as `log_freq`
    """
    wandb_logger.watch(
        value,
        log="gradients",
        log_freq=log_freq)

def log_spec_text(self, tag, value, step, **kwargs):
    """
    Log a text to wandb, wrapped as an Html object
    :param tag: (str) tag for log
    :param value: (str) captions
    :param step: (int) logging step, logged under the 'iterations' key
    """
    payload = {tag: wandb_logger.Html(value)}
    payload['iterations'] = step
    wandb_logger.log(payload)

def log_table(self, tag, value, columns, step, **kwargs):
    """
    Log a table to wandb
    :param tag: (str) tag for log
    :param value: list of rows, each row a list of cell values
    :param columns: list of column names
    :param step: (int) logging step, logged under the 'iterations' key

    Examples:
        value = [
            [0, fig1, 0],
            [1, fig2, 8],
            [2, fig3, 7],
            [3, fig4, 1]
        ]
        columns=[
            "id",
            "image",
            "prediction"
        ]
    """

    def _as_cell(cell):
        # Tensor images are wrapped into wandb Images; every other cell is
        # passed through untouched (plt.Figure is not handled here).
        if isinstance(cell, torch.Tensor):
            return wandb_logger.Image(cell)
        return cell

    rows = [[_as_cell(cell) for cell in row] for row in value]

    wandb_logger.log({
        tag: wandb_logger.Table(data=rows, columns=columns),
        'iterations': step
    })

def __del__(self):
    """Finish the wandb run when this logger is destroyed."""
    wandb_logger.finish()


def find_run_id(dirname):
    """
    Return the wandb run id stored in `<dirname>/wandb_id.txt`

    Trailing whitespace is stripped from the file content.
    Raises ValueError when the file does not exist.
    """
    id_path = osp.join(dirname, 'wandb_id.txt')

    if osp.isfile(id_path):
        with open(id_path, 'r') as handle:
            return handle.read().rstrip()

    raise ValueError(f"Wandb ID file not found in {id_path}")