kaylode · kaylode · Apr 12, 2022 · Apr 3, 2022 · Apr 12, 2022 · Apr 12, 2022
diff --git a/theseus/base/callbacks/wandb_callbacks.py b/theseus/base/callbacks/wandb_callbacks.py
@@ -1,9 +1,17 @@
 from typing import List, Dict
 import os
+import os.path as osp
 from theseus.base.callbacks.base_callbacks import Callbacks
 from theseus.utilities.loggers.observer import LoggerObserver
-from theseus.utilities.loggers.wandb_logger import WandbLogger
+from theseus.utilities.loggers.wandb_logger import WandbLogger, find_run_id
 from datetime import datetime
+from theseus.opt import Config
+from copy import deepcopy
+
+try:
+    import wandb as wandblogger
+except ModuleNotFoundError:
+    pass
 
 LOGGER = LoggerObserver.getLogger("main")
 
@@ -19,38 +27,111 @@ class WandbCallbacks(Callbacks):
         project name of Wandb
     resume: `bool`
         whether to resume project
-
-    ::Usage::
-    Register in the pipeline.yaml. For instance:
-
-    callbacks:
-    - name: WandbCallbacks
-        args: 
-        username: kaylode
-        project_name: theseus
-
     """
 
     def __init__(self, 
         username: str, 
         project_name: str, 
         save_dir: str = None,
+        resume: str = None,
+        config_dict: Dict = None,
         **kwargs) -> None:
+        """
+        Choose the wandb run id, then create a WandbLogger and subscribe it to LOGGER.
+        The id stored for `resume` is reused only when the restored pipeline.yaml equals
+        `config_dict` (ignoring the 'global' key); otherwise a new id is generated.
+        """
         super().__init__()
 
         self.username = username
         self.project_name = project_name
+        self.resume = resume
+        self.save_dir = save_dir
+        self.config_dict = config_dict
 
         # A hack, not good
-        if save_dir is None:
-            run_name = datetime.now().strftime('%Y-%m-%d_%H-%M-%S')
-        self.run_name = run_name
+        self.run_name = (datetime.now().strftime('%Y-%m-%d_%H-%M-%S')  # timestamp only without save_dir
+                         if save_dir is None else osp.basename(save_dir))
+
+        if self.resume is None:
+            self.id = wandblogger.util.generate_id()
+        else:
+            try:
+                # Run id is read from the directory two levels above the `resume` path
+                run_id = find_run_id(os.path.dirname(os.path.dirname(self.resume)))
+
+                # Load the config from that run
+                try:
+                    old_config_path = wandblogger.restore(
+                        'pipeline.yaml', run_path=f"{self.username}/{self.project_name}/{run_id}"
+                    ).name
+                except Exception as err:  # not bare: KeyboardInterrupt/SystemExit must still propagate
+                    raise ValueError(f"Failed to load run id={run_id}, due to pipeline.yaml is missing or run is not existed") from err
+
+                # Check if the config remains the same, if not, create new run id
+                old_config_dict = Config(old_config_path)
+                tmp_config_dict = deepcopy(self.config_dict) or {}  # config_dict defaults to None
+                ## strip off global key because `resume` will always different
+                old_config_dict.pop('global', None)
+                tmp_config_dict.pop('global', None)
+                if old_config_dict == tmp_config_dict:
+                    self.id = run_id
+                    LOGGER.text("Run configuration remains unchanged. Resuming wandb run...", LoggerObserver.SUCCESS)
+                else:
+                    self.id = wandblogger.util.generate_id()
+                    LOGGER.text("Run configuration changes since the last run. Creating new wandb run...", LoggerObserver.WARN)
+            except ValueError as e:
+                LOGGER.text(f"Can not resume wandb due to '{e}'. Creating new wandb run...", LoggerObserver.WARN)
+                self.id = wandblogger.util.generate_id()
 
         """
         All the logging stuffs have been done in LoggerCallbacks. Here we just register 
         the wandb logger to the main logger
         """
-        wandb_logger = WandbLogger(
-            self.username, self.project_name, self.run_name
+        self.wandb_logger = WandbLogger(
+            id = self.id, save_dir = self.save_dir,
+            username = self.username, 
+            project_name = self.project_name, 
+            run_name = self.run_name,
+            config_dict=self.config_dict
         )
-        LOGGER.subscribe(wandb_logger)
+        LOGGER.subscribe(self.wandb_logger)
+
+    def on_start(self, logs: Dict=None):
+        """
+        Before the main loop: write the run id to save_dir/wandb_id.txt, log the config files and watch the model
+        """
+        wandb_id_file = osp.join(self.save_dir, 'wandb_id.txt')
+        with open(wandb_id_file, 'w') as f:
+            f.write(self.id)
+
+        # Log every *.yaml file found directly under save_dir
+        self.wandb_logger.log_file(
+            tag='configs', 
+            value = osp.join(self.save_dir, '*.yaml'))
+
+        # Register the trainer's model with the wandb logger for debugging
+        self.wandb_logger.log_torch_module(
+            tag='models', 
+            value = self.params['trainer'].model.model,
+            log_freq=10)
+
+    def on_finish(self, logs: Dict=None):
+        """
+        After training finishes: log every *.pth file under save_dir/checkpoints through the wandb logger
+        """
+        base_folder=osp.join(self.save_dir, 'checkpoints')
+        self.wandb_logger.log_file(
+            tag='checkpoint', 
+            base_folder=self.save_dir,  # log_file hands this to wandb's save() as base_path
+            value = osp.join(base_folder, '*.pth'))
+
+    def on_val_epoch_end(self, logs:Dict=None):
+        """
+        On validation epoch end: log every *.pth file under save_dir/checkpoints through the wandb logger
+        """ 
+        base_folder=osp.join(self.save_dir, 'checkpoints')
+        self.wandb_logger.log_file(
+            tag='checkpoint', 
+            base_folder=self.save_dir,  # log_file hands this to wandb's save() as base_path
+            value = osp.join(base_folder, '*.pth'))
diff --git a/theseus/utilities/download.py b/theseus/utilities/download.py
@@ -1,4 +1,10 @@
 import gdown
+import os
+import os.path as osp
+import urllib.request as urlreq
+from theseus.utilities.loggers.observer import LoggerObserver
+
+LOGGER = LoggerObserver.getLogger('main')
 
 def download_from_drive(id_or_url, output, md5=None, quiet=False, cache=True):
     if id_or_url.startswith('http') or id_or_url.startswith('https'):
@@ -10,3 +16,49 @@ def download_from_drive(id_or_url, output, md5=None, quiet=False, cache=True):
         return gdown.download(url, output, quiet=quiet)
     else:
         return gdown.cached_download(url, md5=md5, quiet=quiet)
+
+def download_from_url(url, root=None, filename=None):
+    """Download a file from a url and place it in root.
+    Args:
+        url (str): URL to download file from
+        root (str, optional): Directory to place downloaded file in. Defaults to './.cache'
+        filename (str, optional): Name to save the file under. If None, use the basename of the URL
+    Returns:
+        str: Path of the local file (an already existing file is reused without downloading)
+    Raises:
+        OSError: If the download fails (for https urls, only after one retry over http)
+    """
+    root = os.path.expanduser('./.cache' if root is None else root)
+    if not filename:
+        filename = os.path.basename(url)
+    fpath = os.path.join(root, filename)
+
+    if osp.isfile(fpath):
+        LOGGER.text('Load cache from ' + fpath, level=LoggerObserver.INFO)
+        return fpath
+
+    os.makedirs(root, exist_ok=True)
+    try:
+        LOGGER.text('Downloading ' + url + ' to ' + fpath, level=LoggerObserver.DEBUG)
+        urlreq.urlretrieve(url, fpath)
+    except OSError:  # urllib's URLError subclasses OSError, and IOError is an alias of it
+        if not url.startswith('https'):
+            raise  # nothing to fall back to; never return a path that was not written
+        url = url.replace('https:', 'http:', 1)  # only rewrite the scheme, not later matches
+        LOGGER.text('Failed download. Trying https -> http instead.Downloading ' + url + ' to ' + fpath,
+                    level=LoggerObserver.DEBUG)
+        urlreq.urlretrieve(url, fpath)
+
+    return fpath
+
+
+def download_from_wandb(filename, run_path, save_dir):
+    """Restore `filename` from the wandb run `run_path` into `save_dir`; return its local path, or None on failure."""
+    import wandb
+    try:
+        path = wandb.restore(filename, run_path=run_path, root=save_dir)
+        return path.name
+    except Exception:  # not bare: KeyboardInterrupt/SystemExit must still propagate
+        LOGGER.text("Failed to download from wandb.",
+                level=LoggerObserver.ERROR)
+        return None
diff --git a/theseus/utilities/loggers/wandb_logger.py b/theseus/utilities/loggers/wandb_logger.py
@@ -4,6 +4,7 @@
 except ModuleNotFoundError:
     pass
 
+import os.path as osp
 import torch
 from theseus.utilities.loggers.observer import LoggerObserver, LoggerSubscriber
 LOGGER = LoggerObserver.getLogger('main')
@@ -13,13 +14,17 @@ class WandbLogger(LoggerSubscriber):
     Logger for wandb intergration
     :param log_dir: Path to save checkpoint
     """
-    def __init__(self, username:str, project_name:str, run_name:str, config_dict:Dict = None):
+    def __init__(self, id:str, username:str, project_name:str, run_name:str, save_dir:str = None, config_dict:Dict = None):
         self.project_name = project_name
         self.username = username
         self.run_name = run_name
         self.config_dict = config_dict
+        self.id = id
+        self.save_dir = save_dir
 
         wandb_logger.init(
+            id = id,
+            dir = self.save_dir,
             config=config_dict,
             entity=username, 
             project=project_name, 
@@ -35,6 +40,17 @@ def load_state_dict(self, path):
         else:
             return None
 
+    def log_file(self, tag, value, base_folder=None, **kwargs):
+        """
+        Save file(s) to wandb
+        :param tag: (str) tag (not used by this method)
+        :param value: (str) path to file, or a glob pattern (WandbCallbacks passes e.g. '<save_dir>/*.yaml')
+        :param kwargs: accepted but ignored
+        :param base_folder: (str) forwarded to wandb's save() as `base_path`
+        """
+        wandb_logger.save(value, base_path=base_folder)
+
+
     def log_scalar(self, tag, value, step, **kwargs):
         """
         Write a log to specified directory
@@ -43,34 +59,103 @@ def log_scalar(self, tag, value, step, **kwargs):
         :param step: (int) logging step
         """
 
-        wandb_logger.log({tag: value}, step=step)
+        wandb_logger.log({
+            tag: value,
+            'iterations': step
+        })
 
     def log_figure(self, tag, value, step, **kwargs):
         """
-        Write a matplotlib fig to tensorboard
+        Write a figure to wandb; a torch.Tensor value is wrapped in a wandb Image first
         :param tags: (str) tag for log
         :param value: (image) image to log. torch.Tensor or plt.fire.Figure
         :param step: (int) logging step
         """
 
-
         if isinstance(value, torch.Tensor):
             image = wandb_logger.Image(value)
             wandb_logger.log({
-               tag: image
-            }, step=step)
+               tag: image,
+               'iterations': step  # step is logged as a regular field instead of being passed as step=
+            })
         else:
             wandb_logger.log({
-               tag: value
-            }, step=step)
+               tag: value,
+               'iterations': step  # step is logged as a regular field instead of being passed as step=
+            })
 
-    def log_torch_module(self, tag, value, **kwargs):
+    def log_torch_module(self, tag, value, log_freq=1000, **kwargs):
         """
-        Write a model graph to tensorboard
+        Watch a torch model with wandb so that its gradients get logged
         :param value: (nn.Module) torch model
         :param inputs: sample tensor
+        :param log_freq: (int) passed to wandb watch(); 1000 mirrors wandb's own default
         """
-        wandb_logger.watch(value, log="all")
+        wandb_logger.watch(
+          value, log="gradients", 
+          log_freq=log_freq)
+
+    def log_spec_text(self, tag, value, step, **kwargs):
+        """
+        Write a text to wandb: `value` is wrapped in a wandb Html object and logged under `tag`
+        :param value: (str) captions
+        """
+        texts = wandb_logger.Html(value)
+        wandb_logger.log({
+            tag: texts,
+            'iterations': step  # step is logged as a regular field of the same record
+        })
+
+    def log_table(self, tag, value, columns, step, **kwargs):
+        """
+        Write a table to wandb
+        :param value: list of rows, each row being a list of cell values (one per column)
+        :param columns: list of column names
+
+        Examples:
+        value = [
+            [0, fig1, 0],
+            [1, fig2, 8],
+            [2, fig3, 7],
+            [3, fig4, 1]
+        ]
+        columns=[
+            "id", 
+            "image", 
+            "prediction"
+        ]
+        """
+
+        # Workaround: wrap torch.Tensor cells as wandb Image; plt.Figure cells are not handled yet
+        new_value = []
+        for record in value:
+            new_record = []
+            for val in record:
+                if isinstance(val, torch.Tensor):
+                    val = wandb_logger.Image(val)
+                new_record.append(val)
+            new_value.append(new_record)
+
+        table = wandb_logger.Table(data=new_value, columns=columns)
+        wandb_logger.log({
+            tag: table,
+            'iterations': step
+        })
 
     def __del__(self):
-        wandb_logger.finish()
+        wandb_logger.finish()  # finish the wandb run when this logger object is destroyed
+
+
+def find_run_id(dirname):
+    """
+    Read the wandb run id stored in `<dirname>/wandb_id.txt` (trailing whitespace stripped)
+    :raises ValueError: if that file does not exist
+    """
+    wandb_id_file = osp.join(dirname, 'wandb_id.txt')
+
+    if not osp.isfile(wandb_id_file):
+        raise ValueError(f"Wandb ID file not found in {wandb_id_file}")
+    else:
+        with open(wandb_id_file, 'r') as f:
+            wandb_id = f.read().rstrip()
+        return wandb_id