Merge pull request #875 from ChangxinTian/data

Add tensorboard and improve logger
RUCAIBox · Jul 11, 2021 · 1090915 · 1090915
2 parents c02ab8a + 2f449e7
commit 1090915
Show file tree

Hide file tree

Showing 17 changed files with 92 additions and 75 deletions.
diff --git a/.gitignore b/.gitignore
@@ -6,6 +6,7 @@
 .idea/
 *.pyc
 *.log
+log_tensorboard/*
 saved/
 *.lprof
 *.egg-info/

diff --git a/conda/meta.yaml b/conda/meta.yaml
@@ -15,23 +15,23 @@ requirements:
     - pandas >=1.0.5
     - tqdm >=4.48.2
     - pyyaml >=5.1.0
-    - matplotlib >=3.1.3
     - scikit-learn >=0.23.2
     - pytorch
     - colorlog==4.7.2
     - colorama==0.4.4
+    - tensorboard >=2.5.0
   run:
     - python
     - numpy >=1.17.2
     - scipy ==1.6.0
     - pandas >=1.0.5
     - tqdm >=4.48.2
     - pyyaml >=5.1.0
-    - matplotlib >=3.1.3
     - scikit-learn >=0.23.2
     - pytorch
     - colorlog==4.7.2
     - colorama==0.4.4
+    - tensorboard >=2.5.0
 test:
   imports:
     - recbole

diff --git a/docs/source/user_guide/config_settings.rst b/docs/source/user_guide/config_settings.rst
@@ -56,7 +56,6 @@ model training and evaluation.
   which clips the gradient norm of the model. Defaults to ``None``.
 - ``loss_decimal_place(int)``: The decimal place of training loss. Defaults to ``4``.
 - ``weight_decay (float)`` : Weight decay (L2 penalty), used for `optimizer <https://pytorch.org/docs/stable/optim.html?highlight=weight_decay>`_. Defaults to ``0.0``.
-- ``draw_loss_pic (bool)``: Draw the training loss line graph of model if it's ``True``, the pic is a PDF file and will be saved in your run directory after model training. Default to ``False``.
 
 
 **Evaluation Setting**

diff --git a/recbole/config/configurator.py b/recbole/config/configurator.py
@@ -21,8 +21,7 @@
 
 from recbole.evaluator import group_metrics, individual_metrics
 from recbole.utils import get_model, Enum, EvaluatorType, ModelType, InputType, \
-    general_arguments, training_arguments, evaluation_arguments, dataset_arguments
-from recbole.utils.utils import set_color
+    general_arguments, training_arguments, evaluation_arguments, dataset_arguments, set_color
 
 
 class Config(object):

diff --git a/recbole/data/dataset/dataset.py b/recbole/data/dataset/dataset.py
@@ -26,8 +26,7 @@
 from scipy.sparse import coo_matrix
 
 from recbole.data.interaction import Interaction
-from recbole.utils import FeatureSource, FeatureType, get_local_time
-from recbole.utils.utils import set_color
+from recbole.utils import FeatureSource, FeatureType, get_local_time, set_color
 from recbole.utils.url import decide_download, download_url, extract_zip, makedirs, rename_atomic_files
 
 

diff --git a/recbole/data/dataset/kg_dataset.py b/recbole/data/dataset/kg_dataset.py
@@ -20,8 +20,7 @@
 from scipy.sparse import coo_matrix
 
 from recbole.data.dataset import Dataset
-from recbole.utils import FeatureSource, FeatureType
-from recbole.utils.utils import set_color
+from recbole.utils import FeatureSource, FeatureType, set_color
 from recbole.utils.url import decide_download, download_url, extract_zip
 
 

diff --git a/recbole/data/utils.py b/recbole/data/utils.py
@@ -19,8 +19,7 @@
 
 from recbole.data.dataloader import *
 from recbole.sampler import KGSampler, Sampler, RepeatableSampler
-from recbole.utils import ModelType, ensure_dir, get_local_time
-from recbole.utils.utils import set_color
+from recbole.utils import ModelType, ensure_dir, get_local_time, set_color
 
 
 def create_dataset(config):

diff --git a/recbole/model/abstract_recommender.py b/recbole/model/abstract_recommender.py
@@ -19,8 +19,7 @@
 import torch.nn as nn
 
 from recbole.model.layers import FMEmbedding, FMFirstOrderLinear
-from recbole.utils import ModelType, InputType, FeatureSource, FeatureType
-from recbole.utils.utils import set_color
+from recbole.utils import ModelType, InputType, FeatureSource, FeatureType, set_color
 
 
 class AbstractRecommender(nn.Module):

diff --git a/recbole/properties/dataset/sample.yaml b/recbole/properties/dataset/sample.yaml
@@ -26,15 +26,10 @@ additional_feat_suffix: ~
 
 # Filtering
 rm_dup_inter: ~
-lowest_val: ~
-highest_val: ~
-equal_val: ~
-not_equal_val: ~
+val_interval: ~
 filter_inter_by_user_or_item: True
-max_user_inter_num: ~
-min_user_inter_num: 0
-max_item_inter_num: ~
-min_item_inter_num: 0
+item_inter_num_interval: "[0,inf)"
+user_inter_num_interval: "[0,inf)"
 
 # Preprocessing
 alias_of_user_id: ~

diff --git a/recbole/quick_start/quick_start.py b/recbole/quick_start/quick_start.py
@@ -11,8 +11,7 @@
 
 from recbole.config import Config
 from recbole.data import create_dataset, data_preparation
-from recbole.utils import init_logger, get_model, get_trainer, init_seed
-from recbole.utils.utils import set_color
+from recbole.utils import init_logger, get_model, get_trainer, init_seed, set_color
 
 
 def run_recbole(model=None, dataset=None, config_file_list=None, config_dict=None, saved=True):

diff --git a/recbole/trainer/trainer.py b/recbole/trainer/trainer.py
@@ -30,8 +30,7 @@
 from recbole.data.interaction import Interaction
 from recbole.evaluator import ProxyEvaluator
 from recbole.utils import ensure_dir, get_local_time, early_stopping, calculate_valid_score, dict2str, \
-    DataLoaderType, KGDataLoaderState
-from recbole.utils.utils import set_color
+    DataLoaderType, KGDataLoaderState, get_tensorboard, set_color
 
 
 class AbstractTrainer(object):
@@ -77,6 +76,7 @@ def __init__(self, config, model):
         super(Trainer, self).__init__(config, model)
 
         self.logger = getLogger()
+        self.tensorboard = get_tensorboard(self.logger)
         self.learner = config['learner']
         self.learning_rate = config['learning_rate']
         self.epochs = config['epochs']
@@ -92,7 +92,6 @@ def __init__(self, config, model):
         saved_model_file = '{}-{}.pth'.format(self.config['model'], get_local_time())
         self.saved_model_file = os.path.join(self.checkpoint_dir, saved_model_file)
         self.weight_decay = config['weight_decay']
-        self.draw_loss_pic = config['draw_loss_pic']
 
         self.start_epoch = 0
         self.cur_step = 0
@@ -245,6 +244,13 @@ def _generate_train_loss_output(self, epoch_idx, s_time, e_time, losses):
             train_loss_output += set_color('train loss', 'blue') + ': ' + des % losses
         return train_loss_output + ']'
 
+    def _add_train_loss_to_tensorboard(self, epoch_idx, losses, tag='Loss/Train'):
+        r"""Record the training loss of one epoch as tensorboard scalar(s).
+
+        Args:
+            epoch_idx: The epoch index, passed to the writer as the step of the scalar.
+            losses: A single loss value, or a tuple of loss values.
+            tag (str, optional): The tag of the scalar. For a tuple of losses, the position of each
+                loss is appended to the tag (``tag + '0'``, ``tag + '1'``, ...).
+                Defaults to ``'Loss/Train'``.
+        """
+        if not isinstance(losses, tuple):
+            # A single loss is written under the plain tag.
+            self.tensorboard.add_scalar(tag, losses, epoch_idx)
+            return
+        for position, value in enumerate(losses):
+            self.tensorboard.add_scalar(tag + str(position), value, epoch_idx)
+
     def fit(self, train_data, valid_data=None, verbose=True, saved=True, show_progress=False, callback_fn=None):
         r"""Train the model based on the train data and the valid data.
 
@@ -274,6 +280,7 @@ def fit(self, train_data, valid_data=None, verbose=True, saved=True, show_progre
                 self._generate_train_loss_output(epoch_idx, training_start_time, training_end_time, train_loss)
             if verbose:
                 self.logger.info(train_loss_output)
+            self._add_train_loss_to_tensorboard(epoch_idx, train_loss)
 
             # eval
             if self.eval_step <= 0 or not valid_data:
@@ -301,6 +308,8 @@ def fit(self, train_data, valid_data=None, verbose=True, saved=True, show_progre
                 if verbose:
                     self.logger.info(valid_score_output)
                     self.logger.info(valid_result_output)
+                self.tensorboard.add_scalar('Vaild_score', valid_score, epoch_idx)
+
                 if update_flag:
                     if saved:
                         self._save_checkpoint(epoch_idx)
@@ -318,9 +327,6 @@ def fit(self, train_data, valid_data=None, verbose=True, saved=True, show_progre
                     if verbose:
                         self.logger.info(stop_output)
                     break
-        if self.draw_loss_pic:
-            save_path = '{}-{}-train_loss.pdf'.format(self.config['model'], get_local_time())
-            self.plot_train_loss(save_path=os.path.join(save_path))
         return self.best_valid_score, self.best_valid_result
 
     def _full_sort_batch_eval(self, batched_data):
@@ -425,30 +431,6 @@ def _spilt_predict(self, interaction, batch_size):
             result_list.append(result)
         return torch.cat(result_list, dim=0)
 
-    def plot_train_loss(self, show=True, save_path=None):
-        r"""Plot the train loss in each epoch
-
-        Args:
-            show (bool, optional): Whether to show this figure, default: True
-            save_path (str, optional): The data path to save the figure, default: None.
-                                       If it's None, it will not be saved.
-        """
-        import matplotlib.pyplot as plt
-        import time
-        epochs = list(self.train_loss_dict.keys())
-        epochs.sort()
-        values = [float(self.train_loss_dict[epoch]) for epoch in epochs]
-        plt.plot(epochs, values)
-        my_x_ticks = np.arange(0, len(epochs), int(len(epochs) / 10))
-        plt.xticks(my_x_ticks)
-        plt.xlabel('Epoch')
-        plt.ylabel('Loss')
-        plt.title(self.config['model'] + ' ' + time.strftime("%Y-%m-%d %H:%M", time.localtime(time.time())))
-        if show:
-            plt.show()
-        if save_path:
-            plt.savefig(save_path)
-
 
 class KGTrainer(Trainer):
     r"""KGTrainer is designed for Knowledge-aware recommendation methods. Some of these models need to train the
@@ -543,6 +525,7 @@ def pretrain(self, train_data, verbose=True, show_progress=False):
                 self._generate_train_loss_output(epoch_idx, training_start_time, training_end_time, train_loss)
             if verbose:
                 self.logger.info(train_loss_output)
+            self._add_train_loss_to_tensorboard(epoch_idx, train_loss)
 
             if (epoch_idx + 1) % self.config['save_step'] == 0:
                 saved_model_file = os.path.join(
@@ -614,6 +597,7 @@ def __init__(self, config, model):
         super(DecisionTreeTrainer, self).__init__(config, model)
 
         self.logger = getLogger()
+        self.tensorboard = get_tensorboard(self.logger)
         self.label_field = config['LABEL_FIELD']
         self.convert_token_to_onehot = self.config['convert_token_to_onehot']
 
@@ -718,6 +702,7 @@ def fit(self, train_data, valid_data=None, verbose=True, saved=True, show_progre
                 if verbose:
                     self.logger.info(valid_score_output)
                     self.logger.info(valid_result_output)
+                self.tensorboard.add_scalar('Vaild_score', valid_score, epoch_idx)
 
                 self.best_valid_score = valid_score
                 self.best_valid_result = valid_result
@@ -837,6 +822,7 @@ def pretrain(self, train_data, verbose=True, show_progress=False):
                 self._generate_train_loss_output(epoch_idx, training_start_time, training_end_time, train_loss)
             if verbose:
                 self.logger.info(train_loss_output)
+            self._add_train_loss_to_tensorboard(epoch_idx, train_loss)
 
             if (epoch_idx + 1) % self.pretrain_epochs == 0:
                 saved_model_file = os.path.join(
@@ -1012,6 +998,7 @@ def fit(self, train_data, valid_data=None, verbose=True, saved=True, show_progre
                 self._generate_train_loss_output(epoch_idx, training_start_time, training_end_time, train_loss)
             if verbose:
                 self.logger.info(train_loss_output)
+            self._add_train_loss_to_tensorboard(epoch_idx, train_loss)
 
             # eval
             if self.eval_step <= 0 or not valid_data:
@@ -1039,6 +1026,7 @@ def fit(self, train_data, valid_data=None, verbose=True, saved=True, show_progre
                 if verbose:
                     self.logger.info(valid_score_output)
                     self.logger.info(valid_result_output)
+                self.tensorboard.add_scalar('Vaild_score', valid_score, epoch_idx)
                 if update_flag:
                     if saved:
                         self._save_checkpoint(epoch_idx)

diff --git a/recbole/utils/__init__.py b/recbole/utils/__init__.py
@@ -1,12 +1,12 @@
-from recbole.utils.logger import init_logger
+from recbole.utils.logger import init_logger, set_color
 from recbole.utils.utils import get_local_time, ensure_dir, get_model, get_trainer, \
-    early_stopping, calculate_valid_score, dict2str, init_seed
+    early_stopping, calculate_valid_score, dict2str, init_seed, get_tensorboard
 from recbole.utils.enum_type import *
 from recbole.utils.argument_list import *
 
 __all__ = [
     'init_logger', 'get_local_time', 'ensure_dir', 'get_model', 'get_trainer', 'early_stopping',
     'calculate_valid_score', 'dict2str', 'Enum', 'ModelType', 'DataLoaderType', 'KGDataLoaderState', 'EvaluatorType',
     'InputType', 'FeatureType', 'FeatureSource', 'init_seed', 'general_arguments', 'training_arguments',
-    'evaluation_arguments', 'dataset_arguments'
+    'evaluation_arguments', 'dataset_arguments', 'get_tensorboard', 'set_color'
 ]
diff --git a/recbole/utils/argument_list.py b/recbole/utils/argument_list.py
@@ -45,8 +45,8 @@
     'ITEM_LIST_LENGTH_FIELD', 'LIST_SUFFIX', 'MAX_ITEM_LIST_LENGTH', 'POSITION_FIELD',
     'HEAD_ENTITY_ID_FIELD', 'TAIL_ENTITY_ID_FIELD', 'RELATION_ID_FIELD', 'ENTITY_ID_FIELD',
     'load_col', 'unload_col', 'unused_col', 'additional_feat_suffix',
-    'max_user_inter_num', 'min_user_inter_num', 'max_item_inter_num', 'min_item_inter_num',
-    'lowest_val', 'highest_val', 'equal_val', 'not_equal_val',
+    'user_inter_num_interval', 'item_inter_num_interval ',
+    'val_interval',
     'alias_of_user_id', 'alias_of_item_id', 'alias_of_entity_id', 'alias_of_relation_id',
     'preload_weight',
     'normalize_field', 'normalize_all'

diff --git a/recbole/utils/logger.py b/recbole/utils/logger.py
@@ -16,6 +16,7 @@
 import logging
 import os
 import colorlog
+import re
 
 from recbole.utils.utils import get_local_time, ensure_dir
 from colorama import init
@@ -28,6 +29,29 @@
 }
 
 
+class RemoveColorFilter(logging.Filter):
+    """Logging filter that strips ANSI escape sequences from a record's message.
+
+    The filter never rejects a record. It rewrites ``record.msg`` in place, so any handler that
+    processes the same record afterwards sees the stripped text as well.
+    """
+
+    # Compiled once for the class instead of being looked up again for every record.
+    _ANSI_ESCAPE = re.compile(r'\x1B(?:[@-Z\\-_]|\[[0-?]*[ -/]*[@-~])')
+
+    def filter(self, record):
+        """Remove ANSI escape sequences from ``record.msg`` and keep the record.
+
+        Args:
+            record (logging.LogRecord): The record being handled.
+
+        Returns:
+            bool: Always ``True``, so the record is never dropped.
+        """
+        if record:
+            # ``msg`` may be any object, so it is converted to ``str`` before the substitution.
+            record.msg = self._ANSI_ESCAPE.sub('', str(record.msg))
+        return True
+
+
+def set_color(log, color, highlight=True):
+    r"""Wrap ``log`` in ANSI escape codes so that a terminal displays it in ``color``.
+
+    Args:
+        log (str): The text to colorize.
+        color (str): One of ``'black'``, ``'red'``, ``'green'``, ``'yellow'``, ``'blue'``, ``'pink'``,
+            ``'cyan'`` or ``'white'``. Any other value falls back to ``'white'``.
+        highlight (bool, optional): Selects the ``1;3x`` code if ``True`` and the ``0;3x`` code
+            otherwise. Defaults to ``True``.
+
+    Returns:
+        str: ``log`` preceded by the color code and followed by the reset code ``\033[0m``.
+    """
+    color_set = ['black', 'red', 'green', 'yellow', 'blue', 'pink', 'cyan', 'white']
+    try:
+        index = color_set.index(color)
+    except ValueError:
+        # Unknown color: use the last entry ('white'). Only ValueError is caught so that
+        # KeyboardInterrupt / SystemExit are no longer swallowed.
+        index = len(color_set) - 1
+    style = '1;3' if highlight else '0;3'
+    return '\033[' + style + str(index) + 'm' + log + '\033[0m'
+
+
 def init_logger(config):
     """
     A logger that can show a message on standard output and write it into the
@@ -70,12 +94,15 @@ def init_logger(config):
         level = logging.CRITICAL
     else:
         level = logging.INFO
+
     fh = logging.FileHandler(logfilepath)
     fh.setLevel(level)
     fh.setFormatter(fileformatter)
+    remove_color_filter = RemoveColorFilter()
+    fh.addFilter(remove_color_filter)
 
     sh = logging.StreamHandler()
     sh.setLevel(level)
     sh.setFormatter(sformatter)
 
-    logging.basicConfig(level=level, handlers=[fh, sh])
+    logging.basicConfig(level=level, handlers=[sh, fh])
diff --git a/recbole/utils/utils.py b/recbole/utils/utils.py
@@ -20,6 +20,7 @@
 
 import numpy as np
 import torch
+from torch.utils.tensorboard import SummaryWriter
 
 from recbole.utils.enum_type import ModelType
 
@@ -192,16 +193,28 @@ def init_seed(seed, reproducibility):
         torch.backends.cudnn.deterministic = False
 
 
-def set_color(log, color, highlight=True):
-    color_set = ['black', 'red', 'green', 'yellow', 'blue', 'pink', 'cyan', 'white']
-    try:
-        index = color_set.index(color)
-    except:
-        index = len(color_set) - 1
-    prev_log = '\033['
-    if highlight:
-        prev_log += '1;3'
-    else:
-        prev_log += '0;3'
-    prev_log += str(index) + 'm'
-    return prev_log + log + '\033[0m'
+def get_tensorboard(logger):
+    r"""Create a Tensorboard ``SummaryWriter`` whose log directory is named after the logger's file.
+
+    The writer's directory is ``log_tensorboard/<dir_name>``. ``dir_name`` is taken from the first
+    handler of ``logger`` that has a ``baseFilename`` attribute: it is the base name of that file,
+    cut at its first ``'.'``. If no handler has such an attribute, ``dir_name`` is ``'model-'``
+    followed by the current local time.
+
+    Args:
+        logger: The logger whose handlers are inspected for an output file name.
+
+    Returns:
+        SummaryWriter: The writer that will write out events and summaries to the event file.
+    """
+    base_path = 'log_tensorboard'
+
+    for handler in logger.handlers:
+        if not hasattr(handler, "baseFilename"):
+            continue
+        # First handler that writes to a file: reuse its file name (without suffix).
+        log_file = getattr(handler, 'baseFilename')
+        dir_name = os.path.basename(log_file).split('.')[0]
+        break
+    else:
+        # No handler exposes a file name, so fall back to a time-based name.
+        dir_name = '{}-{}'.format('model', get_local_time())
+
+    return SummaryWriter(os.path.join(base_path, dir_name))
-Original file line number
+Diff line change
@@ Expand Up / @@ -6,6 +6,7 @@ @@
     .idea/
     *.pyc
     *.log
+    log_tensorboard/*
     saved/
     *.lprof
     *.egg-info/
@@ Expand Down @@