FEA: easier to implement a metric #947

Merged · 7 commits · Aug 31, 2021

2 changes: 1 addition & 1 deletion .github/workflows/python-package.yml
@@ -55,5 +55,5 @@ jobs:
run: |
python -m pytest -v tests/config/test_config.py
export PYTHONPATH=.
python tests/config/test_command_line.py --use_gpu=False --valid_metric=Recall@10 --split_ratio=[0.7,0.2,0.1] --metrics=['Recall@10'] --epochs=200 --eval_setting='LO_RS' --learning_rate=0.3
python tests/config/test_command_line.py --use_gpu=False --valid_metric=Recall@10 --split_ratio=[0.7,0.2,0.1] --metrics='["Recall"]' --topk=[10] --epochs=200 --eval_setting='LO_RS' --learning_rate=0.3
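
The change above reflects the new convention of specifying the metric name and the top-k cut-off separately. A hedged sketch of the equivalent in-code configuration (parameter names follow the ones used elsewhere in this PR; the values are illustrative):

.. code:: python

    # Old style folded the cut-off into the metric string ('Recall@10');
    # the new style splits the metric names from the top-k cut-offs.
    config_dict = {
        'metrics': ['Recall'],        # metric names only, matched case-insensitively
        'topk': [10],                 # cut-offs applied to every top-k metric
        'valid_metric': 'Recall@10',  # early-stopping metric keeps the metric@k form
    }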

70 changes: 40 additions & 30 deletions docs/source/developer_guide/customize_metrics.rst
@@ -5,22 +5,32 @@ Users can implement their customized metrics and combine the metrics with others

Here, it takes only three steps to incorporate a new metric, and we introduce them step by step.

Create a New Metric Class
--------------------------
Then, we create a new class in the file :file:`~recbole.evaluator.metrics` and define its parameters in ``__init__()``:

Sign in Your Metric in Register
--------------------------------
To begin with, we must add a new line in :obj:`~recbole.evaluator.register.metric_information`:
All the metrics are registered in :obj:`metric_information`, which is a dict. Its keys are the names of the
metrics and should be lowercase. Each value is a list containing one or more strings that correspond to the
inputs the metric needs.
.. code:: python

For now, we support 9 inputs for metrics, covering both recommendation results and information of the
dataset, which are listed below.
    from recbole.evaluator.base_metric import AbstractMetric
    class MyMetric(AbstractMetric):
        def __init__(self, config):

Set properties of the metric
-----------------------------
After that, we set the properties of the metric:

Set ``metric_need``
###################
It is a list that contains one or more strings corresponding to the inputs the metric needs.
For now, we support 9 inputs for metrics, covering both recommendation results and information of the
dataset, which are listed below.

================== ========================================================
Notation           Explanation
================== ========================================================
rec.items          K recommended items for each user
rec.topk           K recommended items and number of positive items for each user
rec.topk           Boolean matrix indicating the existence of a recommended item in the test set
                   and number of positive items for each user
rec.meanrank       Mean ranking of positive items for each user
rec.score          Pure output score
data.num_items     Number of items in the dataset
@@ -30,40 +30,40 @@ which are listed below.
data.label         Pure label field of input data (usually used together with rec.score)
================== ========================================================

For example, if we want to add a metric named ``YourMetric`` which needs the recommended items
and the total item number, we can sign in the metric as follows.
Set ``metric_type``
###################
It indicates whether the scores required by the metric are grouped by user; its value is either
``EvaluatorType.RANKING`` (for grouped scores) or ``EvaluatorType.VALUE`` (for non-grouped scores).
In the current RecBole, all the "grouped" metrics are ranking-based and all the "non-grouped"
metrics are value-based. To stay consistent with our paper, we adopt the more formal terms ``RANKING`` and ``VALUE``.

.. code:: python
Set ``smaller``
###############
It indicates whether a smaller metric value represents better performance. It is either ``True`` or
``False``, and defaults to ``False``.

    metric_information = {
        'ndcg': ['rec.topk'],  # Sign in for topk ranking metrics
        'mrr': ['rec.topk'],
        'map': ['rec.topk'],

        'itemcoverage': ['rec.items', 'data.num_items'],  # Sign in for topk non-accuracy metrics

        'yourmetric': ['rec.items', 'data.num_items']  # Sign in your customized metric
    }


Create a New Metric Class
--------------------------
Then, we create a new class in the file :file:`~recbole.evaluator.metrics` and define its parameters in
``__init__()``.
Example
#######
If we want to add a ranking-based metric named ``YourMetric`` which needs the recommended items and the
total item number, and for which a smaller value indicates better model performance, the code is shown below:

.. code:: python

    from recbole.evaluator.base_metric import AbstractMetric
    from recbole.utils import EvaluatorType
    class MyMetric(AbstractMetric):
        metric_type = EvaluatorType.RANKING
        metric_need = ['rec.items', 'data.num_items']
        smaller = True

        def __init__(self, config):
Implement calculate_metric(self, dataobject)
---------------------------------------------
All the computational process is defined in this function. The argument is a packaged data object that
contains all the results above. We can treat it as a dict and get data from it by
``rec_items = dataobject.get('rec.items')``. The returned value should be a dict whose keys are metric names
and whose values are the final results. Note that the metric names should be lowercase.

Example code:

@@ -76,7 +76,7 @@ Example code:
dataobject(DataStruct): it contains all the information needed to calculate metrics.

Returns:
dict: such as ``{'Mymetric@10': 3153, 'MyMetric@20': 0.3824}``
dict: such as ``{'mymetric@10': 3153, 'mymetric@20': 0.3824}``
"""
rec_items = dataobject.get('rec.items')
# Add the logic of your metric here.
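
The diff view truncates the example above. For completeness, here is a minimal self-contained sketch of what a finished ``calculate_metric`` could look like; the coverage-style logic, the class name, and the use of ``'data.num_items'`` are illustrative assumptions, not code from this PR:

.. code:: python

    import numpy as np
    from recbole.evaluator.base_metric import AbstractMetric
    from recbole.utils import EvaluatorType

    class MyMetric(AbstractMetric):
        metric_type = EvaluatorType.RANKING
        metric_need = ['rec.items', 'data.num_items']

        def calculate_metric(self, dataobject):
            # Illustrative logic only: fraction of the item catalogue covered by the top-k lists.
            rec_items = dataobject.get('rec.items').numpy()  # assumed torch.Tensor, shape (n_users, k)
            num_items = dataobject.get('data.num_items')     # assumed int, total number of items
            coverage = len(np.unique(rec_items)) / num_items
            return {'mymetric': round(coverage, self.decimal_place)}
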
29 changes: 14 additions & 15 deletions recbole/config/configurator.py
@@ -19,7 +19,7 @@
import torch
from logging import getLogger

from recbole.evaluator import rank_metrics, value_metrics
from recbole.evaluator import metric_types, smaller_metrics
from recbole.utils import get_model, Enum, EvaluatorType, ModelType, InputType, \
general_arguments, training_arguments, evaluation_arguments, dataset_arguments, set_color

@@ -289,27 +289,22 @@ def _set_default_parameters(self):
if isinstance(metrics, str):
self.final_config_dict['metrics'] = [metrics]

eval_type = None
eval_type = set()
for metric in self.final_config_dict['metrics']:
if metric.lower() in value_metrics:
if eval_type is not None and eval_type == EvaluatorType.RANKING:
raise RuntimeError('Ranking metrics and other metrics can not be used at the same time.')
else:
eval_type = EvaluatorType.VALUE
if metric.lower() in rank_metrics:
if eval_type is not None and eval_type == EvaluatorType.VALUE:
raise RuntimeError('Ranking metrics and other metrics can not be used at the same time.')
else:
eval_type = EvaluatorType.RANKING
self.final_config_dict['eval_type'] = eval_type
if metric.lower() in metric_types:
eval_type.add(metric_types[metric.lower()])
else:
raise NotImplementedError(f"There is no metric named '{metric}'")
if len(eval_type) > 1:
raise RuntimeError('Ranking metrics and value metrics can not be used at the same time.')
self.final_config_dict['eval_type'] = eval_type.pop()

if self.final_config_dict['MODEL_TYPE'] == ModelType.SEQUENTIAL and not self.final_config_dict['repeatable']:
raise ValueError('Sequential models currently only support repeatable recommendation, '
'please set `repeatable` as `True`.')

smaller_metric = ['rmse', 'mae', 'logloss', 'averagepopularity', 'giniindex']
valid_metric = self.final_config_dict['valid_metric'].split('@')[0]
self.final_config_dict['valid_metric_bigger'] = False if valid_metric.lower() in smaller_metric else True
self.final_config_dict['valid_metric_bigger'] = False if valid_metric.lower() in smaller_metrics else True

topk = self.final_config_dict['topk']
if isinstance(topk, (int, list)):
@@ -342,6 +337,10 @@ def _set_default_parameters(self):
if op_args not in self.final_config_dict['eval_args']:
self.final_config_dict['eval_args'][op_args] = default_eval_args[op_args]

if (self.final_config_dict['eval_args']['mode'] == 'full'
and self.final_config_dict['eval_type'] == EvaluatorType.VALUE):
raise NotImplementedError('Full sort evaluation do not match value-based metrics!')

def _init_device(self):
use_gpu = self.final_config_dict['use_gpu']
if use_gpu:
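
As a reading aid, here is a standalone sketch of the registry-driven type resolution introduced above. The dictionary contents and the helper name ``resolve_eval_type`` are illustrative; in RecBole the real mapping is built in ``recbole.evaluator.register`` from the metric classes.

.. code:: python

    from enum import Enum

    class EvaluatorType(Enum):
        RANKING = 1  # metrics whose inputs are grouped by user (top-k style)
        VALUE = 2    # metrics computed over raw scores and labels

    # Illustrative registry: metric name (lowercase) -> evaluator type.
    metric_types = {'recall': EvaluatorType.RANKING, 'ndcg': EvaluatorType.RANKING,
                    'rmse': EvaluatorType.VALUE, 'logloss': EvaluatorType.VALUE}

    def resolve_eval_type(metrics):
        found = set()
        for name in metrics:
            if name.lower() not in metric_types:
                raise NotImplementedError(f"There is no metric named '{name}'")
            found.add(metric_types[name.lower()])
        if len(found) > 1:
            raise RuntimeError('Ranking metrics and value metrics can not be used at the same time.')
        return found.pop()

    # resolve_eval_type(['Recall', 'NDCG'])  ->  EvaluatorType.RANKING
    # resolve_eval_type(['Recall', 'RMSE'])  ->  RuntimeError
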
30 changes: 20 additions & 10 deletions recbole/evaluator/base_metric.py
@@ -3,7 +3,7 @@
# @email : tsotfsk@outlook.com

# UPDATE
# @Time : 2020/10/21, 2021/7/18
# @Time : 2020/10/21, 2021/8/29
# @Author : Kaiyuan Li, Zhichao Feng
# @email : tsotfsk@outlook.com, fzcbupt@gmail.com

@@ -13,6 +13,7 @@
"""

import torch
from recbole.utils import EvaluatorType


class AbstractMetric(object):
@@ -22,6 +23,8 @@ class AbstractMetric(object):
Args:
config (Config): the config of evaluator.
"""
smaller = False

def __init__(self, config):
self.decimal_place = config['metric_decimal_place']

@@ -32,7 +35,7 @@ def calculate_metric(self, dataobject):
dataobject(DataStruct): it contains all the information needed to calculate metrics.

Returns:
dict: such as ``{'Hit@10': 3153, 'Hit@20': 0.3824}``
dict: such as ``{'metric@10': 3153, 'metric@20': 0.3824}``
"""
raise NotImplementedError('Method [calculate_metric] should be implemented.')

@@ -44,12 +47,16 @@ class TopkMetric(AbstractMetric):
Args:
config (Config): The config of evaluator.
"""
metric_type = EvaluatorType.RANKING
metric_need = ['rec.topk']

def __init__(self, config):
super().__init__(config)
self.topk = config['topk']

def used_info(self, dataobject):
"""Get the bool matrix indicating whether the corresponding item is positive.
"""Get the bool matrix indicating whether the corresponding item is positive
and number of positive items for each user.
"""
rec_mat = dataobject.get('rec.topk')
topk_idx, pos_len_list = torch.split(rec_mat, [max(self.topk), 1], dim=1)
@@ -60,7 +67,7 @@ def topk_result(self, metric, value):

Args:
metric(str): the name of calculated metric.
value(np.array): metrics for each user, including values from `metric@1` to `metric@max(self.topk)`.
value(numpy.ndarray): metrics for each user, including values from `metric@1` to `metric@max(self.topk)`.

Returns:
dict: metric values required in the configuration.
@@ -76,12 +83,12 @@ def metric_info(self, pos_index, pos_len=None):
"""Calculate the value of the metric.

Args:
pos_index(np.array): a bool matrix, shape like ``n_users * max(topk)``. Item with the j-th highest score \
of i-th user is positive if ``pos_index[i][j] = True`` otherwise negative.
pos_len(np.array): a vector representing the number of positive items per user, shape like ``(n_users,)``.
pos_index(numpy.ndarray): a bool matrix, shape of ``n_users * max(topk)``. The item with the (j+1)-th \
highest score of i-th user is positive if ``pos_index[i][j] == True`` and negative otherwise.
pos_len(numpy.ndarray): a vector representing the number of positive items per user, shape of ``(n_users,)``.

Returns:
np.array: metrics for each user, including values from `metric@1` to `metric@max(self.topk)`.
numpy.ndarray: metrics for each user, including values from `metric@1` to `metric@max(self.topk)`.
"""
raise NotImplementedError('Method [metric_info] of top-k metric should be implemented.')

@@ -93,6 +100,9 @@ class LossMetric(AbstractMetric):
Args:
config (Config): The config of evaluator.
"""
metric_type = EvaluatorType.VALUE
metric_need = ['rec.score', 'data.label']

def __init__(self, config):
super().__init__(config)

@@ -112,8 +122,8 @@ def metric_info(self, preds, trues):
"""Calculate the value of the metric.

Args:
preds (np.array): the scores predicted by model, a one-dimensional vector.
trues (np.array): the label of items, which has the same shape as ``preds``.
preds (numpy.ndarray): the scores predicted by model, a one-dimensional vector.
trues (numpy.ndarray): the label of items, which has the same shape as ``preds``.

Returns:
float: The value of the metric.
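
To make the new class-level properties concrete, here is a hedged sketch of a top-k metric built on ``TopkMetric``; it inherits ``metric_type = EvaluatorType.RANKING`` and ``metric_need = ['rec.topk']``. The hit-style logic and the name ``MyHit`` are illustrative, not part of this diff.

.. code:: python

    import numpy as np
    from recbole.evaluator.base_metric import TopkMetric

    class MyHit(TopkMetric):
        def calculate_metric(self, dataobject):
            pos_index, pos_len = self.used_info(dataobject)  # bool matrix and positives per user
            result = self.metric_info(pos_index)             # per-user values for k = 1 .. max(topk)
            return self.topk_result('myhit', result)         # e.g. {'myhit@10': ..., 'myhit@20': ...}

        def metric_info(self, pos_index, pos_len=None):
            # 1 if at least one positive item appears among the top-j recommendations, per user and per j.
            return (np.cumsum(pos_index, axis=1) > 0).astype(int)
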
12 changes: 2 additions & 10 deletions recbole/evaluator/evaluator.py
@@ -8,9 +8,8 @@
#####################################
"""

from recbole.evaluator.metrics import metrics_dict
from recbole.evaluator.register import metrics_dict
from recbole.evaluator.collector import DataStruct
from recbole.evaluator.register import value_metrics


class Evaluator(object):
@@ -20,7 +19,6 @@ class Evaluator(object):
def __init__(self, config):
self.config = config
self.metrics = [metric.lower() for metric in self.config['metrics']]
self._check_args()
self.metric_class = {}

for metric in self.metrics:
@@ -33,7 +31,7 @@ def evaluate(self, dataobject: DataStruct):
dataobject (DataStruct): It contains all the information needed for metrics.

Returns:
dict: such as ``{'Hit@20': 0.3824, 'Recall@20': 0.0527, 'Hit@10': 0.3153, 'GAUC': 0.9236}``
dict: such as ``{'hit@20': 0.3824, 'recall@20': 0.0527, 'hit@10': 0.3153, 'recall@10': 0.0329, 'gauc': 0.9236}``

"""
result_dict = {}
@@ -42,9 +40,3 @@ def evaluate(self, dataobject: DataStruct):
result_dict.update(metric_val)
return result_dict

def _check_args(self):
# Check Loss
if set(self.metrics) & set(value_metrics):
is_full = 'full' in self.config['eval_args']['mode']
if is_full:
raise NotImplementedError('Full sort evaluation do not match the metrics!')
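
With ``_check_args`` removed, the mode/metric compatibility check now lives in the Config; using the evaluator itself is unchanged. A hedged usage sketch, assuming a built ``Config`` and a ``DataStruct`` named ``struct`` that the Collector has filled during evaluation (model, dataset, and result values are illustrative):

.. code:: python

    from recbole.config import Config
    from recbole.evaluator.evaluator import Evaluator

    config = Config(model='BPR', dataset='ml-100k',
                    config_dict={'metrics': ['Recall', 'NDCG'], 'topk': [10, 20]})
    evaluator = Evaluator(config)

    # `struct` is assumed to be a DataStruct produced by the Collector.
    result = evaluator.evaluate(struct)
    # e.g. {'recall@10': 0.25, 'recall@20': 0.35, 'ndcg@10': 0.17, 'ndcg@20': 0.21}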