
Merge pull request #32 from jegorus/metrics
Metrics: F1Beta, MCC
sharthZ23 authored Mar 30, 2023
2 parents 39629df + 4911a9a commit 390c616
Showing 3 changed files with 125 additions and 3 deletions.
6 changes: 5 additions & 1 deletion rectools/metrics/__init__.py
@@ -23,7 +23,9 @@
-------
`metrics.Precision`
`metrics.Recall`
`metrics.F1Beta`
`metrics.Accuracy`
`metrics.MCC`
`metrics.MAP`
`metrics.NDCG`
`metrics.MRR`
@@ -39,7 +41,7 @@
`metrics.SparsePairwiseHammingDistanceCalculator`
"""

from .classification import Accuracy, Precision, Recall
from .classification import MCC, Accuracy, F1Beta, Precision, Recall
from .distances import (
    PairwiseDistanceCalculator,
    PairwiseHammingDistanceCalculator,
@@ -54,7 +56,9 @@
__all__ = (
    "Precision",
    "Recall",
    "F1Beta",
    "Accuracy",
    "MCC",
    "MAP",
    "NDCG",
    "MRR",
73 changes: 73 additions & 0 deletions rectools/metrics/classification.py
@@ -18,6 +18,7 @@
from collections import defaultdict

import attr
import numpy as np
import pandas as pd

from rectools import Columns
@@ -424,3 +425,75 @@ def make_confusions(reco: pd.DataFrame, interactions: pd.DataFrame, k: int) -> p
    merged = merge_reco(reco, interactions)
    confusion_df = calc_confusions(merged, k)
    return confusion_df


@attr.s
class F1Beta(SimpleClassificationMetric):
    """
    F-beta score for the first ``k`` recommendations.
    See more: https://en.wikipedia.org/wiki/F-score

    The metric is computed as ``(1 + beta_sqr) * p@k * r@k / (beta_sqr * p@k + r@k)``, where
        - ``beta_sqr`` is ``beta ** 2``;
        - ``p@k`` (precision@k) is ``tp / k``;
        - ``r@k`` (recall@k) is ``tp / liked``;
        - ``tp`` is the number of relevant recommendations
          among the first ``k`` items of the recommendation list;
        - ``liked`` is the number of items the user has interacted with
          (bought, liked) in the period after the recommendations were given.

    Parameters
    ----------
    k : int
        Number of items at the top of the recommendation list that will be used to calculate the metric.
    beta : float
        Weight of recall. Default: ``beta = 1.0``.
    """

    beta: float = attr.ib(default=1.0)

    def _calc_per_user_from_confusion_df(self, confusion_df: pd.DataFrame) -> pd.Series:
        beta_sqr = self.beta**2
        p_k = confusion_df[TP] / self.k
        r_k = confusion_df[TP] / confusion_df[LIKED]

        f1 = (1 + beta_sqr) * p_k * r_k / (beta_sqr * p_k + r_k)
        # When tp == 0, both p@k and r@k are 0 and the formula gives 0/0; define the metric as 0.
        f1.loc[(p_k == 0.0) & (r_k == 0.0)] = 0.0
        return f1


@attr.s
class MCC(ClassificationMetric):
    """
    Matthews correlation coefficient: the correlation between the actual and the predicted classification.
    The minimum value is -1 (perfect negative correlation), the maximum value is 1 (perfect positive
    correlation), and 0 means no correlation.
    See more: https://en.wikipedia.org/wiki/Phi_coefficient

    MCC is computed as ``(tp * tn - fp * fn) / sqrt((tp + fp)(tp + fn)(tn + fp)(tn + fn))``, where
        - ``tp`` is the number of relevant recommendations
          among the first ``k`` items of the recommendation list;
        - ``tn`` is the number of items the user has not interacted with
          (bought, liked) in the period after the recommendations were given
          and that are not among the top ``k`` items of the recommendation list;
        - ``fp`` is the number of non-relevant recommendations among the first ``k`` items of the recommendation list;
        - ``fn`` is the number of items the user has interacted with but that are not among the top ``k`` recommendations.

    Parameters
    ----------
    k : int
        Number of items at the top of the recommendation list that will be used to calculate the metric.
    """

    def _calc_per_user_from_confusion_df(self, confusion_df: pd.DataFrame, catalog: Catalog) -> pd.Series:
        tp_ = confusion_df[TP]
        tn_ = confusion_df[TN]
        fp_ = confusion_df[FP]
        fn_ = confusion_df[FN]
        mcc_numerator = tp_ * tn_ - fp_ * fn_
        mcc_denominator = np.sqrt((tp_ + fp_) * (tp_ + fn_) * (tn_ + fp_) * (tn_ + fn_))
        mcc = mcc_numerator / mcc_denominator
        mcc.loc[mcc_denominator == 0.0] = 0.0  # if the denominator is 0, the numerator is 0 as well, so define MCC as 0
        return mcc
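
For readers trying out this change, here is a minimal usage sketch (not part of the diff). The toy `reco`/`interactions` frames and the `catalog` list are made up for illustration; column names come from `rectools.Columns`, the call signatures follow the tests below, and it is assumed (as in those tests) that interactions only need user and item columns. `catalog` is the full set of item ids and is only required by `MCC`, which needs it to count true negatives.

```python
import pandas as pd

from rectools import Columns
from rectools.metrics import MCC, F1Beta

# Hypothetical toy data: top-2 recommendations for two users and their later interactions.
reco = pd.DataFrame(
    {
        Columns.User: [1, 1, 2, 2],
        Columns.Item: [10, 11, 10, 12],
        Columns.Rank: [1, 2, 1, 2],
    }
)
interactions = pd.DataFrame(
    {
        Columns.User: [1, 1, 2],
        Columns.Item: [10, 14, 12],
    }
)
catalog = [10, 11, 12, 13, 14]  # full item catalog, needed by MCC to count true negatives

f1 = F1Beta(k=2, beta=1.0)
print(f1.calc_per_user(reco, interactions))  # per-user F1@2
print(f1.calc(reco, interactions))           # mean over users

mcc = MCC(k=2)
print(mcc.calc_per_user(reco, interactions, catalog))  # per-user MCC@2
print(mcc.calc(reco, interactions, catalog))
```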
49 changes: 47 additions & 2 deletions tests/metrics/test_classification.py
@@ -19,7 +19,7 @@
import pytest

from rectools import Columns
from rectools.metrics import Accuracy, Precision, Recall
from rectools.metrics import MCC, Accuracy, F1Beta, Precision, Recall
from rectools.metrics.base import MetricAtK
from rectools.metrics.classification import ClassificationMetric, calc_classification_metrics

@@ -46,7 +46,7 @@ def setup(self) -> None:

    def test_calc(self) -> None:
        expected_metric_per_user = pd.Series(
            [0.5, 0.5, 0, 0],
            [0.5, 0.5, 0.0, 0.0],
            index=pd.Series([1, 3, 4, 5], name=Columns.User),
        )
        pd.testing.assert_series_equal(self.metric.calc_per_user(RECO, INTERACTIONS), expected_metric_per_user)
Expand Down Expand Up @@ -110,3 +110,48 @@ def test_raises_when_no_catalog_set_when_needed(self) -> None:
        metric = ClassificationMetric(k=1)
        with pytest.raises(ValueError):
            calc_classification_metrics({"m": metric}, pd.DataFrame(columns=[Columns.User, Columns.Item, Columns.Rank]))


class TestF1Beta:
    def setup(self) -> None:
        self.metric = F1Beta(k=2, beta=2 ** (1 / 2))

    def test_calc(self) -> None:
        expected_metric_per_user = pd.Series(
            [0.375, 0.75, 0, 0],
            index=pd.Series([1, 3, 4, 5], name=Columns.User),
        )
        pd.testing.assert_series_equal(self.metric.calc_per_user(RECO, INTERACTIONS), expected_metric_per_user)
        assert self.metric.calc(RECO, INTERACTIONS) == expected_metric_per_user.mean()

    def test_when_no_interactions(self) -> None:
        expected_metric_per_user = pd.Series(index=pd.Series(name=Columns.User, dtype=int), dtype=np.float64)
        pd.testing.assert_series_equal(self.metric.calc_per_user(RECO, EMPTY_INTERACTIONS), expected_metric_per_user)
        assert np.isnan(self.metric.calc(RECO, EMPTY_INTERACTIONS))


class TestMCC:
    def setup(self) -> None:
        self.metric = MCC(k=2)

    def test_calc(self) -> None:

        # tp = pd.Series([1, 1, 0, 0])
        # tn = pd.Series([6, 8, 7, 7])
        # fp = pd.Series([1, 1, 2, 2])
        # fn = pd.Series([2, 0, 1, 1])

        expected_metric_per_user = pd.Series(
            [1 / (21 ** (1 / 2)), 2 / 3, -1 / 6, -1 / 6],
            index=pd.Series([1, 3, 4, 5], name=Columns.User),
        )
        pd.testing.assert_series_equal(self.metric.calc_per_user(RECO, INTERACTIONS, CATALOG), expected_metric_per_user)
        assert self.metric.calc(RECO, INTERACTIONS, CATALOG) == expected_metric_per_user.mean()

    def test_when_no_interactions(self) -> None:
        expected_metric_per_user = pd.Series(index=pd.Series(name=Columns.User, dtype=int), dtype=np.float64)
        pd.testing.assert_series_equal(
            self.metric.calc_per_user(RECO, EMPTY_INTERACTIONS, CATALOG),
            expected_metric_per_user,
        )
        assert np.isnan(self.metric.calc(RECO, EMPTY_INTERACTIONS, CATALOG))
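
As a sanity check on the expected values above: plugging the commented confusion counts for user 1 (``tp=1, tn=6, fp=1, fn=2`` at ``k=2``) into the formulas added in `classification.py` reproduces the first entry of each expected series. A small verification sketch:

```python
import numpy as np

# User 1 at k=2, taken from the commented confusion counts in TestMCC.test_calc above.
tp, tn, fp, fn = 1, 6, 1, 2
k = 2
liked = tp + fn  # items the user interacted with

# F1Beta with beta = sqrt(2), i.e. beta_sqr = 2
beta_sqr = 2
p_k, r_k = tp / k, tp / liked
f_beta = (1 + beta_sqr) * p_k * r_k / (beta_sqr * p_k + r_k)
print(f_beta)  # 0.375 -- the first expected value in TestF1Beta

# MCC
mcc = (tp * tn - fp * fn) / np.sqrt((tp + fp) * (tp + fn) * (tn + fp) * (tn + fn))
print(mcc)  # 0.2182... == 1 / sqrt(21) -- the first expected value in TestMCC
```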
