From 4d94d8ebd5959e52e36c05f3d37ebf9a120c86a3 Mon Sep 17 00:00:00 2001 From: Andrey Semenov <43339130+In48semenov@users.noreply.github.com> Date: Mon, 5 Aug 2024 21:59:08 +0300 Subject: [PATCH] Feature/debias wrapper (#152) Added DebiasWrapper for metrics --- CHANGELOG.md | 2 + rectools/metrics/__init__.py | 5 + rectools/metrics/auc.py | 57 +++++-- rectools/metrics/classification.py | 99 ++++++++---- rectools/metrics/debias.py | 205 ++++++++++++++++++++++++ rectools/metrics/ranking.py | 81 +++++++--- rectools/metrics/scoring.py | 4 +- rectools/utils/misc.py | 2 +- tests/metrics/test_auc.py | 94 +++++++++++ tests/metrics/test_classification.py | 172 ++++++++++++++++++-- tests/metrics/test_debias.py | 230 +++++++++++++++++++++++++++ tests/metrics/test_ranking.py | 83 +++++++++- tests/metrics/test_scoring.py | 53 ++++++ 13 files changed, 1008 insertions(+), 79 deletions(-) create mode 100644 rectools/metrics/debias.py create mode 100644 tests/metrics/test_debias.py diff --git a/CHANGELOG.md b/CHANGELOG.md index 78a4d44b..f7e3ca6d 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -8,6 +8,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## Unreleased +### Added +- `Debias` mechanism for classification, ranking and AUC metrics. New parameter `is_debiased` added to the `calc_from_confusion_df` and `calc_per_user_from_confusion_df` methods of classification metrics, the `calc_from_fitted` and `calc_per_user_from_fitted` methods of AUC and ranking (`MAP`) metrics, and the `calc_from_merged` and `calc_per_user_from_merged` methods of ranking (`NDCG`, `MRR`) metrics. ([#152](https://github.com/MobileTeleSystems/RecTools/pull/152)) ## [0.7.0] - 29.07.2024 diff --git a/rectools/metrics/__init__.py b/rectools/metrics/__init__.py index a8516e77..3844610b 100644 --- a/rectools/metrics/__init__.py +++ b/rectools/metrics/__init__.py @@ -47,10 +47,13 @@ `metrics.PairwiseDistanceCalculator` `metrics.PairwiseHammingDistanceCalculator` `metrics.SparsePairwiseHammingDistanceCalculator` +`metrics.DebiasConfig` +`metrics.debias_interactions` """ from .auc import PAP, PartialAUC from .classification import MCC, Accuracy, F1Beta, HitRate, Precision, Recall +from .debias import DebiasConfig, debias_interactions from .distances import ( PairwiseDistanceCalculator, PairwiseHammingDistanceCalculator, @@ -89,4 +92,6 @@ "SufficientReco", "UnrepeatedReco", "CoveredUsers", + "DebiasConfig", + "debias_interactions", ) diff --git a/rectools/metrics/auc.py b/rectools/metrics/auc.py index c3da0516..df9b7cc7 100644 --- a/rectools/metrics/auc.py +++ b/rectools/metrics/auc.py @@ -21,7 +21,8 @@ from attrs import define, field from rectools import Columns -from rectools.metrics.base import MetricAtK, outer_merge_reco +from rectools.metrics.base import outer_merge_reco +from rectools.metrics.debias import DebiasableMetrikAtK, calc_debiased_fit_task, debias_interactions class InsufficientHandling(str, Enum): @@ -58,7 +59,7 @@ class AUCFitted: @define -class _AUCMetric(MetricAtK): +class _AUCMetric(DebiasableMetrikAtK): """ ROC AUC based metric base class. @@ -88,6 +89,8 @@ class _AUCMetric(MetricAtK): until the model has non-zero scores for the item in item-item similarity matrix. So with small `K` for neighbours in ItemKNN and big `K` for `recommend` and AUC based metric you will still get an error when `insufficient_handling` is set to `raise`. + debias_config : DebiasConfig, optional, default None + Config with debias method parameters (iqr_coef, random_state).
""" insufficient_handling: str = field(default="ignore") @@ -217,12 +220,17 @@ def calc_per_user(self, reco: pd.DataFrame, interactions: pd.DataFrame) -> pd.Se pd.Series Values of metric (index - user id, values - metric value for every user). """ + is_debiased = False + if self.debias_config is not None: + interactions = debias_interactions(interactions, self.debias_config) + is_debiased = True + self._check(reco, interactions=interactions) insufficient_handling_needed = self.insufficient_handling != InsufficientHandling.IGNORE fitted = self.fit(reco, interactions, self.k, insufficient_handling_needed) - return self.calc_per_user_from_fitted(fitted) + return self.calc_per_user_from_fitted(fitted, is_debiased) - def calc_from_fitted(self, fitted: AUCFitted) -> float: + def calc_from_fitted(self, fitted: AUCFitted, is_debiased: bool = False) -> float: """ Calculate metric value from fitted data. @@ -230,16 +238,18 @@ def calc_from_fitted(self, fitted: AUCFitted) -> float: ---------- fitted : AUCFitted Meta data that got from `.fit` method. + is_debiased : bool, default False + An indicator of whether the debias transformation has been applied before or not. Returns ------- float Value of metric (average between users). """ - per_user = self.calc_per_user_from_fitted(fitted) + per_user = self.calc_per_user_from_fitted(fitted, is_debiased) return per_user.mean() - def calc_per_user_from_fitted(self, fitted: AUCFitted) -> pd.Series: + def calc_per_user_from_fitted(self, fitted: AUCFitted, is_debiased: bool = False) -> pd.Series: """ Calculate metric values for all users from from fitted data. @@ -247,6 +257,8 @@ def calc_per_user_from_fitted(self, fitted: AUCFitted) -> pd.Series: ---------- fitted : AUCFitted Meta data that got from `.fit` method. + is_debiased : bool, default False + An indicator of whether the debias transformation has been applied before or not. Returns ------- @@ -307,6 +319,8 @@ class PartialAUC(_AUCMetric): until the model has non-zero scores for the item in item-item similarity matrix. So with small `K` for neighbours in ItemKNN and big `K` for `recommend` and AUC based metric you will still get an error when `insufficient_handling` is set to `raise`. + debias_config : DebiasConfig, optional, default None + Config with debias method parameters (iqr_coef, random_state). Examples -------- @@ -339,7 +353,7 @@ def _get_sufficient_reco_explanation(self) -> str: not too high. """ - def calc_per_user_from_fitted(self, fitted: AUCFitted) -> pd.Series: + def calc_per_user_from_fitted(self, fitted: AUCFitted, is_debiased: bool = False) -> pd.Series: """ Calculate metric values for all users from from fitted data. @@ -347,17 +361,18 @@ def calc_per_user_from_fitted(self, fitted: AUCFitted) -> pd.Series: ---------- fitted : AUCFitted Meta data that got from `.fit` method. + is_debiased : bool, default False + An indicator of whether the debias transformation has been applied before or not. Returns ------- pd.Series Values of metric (index - user id, values - metric value for every user). 
""" + self._check_debias(is_debiased, obj_name="AUCFitted") outer_merged = fitted.outer_merged_enriched - # Keep k first false positives for roc auc computation, keep all predicted test positives cropped = outer_merged[(outer_merged["__fp_cumsum"] < self.k) & (~outer_merged[Columns.Rank].isna())] - cropped_suf, n_pos_suf = self._handle_insufficient_cases( outer_merged=cropped, n_pos=fitted.n_pos, n_fp_insufficient=fitted.n_fp_insufficient ) @@ -415,6 +430,8 @@ class PAP(_AUCMetric): until the model has non-zero scores for the item in item-item similarity matrix. So with small `K` for neighbours in ItemKNN and big `K` for `recommend` and AUC based metric you will still get an error when `insufficient_handling` is set to `raise`. + debias_config : DebiasConfig, optional, default None + Config with debias method parameters (iqr_coef, random_state). Examples -------- @@ -447,7 +464,7 @@ def _get_sufficient_reco_explanation(self) -> str: for all users. """ - def calc_per_user_from_fitted(self, fitted: AUCFitted) -> pd.Series: + def calc_per_user_from_fitted(self, fitted: AUCFitted, is_debiased: bool = False) -> pd.Series: """ Calculate metric values for all users from outer merged recommendations. @@ -455,14 +472,16 @@ def calc_per_user_from_fitted(self, fitted: AUCFitted) -> pd.Series: ---------- fitted : AUCFitted Meta data that got from `.fit` method. + is_debiased : bool, default False + An indicator of whether the debias transformation has been applied before or not. Returns ------- pd.Series Values of metric (index - user id, values - metric value for every user). """ + self._check_debias(is_debiased, obj_name="AUCFitted") outer_merged = fitted.outer_merged_enriched - # Keep k first false positives and k first predicted test positives for roc auc computation cropped = outer_merged[ (outer_merged["__test_pos_cumsum"] <= self.k) @@ -513,12 +532,22 @@ def calc_auc_metrics( """ results = {} - k_max = max(metric.k for metric in metrics.values()) insufficient_handling_needed = any( metric.insufficient_handling != InsufficientHandling.IGNORE for metric in metrics.values() ) - fitted = _AUCMetric.fit(reco, interactions, k_max, insufficient_handling_needed) + + debiased_fit_task = calc_debiased_fit_task(metrics.values(), interactions) + fitted_debiased = {} + for debias_config_name, (k_max_d, interactions_d) in debiased_fit_task.items(): + fitted_debiased[debias_config_name] = _AUCMetric.fit( + reco, interactions_d, k_max_d, insufficient_handling_needed + ) + for name, metric in metrics.items(): - results[name] = metric.calc_from_fitted(fitted) + is_debiased = metric.debias_config is not None + results[name] = metric.calc_from_fitted( + fitted=fitted_debiased[metric.debias_config], + is_debiased=is_debiased, + ) return results diff --git a/rectools/metrics/classification.py b/rectools/metrics/classification.py index 507c31eb..e93c90f1 100644 --- a/rectools/metrics/classification.py +++ b/rectools/metrics/classification.py @@ -15,7 +15,6 @@ """Classification recommendations metrics.""" import typing as tp -from collections import defaultdict import attr import numpy as np @@ -23,7 +22,8 @@ from rectools import Columns -from .base import Catalog, MetricAtK, merge_reco +from .base import Catalog, merge_reco +from .debias import DebiasableMetrikAtK, debias_for_metric_configs, debias_interactions TP = "__TP" FP = "__FP" @@ -33,7 +33,7 @@ @attr.s -class ClassificationMetric(MetricAtK): +class ClassificationMetric(DebiasableMetrikAtK): """ Classification metric base class. 
@@ -44,6 +44,8 @@ class ClassificationMetric(MetricAtK): ---------- k : int Number of items at the top of recommendations list that will be used to calculate metric. + debias_config : DebiasConfig, optional, default None + Config with debias method parameters (iqr_coef, random_state). """ def calc(self, reco: pd.DataFrame, interactions: pd.DataFrame, catalog: Catalog) -> float: @@ -85,11 +87,16 @@ def calc_per_user(self, reco: pd.DataFrame, interactions: pd.DataFrame, catalog: pd.Series Values of metric (index - user id, values - metric value for every user). """ + is_debiased = False + if self.debias_config is not None: + interactions = debias_interactions(interactions, self.debias_config) + is_debiased = True + self._check(reco, interactions=interactions) confusion_df = make_confusions(reco, interactions, self.k) - return self.calc_per_user_from_confusion_df(confusion_df, catalog) + return self.calc_per_user_from_confusion_df(confusion_df, catalog, is_debiased) - def calc_from_confusion_df(self, confusion_df: pd.DataFrame, catalog: Catalog) -> float: + def calc_from_confusion_df(self, confusion_df: pd.DataFrame, catalog: Catalog, is_debiased: bool = False) -> float: """ Calculate metric value from prepared confusion matrix. @@ -102,16 +109,20 @@ def calc_from_confusion_df(self, confusion_df: pd.DataFrame, catalog: Catalog) - See its description for details. catalog : collection Collection of unique item ids that could be used for recommendations. + is_debiased : bool, default False + An indicator of whether the debias transformation has been applied before or not. Returns ------- float Value of metric (average between users). """ - per_user = self.calc_per_user_from_confusion_df(confusion_df, catalog) + per_user = self.calc_per_user_from_confusion_df(confusion_df, catalog, is_debiased) return per_user.mean() - def calc_per_user_from_confusion_df(self, confusion_df: pd.DataFrame, catalog: Catalog) -> pd.Series: + def calc_per_user_from_confusion_df( + self, confusion_df: pd.DataFrame, catalog: Catalog, is_debiased: bool = False + ) -> pd.Series: """ Calculate metric values for all users from prepared confusion matrix. @@ -124,12 +135,15 @@ def calc_per_user_from_confusion_df(self, confusion_df: pd.DataFrame, catalog: C See its description for details. catalog : collection Collection of unique item ids that could be used for recommendations. + is_debiased : bool, default False + An indicator of whether the debias transformation has been applied before or not. Returns ------- pd.Series Values of metric (index - user id, values - metric value for every user). """ + self._check_debias(is_debiased, obj_name="confusion_df") if TN not in confusion_df: confusion_df[TN] = len(catalog) - self.k - confusion_df[FN] return self._calc_per_user_from_confusion_df(confusion_df, catalog).rename(None) @@ -139,7 +153,7 @@ def _calc_per_user_from_confusion_df(self, confusion_df: pd.DataFrame, catalog: @attr.s -class SimpleClassificationMetric(MetricAtK): +class SimpleClassificationMetric(DebiasableMetrikAtK): """ Simple classification metric base class. @@ -150,6 +164,8 @@ class SimpleClassificationMetric(MetricAtK): ---------- k : int Number of items at the top of recommendations list that will be used to calculate metric. + debias_config : DebiasConfig, optional, default None + Config with debias method parameters (iqr_coef, random_state). 
""" def calc(self, reco: pd.DataFrame, interactions: pd.DataFrame) -> float: @@ -187,11 +203,16 @@ def calc_per_user(self, reco: pd.DataFrame, interactions: pd.DataFrame) -> pd.Se pd.Series Values of metric (index - user id, values - metric value for every user). """ + is_debiased = False + if self.debias_config is not None: + interactions = debias_interactions(interactions, self.debias_config) + is_debiased = True + self._check(reco, interactions=interactions) confusion_df = make_confusions(reco, interactions, self.k) - return self.calc_per_user_from_confusion_df(confusion_df) + return self.calc_per_user_from_confusion_df(confusion_df, is_debiased) - def calc_from_confusion_df(self, confusion_df: pd.DataFrame) -> float: + def calc_from_confusion_df(self, confusion_df: pd.DataFrame, is_debiased: bool = False) -> float: """ Calculate metric value from prepared confusion matrix. @@ -202,16 +223,18 @@ def calc_from_confusion_df(self, confusion_df: pd.DataFrame) -> float: Columns are: `Columns.User`, `LIKED`, `TP`, `FP`, `FN`. This table can be generated by `make_confusions` (or `calc_confusions`) function. See its description for details. + is_debiased : bool, default False + An indicator of whether the debias transformation has been applied before or not. Returns ------- float Value of metric (average between users). """ - per_user = self.calc_per_user_from_confusion_df(confusion_df) + per_user = self.calc_per_user_from_confusion_df(confusion_df, is_debiased) return per_user.mean() - def calc_per_user_from_confusion_df(self, confusion_df: pd.DataFrame) -> pd.Series: + def calc_per_user_from_confusion_df(self, confusion_df: pd.DataFrame, is_debiased: bool = False) -> pd.Series: """ Calculate metric values for all users from prepared confusion matrix. @@ -222,12 +245,15 @@ def calc_per_user_from_confusion_df(self, confusion_df: pd.DataFrame) -> pd.Seri Columns are: `Columns.User`, `LIKED`, `TP`, `FP`, `FN`. This table can be generated by `make_confusions` (or `calc_confusions`) function. See its description for details. + is_debiased : bool, default False + An indicator of whether the debias transformation has been applied before or not. Returns ------- pd.Series Values of metric (index - user id, values - metric value for every user). """ + self._check_debias(is_debiased, obj_name="confusion_df") return self._calc_per_user_from_confusion_df(confusion_df).rename(None) def _calc_per_user_from_confusion_df(self, confusion_df: pd.DataFrame) -> pd.Series: @@ -255,6 +281,8 @@ class Precision(SimpleClassificationMetric): Whether to calculate R-Precision instead of simple Precision. If `True` number of user true positives (`tp`) in recommendations will be divided by minimum of `k` and number of user test positives (`tp+fn`) instead of division by `k`. + debias_config : DebiasConfig, optional, default None + Config with debias method parameters (iqr_coef, random_state). """ r_precision: bool = attr.ib(default=False) @@ -280,6 +308,8 @@ class Recall(SimpleClassificationMetric): ---------- k : int Number of items in top of recommendations list that will be used to calculate metric. + debias_config : DebiasConfig, optional, default None + Config with debias method parameters (iqr_coef, random_state). """ def _calc_per_user_from_confusion_df(self, confusion_df: pd.DataFrame) -> pd.Series: @@ -303,6 +333,8 @@ class Accuracy(ClassificationMetric): ---------- k : int Number of items at the top of recommendations list that will be used to calculate metric. 
+ debias_config : DebiasConfig, optional, default None + Config with debias method parameters (iqr_coef, random_state). """ def _calc_per_user_from_confusion_df(self, confusion_df: pd.DataFrame, catalog: Catalog) -> pd.Series: @@ -334,6 +366,8 @@ class F1Beta(SimpleClassificationMetric): Number of items in top of recommendations list that will be used to calculate metric. beta : float Weight of recall. Default value: beta = 1.0 + debias_config : DebiasConfig, optional, default None + Config with debias method parameters (iqr_coef, random_state). """ beta: float = attr.ib(default=1.0) @@ -368,6 +402,8 @@ class MCC(ClassificationMetric): ---------- k : int Number of items in top of recommendations list that will be used to calculate metric. + debias_config : DebiasConfig, optional, default None + Config with debias method parameters (iqr_coef, random_state). """ def _calc_per_user_from_confusion_df(self, confusion_df: pd.DataFrame, catalog: Catalog) -> pd.Series: @@ -395,6 +431,8 @@ class HitRate(SimpleClassificationMetric): ---------- k : int Number of items in top of recommendations list that will be used to calculate metric. + debias_config : DebiasConfig, optional, default None + Config with debias method parameters (iqr_coef, random_state). """ def _calc_per_user_from_confusion_df(self, confusion_df: pd.DataFrame) -> pd.Series: @@ -440,26 +478,25 @@ def calc_classification_metrics( TypeError If unexpected metric is present in `metrics`. """ - k_map = defaultdict(list) - for name, metric in metrics.items(): - k_map[metric.k].append(name) - results = {} - for k, k_metrics in k_map.items(): - confusion_df = calc_confusions(merged, k) - - for metric_name in k_metrics: - metric = metrics[metric_name] - if isinstance(metric, SimpleClassificationMetric): - res = metric.calc_from_confusion_df(confusion_df) - elif isinstance(metric, ClassificationMetric): - if catalog is None: - raise ValueError(f"For calculating '{metric.__class__.__name__}' it's necessary to set `catalog`") - res = metric.calc_from_confusion_df(confusion_df, catalog) - else: - raise TypeError(f"Unexpected classification metric {metric}") - results[metric_name] = res - + merged_debiased = debias_for_metric_configs(metrics.values(), merged) + + confusions = {} + for metric_name, metric in metrics.items(): + k, debias_config = metric.k, metric.debias_config + confusion_task = (k, debias_config) + is_debiased = debias_config is not None + if confusion_task not in confusions: + confusions[confusion_task] = calc_confusions(merged=merged_debiased[debias_config], k=k) + + confusion_df = confusions[confusion_task] + if isinstance(metric, SimpleClassificationMetric): + res = metric.calc_from_confusion_df(confusion_df, is_debiased=is_debiased) + elif isinstance(metric, ClassificationMetric): + if catalog is None: + raise ValueError(f"For calculating '{metric.__class__.__name__}' it's necessary to set `catalog`") + res = metric.calc_from_confusion_df(confusion_df, catalog, is_debiased=is_debiased) + results[metric_name] = res return results diff --git a/rectools/metrics/debias.py b/rectools/metrics/debias.py new file mode 100644 index 00000000..2981a75c --- /dev/null +++ b/rectools/metrics/debias.py @@ -0,0 +1,205 @@ +# Copyright 2024 MTS (Mobile Telesystems) +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Debias module.""" + +import typing as tp +from collections import defaultdict + +import attr +import pandas as pd + +from rectools import Columns + +from .base import MetricAtK + + +@attr.s(frozen=True) +class DebiasConfig: + """ + Config for debiasing method parameters. + + Parameters + ---------- + iqr_coef : float, default 1.5 + The interquartile range (IQR) coefficient required to calculate the maximum accepted popularity border + (Q3 + iqr_coef * IQR), which is necessary to down-sample every item to a value that does not exceed it. + random_state : int, optional, default None + Pseudorandom number generator state to control the down-sampling. + """ + + iqr_coef: float = attr.ib(default=1.5) + random_state: tp.Optional[int] = attr.ib(default=None) + + +@attr.s +class DebiasableMetrikAtK(MetricAtK): + """ + Debiasable metric base class. + + Warning: This class should not be used directly. + Use derived classes instead. + + Parameters + ---------- + k : int + Number of items at the top of recommendations list that will be used to calculate metric. + debias_config : DebiasConfig, default None + Config with debias method parameters (iqr_coef, random_state). + """ + + debias_config: DebiasConfig = attr.ib(default=None) + + def _check_debias(self, is_debiased: bool, obj_name: str) -> None: + if not is_debiased and self.debias_config is not None: + raise ValueError( + "You have specified `debias_config` for the metric, " + f"but `{obj_name}` is not de-biased. " + f"Please de-bias `{obj_name}` and pass `is_debiased=True`, " + "or use the `calc` and `calc_per_user` methods for automatic de-biasing." + ) + + +def debias_interactions(interactions: pd.DataFrame, config: DebiasConfig) -> pd.DataFrame: + """ + Down-sample interactions by excluding some interactions with overly popular items. + + Algorithm: + + 1. Calculate the item "popularity" distribution from interactions + (here: the number of unique users that interacted with the item); + 2. Find the first (Q1) and third (Q3) quartiles of the item "popularity" distribution; + 3. Calculate the interquartile range (IQR) = Q3 - Q1; + 4. Calculate the maximum accepted popularity border: Q3 + iqr_coef * IQR; + 5. For every item whose popularity exceeds this border, randomly down-sample its interactions + so that the number of kept users does not exceed the border. + + Parameters + ---------- + interactions : pd.DataFrame + Table with previous user-item interactions, + with columns `Columns.User`, `Columns.Item`. + config : DebiasConfig + Config with debias method parameters (iqr_coef, random_state). + + Returns + ------- + pd.DataFrame + Down-sampled interactions.
+ """ + if len(interactions) == 0: + return interactions + + interactions_for_debiasing = interactions.copy() + + num_users_interacted_with_item = interactions_for_debiasing.groupby(Columns.Item, sort=False)[ + Columns.User + ].nunique() + + quantiles = num_users_interacted_with_item.quantile(q=[0.25, 0.75]) + q1, q3 = quantiles.loc[0.25], quantiles.loc[0.75] + iqr = q3 - q1 + max_border = int(q3 + config.iqr_coef * iqr) + + item_outside_max_border = num_users_interacted_with_item[num_users_interacted_with_item > max_border].index + + mask_outside_max_border = interactions_for_debiasing[Columns.Item].isin(item_outside_max_border) + interactions_result = interactions_for_debiasing[~mask_outside_max_border] + interactions_downsampling = interactions_for_debiasing[mask_outside_max_border] + + interactions_downsampling = ( + interactions_downsampling.sample(frac=1.0, random_state=config.random_state) + .groupby(Columns.Item) + .head(max_border) + ) + + result_dfs = [interactions_result, interactions_downsampling] + interactions_result = pd.concat(result_dfs, ignore_index=True) + + return interactions_result + + +def calc_debiased_fit_task( + metrics: tp.Iterable[DebiasableMetrikAtK], + interactions: pd.DataFrame, + prev_debiased_interactions: tp.Optional[tp.Dict[DebiasConfig, pd.DataFrame]] = None, +) -> tp.Dict[DebiasConfig, tp.Tuple[int, pd.DataFrame]]: + """ + Calculate, for each unique debias config, `k_max` and the debiased `interactions` + that are then applied in the `fit` methods of the corresponding metrics. + + Parameters + ---------- + metrics : tp.Iterable[DebiasableMetrikAtK] + Metric objects whose `k` values and debias configs are used to build the fit tasks. + interactions : pd.DataFrame + Interactions table or merged reco-interactions table with columns `Columns.User`, `Columns.Item` + (and `Columns.Rank` for the merged table). + prev_debiased_interactions : dict(DebiasConfig -> pd.DataFrame), optional + Debiased interactions for certain debias configs calculated earlier. + + Returns + ------- + dict(DebiasConfig -> tuple(int, pd.DataFrame)) + Dictionary where key is a debias config + and value is a tuple of the corresponding `k_max` and debiased `interactions`. + """ + debiased_interactions = debias_for_metric_configs(metrics, interactions, prev_debiased_interactions) + + max_k_for_config: tp.Dict[DebiasConfig, int] = defaultdict(int) + for metric in metrics: + max_k_for_config[metric.debias_config] = max(max_k_for_config[metric.debias_config], metric.k) + + result = { + config: (max_k_for_config[config], d_interactions) for config, d_interactions in debiased_interactions.items() + } + return result + + +def debias_for_metric_configs( + metrics: tp.Iterable[DebiasableMetrikAtK], + interactions: pd.DataFrame, + prev_debiased_interactions: tp.Optional[tp.Dict[DebiasConfig, pd.DataFrame]] = None, +) -> tp.Dict[DebiasConfig, pd.DataFrame]: + """ + Calculate debiased `interactions` for each unique debias config. + + Parameters + ---------- + metrics : tp.Iterable[DebiasableMetrikAtK] + Metric objects whose debias configs determine which debiased versions of `interactions` are needed. + interactions : pd.DataFrame + Interactions table (or merged reco-interactions table) to down-sample. + prev_debiased_interactions : dict(DebiasConfig -> pd.DataFrame), optional + Debiased interactions for certain debias configs calculated earlier. + + Returns + ------- + dict(DebiasConfig -> pd.DataFrame) + Dictionary where key is a debias config and value is the debiased `interactions`.
+ """ + configs_new = set(metric.debias_config for metric in metrics) + if prev_debiased_interactions is not None: + configs_new -= set(prev_debiased_interactions.keys()) + + debiased_interactions = { + config: debias_interactions(interactions, config) if config is not None else interactions + for config in configs_new + } + if prev_debiased_interactions is not None: + debiased_interactions = {**prev_debiased_interactions, **debiased_interactions} + + return debiased_interactions diff --git a/rectools/metrics/ranking.py b/rectools/metrics/ranking.py index ede693d1..900c08c9 100644 --- a/rectools/metrics/ranking.py +++ b/rectools/metrics/ranking.py @@ -13,6 +13,7 @@ # limitations under the License. """Ranking recommendations metrics.""" + import typing as tp import attr @@ -21,12 +22,14 @@ from scipy import sparse from rectools import Columns -from rectools.metrics.base import MetricAtK, merge_reco +from rectools.metrics.base import merge_reco from rectools.utils import log_at_base, select_by_type +from .debias import DebiasableMetrikAtK, calc_debiased_fit_task, debias_for_metric_configs, debias_interactions + @attr.s -class _RankingMetric(MetricAtK): +class _RankingMetric(DebiasableMetrikAtK): """ Ranking metric base class. @@ -37,6 +40,8 @@ class _RankingMetric(MetricAtK): ---------- k : int Number of items at the top of recommendations list that will be used to calculate metric. + debias_config : DebiasConfig, optional, default None + Config with debias method parameters (iqr_coef, random_state). """ def calc(self, reco: pd.DataFrame, interactions: pd.DataFrame) -> float: @@ -129,6 +134,8 @@ class MAP(_RankingMetric): divide_by_k : bool, default False If ``True``, ``k`` will be used as divider in ``AP@k``. If ``False``, number of relevant items for each user will be used. + debias_config : DebiasConfig, optional, default None + Config with debias method parameters (iqr_coef, random_state). Examples -------- @@ -242,12 +249,17 @@ def calc_per_user(self, reco: pd.DataFrame, interactions: pd.DataFrame) -> pd.Se pd.Series Values of metric (index - user id, values - metric value for every user). """ + is_debiased = False + if self.debias_config is not None: + interactions = debias_interactions(interactions, self.debias_config) + is_debiased = True + self._check(reco, interactions=interactions) merged_reco = merge_reco(reco, interactions) fitted = self.fit(merged_reco, k_max=self.k) - return self.calc_per_user_from_fitted(fitted) + return self.calc_per_user_from_fitted(fitted, is_debiased) - def calc_per_user_from_fitted(self, fitted: MAPFitted) -> pd.Series: + def calc_per_user_from_fitted(self, fitted: MAPFitted, is_debiased: bool = False) -> pd.Series: """ Calculate metric values for all users from fitted data. @@ -257,12 +269,15 @@ def calc_per_user_from_fitted(self, fitted: MAPFitted) -> pd.Series: ---------- fitted : MAPFitted Meta data that got from `.fit` method. + is_debiased : bool, default False + An indicator of whether the debias transformation has been applied before or not. Returns ------- pd.Series Values of metric (index - user id, values - metric value for every user). 
""" + self._check_debias(is_debiased, obj_name="MAPFitted") valid_precisions = fitted.precision_at_k[:, 1 : self.k + 1] sum_precisions = np.asarray(valid_precisions.sum(axis=1)).reshape(-1) if self.divide_by_k: @@ -272,7 +287,7 @@ def calc_per_user_from_fitted(self, fitted: MAPFitted) -> pd.Series: avg_precisions = pd.Series(sum_precisions, index=pd.Series(fitted.users, name=Columns.User)).rename(None) return avg_precisions - def calc_from_fitted(self, fitted: MAPFitted) -> float: + def calc_from_fitted(self, fitted: MAPFitted, is_debiased: bool = False) -> float: """ Calculate metric value from fitted data. @@ -282,13 +297,15 @@ def calc_from_fitted(self, fitted: MAPFitted) -> float: ---------- fitted : MAPFitted Meta data that got from `.fit` method. + is_debiased : bool, default False + An indicator of whether the debias transformation has been applied before or not. Returns ------- float Value of metric (average between users). """ - per_user = self.calc_per_user_from_fitted(fitted) + per_user = self.calc_per_user_from_fitted(fitted, is_debiased) return per_user.mean() @@ -317,10 +334,12 @@ class NDCG(_RankingMetric): Parameters ---------- - k : int + k : int Number of items at the top of recommendations list that will be used to calculate metric. - log_base : int, default ``2`` + log_base : int, default ``2`` Base of logarithm used to weight relevant items. + debias_config : DebiasConfig, optional, default None + Config with debias method parameters (iqr_coef, random_state). Examples -------- @@ -370,7 +389,7 @@ def calc_per_user(self, reco: pd.DataFrame, interactions: pd.DataFrame) -> pd.Se merged_reco = merge_reco(reco, interactions) return self.calc_per_user_from_merged(merged_reco) - def calc_from_merged(self, merged: pd.DataFrame) -> float: + def calc_from_merged(self, merged: pd.DataFrame, is_debiased: bool = False) -> float: """ Calculate metric value from merged recommendations. @@ -379,16 +398,18 @@ def calc_from_merged(self, merged: pd.DataFrame) -> float: merged : pd.DataFrame Result of merging recommendations and interactions tables. Can be obtained using `merge_reco` function. + is_debiased : bool, default False + An indicator of whether the debias transformation has been applied before or not. Returns ------- float Value of metric (average between users). """ - per_user = self.calc_per_user_from_merged(merged) + per_user = self.calc_per_user_from_merged(merged, is_debiased) return per_user.mean() - def calc_per_user_from_merged(self, merged: pd.DataFrame) -> pd.Series: + def calc_per_user_from_merged(self, merged: pd.DataFrame, is_debiased: bool = False) -> pd.Series: """ Calculate metric values for all users from merged recommendations. @@ -397,12 +418,17 @@ def calc_per_user_from_merged(self, merged: pd.DataFrame) -> pd.Series: merged : pd.DataFrame Result of merging recommendations and interactions tables. Can be obtained using `merge_reco` function. + is_debiased : bool, default False + An indicator of whether the debias transformation has been applied before or not. Returns ------- pd.Series Values of metric (index - user id, values - metric value for every user). 
""" + if not is_debiased and self.debias_config is not None: + merged = debias_interactions(merged, self.debias_config) + dcg = (merged[Columns.Rank] <= self.k).astype(int) / log_at_base(merged[Columns.Rank] + 1, self.log_base) idcg = (1 / log_at_base(np.arange(1, self.k + 1) + 1, self.log_base)).sum() ndcg = ( @@ -435,8 +461,10 @@ class MRR(_RankingMetric): Parameters ---------- - k : int + k : int Number of items at the top of recommendations list that will be used to calculate metric. + debias_config : DebiasConfig, optional, default None + Config with debias method parameters (iqr_coef, random_state). Examples -------- @@ -484,7 +512,7 @@ def calc_per_user(self, reco: pd.DataFrame, interactions: pd.DataFrame) -> pd.Se merged_reco = merge_reco(reco, interactions) return self.calc_per_user_from_merged(merged_reco) - def calc_per_user_from_merged(self, merged: pd.DataFrame) -> pd.Series: + def calc_per_user_from_merged(self, merged: pd.DataFrame, is_debiased: bool = False) -> pd.Series: """ Calculate metric values for all users from merged recommendations. @@ -493,12 +521,17 @@ def calc_per_user_from_merged(self, merged: pd.DataFrame) -> pd.Series: merged : pd.DataFrame Result of merging recommendations and interactions tables. Can be obtained using `merge_reco` function. + is_debiased : bool, default False + An indicator of whether the debias transformation has been applied before or not. Returns ------- pd.Series Values of metric (index - user id, values - metric value for every user). """ + if not is_debiased and self.debias_config is not None: + merged = debias_interactions(merged, self.debias_config) + cutted_rank = np.where(merged[Columns.Rank] <= self.k, merged[Columns.Rank], np.nan) min_rank_per_user = ( pd.DataFrame({Columns.User: merged[Columns.User], "__cutted_rank": cutted_rank}) @@ -507,7 +540,7 @@ def calc_per_user_from_merged(self, merged: pd.DataFrame) -> pd.Series: ) return (1.0 / min_rank_per_user).fillna(0).rename(None) - def calc_from_merged(self, merged: pd.DataFrame) -> float: + def calc_from_merged(self, merged: pd.DataFrame, is_debiased: bool = False) -> float: """ Calculate metric value from merged recommendations. @@ -516,13 +549,15 @@ def calc_from_merged(self, merged: pd.DataFrame) -> float: merged : pd.DataFrame Result of merging recommendations and interactions tables. Can be obtained using `merge_reco` function. + is_debiased : bool, default False + An indicator of whether the debias transformation has been applied before or not. Returns ------- float Value of metric (average between users). 
""" - per_user = self.calc_per_user_from_merged(merged) + per_user = self.calc_per_user_from_merged(merged, is_debiased) return per_user.mean() @@ -558,16 +593,24 @@ def calc_ranking_metrics( """ results = {} + merged_debiased = None for ranking_metric_cls in [NDCG, MRR]: ranking_metrics: tp.Dict[str, tp.Union[NDCG, MRR]] = select_by_type(metrics, ranking_metric_cls) + merged_debiased = debias_for_metric_configs(ranking_metrics.values(), merged) for name, metric in ranking_metrics.items(): - results[name] = metric.calc_from_merged(merged) + results[name] = metric.calc_from_merged(merged_debiased[metric.debias_config], is_debiased=True) map_metrics: tp.Dict[str, MAP] = select_by_type(metrics, MAP) if map_metrics: - k_max = max(metric.k for metric in map_metrics.values()) - fitted = MAP.fit(merged, k_max) + debiased_fit_task = calc_debiased_fit_task(map_metrics.values(), merged, merged_debiased) + fitted_debiased = {} + for debias_config, (k_max_d, merged_d) in debiased_fit_task.items(): + fitted_debiased[debias_config] = MAP.fit(merged_d, k_max_d) + for name, map_metric in map_metrics.items(): - results[name] = map_metric.calc_from_fitted(fitted) + is_debiased = map_metric.debias_config is not None + results[name] = map_metric.calc_from_fitted( + fitted=fitted_debiased[map_metric.debias_config], is_debiased=is_debiased + ) return results diff --git a/rectools/metrics/scoring.py b/rectools/metrics/scoring.py index ca7307f1..91d370b1 100644 --- a/rectools/metrics/scoring.py +++ b/rectools/metrics/scoring.py @@ -34,7 +34,7 @@ def calc_metrics( # noqa # pylint: disable=too-many-branches,too-many-locals,too-many-statements - metrics: tp.Dict[str, MetricAtK], + metrics: tp.Mapping[str, MetricAtK], reco: pd.DataFrame, interactions: tp.Optional[pd.DataFrame] = None, prev_interactions: tp.Optional[pd.DataFrame] = None, @@ -50,7 +50,7 @@ def calc_metrics( # noqa # pylint: disable=too-many-branches,too-many-locals,t Dict of metric objects to calculate, where key is metric name and value is metric object. reco : pd.DataFrame - Recommendations table with columns `Columns.User`, `Columns.Item`, `Columns.Rank`. + Recommendations table with columns `Columns.User`, `Columns.Item`, `Columns.Rank`. interactions : pd.DataFrame, optional Interactions table with columns `Columns.User`, `Columns.Item`. Obligatory only for some types of metrics. 
diff --git a/rectools/utils/misc.py b/rectools/utils/misc.py index e6b017b0..03aa6dd9 100644 --- a/rectools/utils/misc.py +++ b/rectools/utils/misc.py @@ -137,7 +137,7 @@ def is_instance(obj: tp.Any, types: tp.Union[AnyType, tp.Tuple[AnyType, ...]]) - def select_by_type( - objects: tp.Dict[tp.Any, tp.Any], + objects: tp.Mapping[tp.Any, tp.Any], types: tp.Union[AnyType, tp.Tuple[AnyType, ...]], ) -> tp.Dict[tp.Any, tp.Any]: """ diff --git a/tests/metrics/test_auc.py b/tests/metrics/test_auc.py index 9356ae30..1028810a 100644 --- a/tests/metrics/test_auc.py +++ b/tests/metrics/test_auc.py @@ -15,15 +15,18 @@ # pylint: disable=attribute-defined-outside-init import typing as tp +from copy import copy import numpy as np import pandas as pd import pytest from rectools import Columns +from rectools.metrics import DebiasConfig, debias_interactions from rectools.metrics.auc import PAP, InsufficientHandling, PartialAUC EMPTY_INTERACTIONS = pd.DataFrame(columns=[Columns.User, Columns.Item], dtype=int) +DEBIAS_CONFIG = DebiasConfig(iqr_coef=1.5, random_state=32) class TestPartialAUC: @@ -277,3 +280,94 @@ def test_when_duplicates_in_interactions_insufficient( dtype=float, ) pd.testing.assert_series_equal(metric.calc_per_user(reco, interactions), expected_metric_per_user) + + +class TestDebiasableAUCMetric: + def setup_method(self) -> None: + self.reco = pd.DataFrame( + { + Columns.User: [1, 2, 3, 3, 3, 4, 5, 5, 5, 5], + Columns.Item: [1, 2, 1, 2, 3, 1, 1, 2, 3, 5], + Columns.Rank: [9, 1, 1, 2, 3, 1, 3, 7, 9, 1], + } + ) + self.interactions = pd.DataFrame( + { + Columns.User: [1, 2, 3, 3, 3, 4, 5, 5, 5, 5], + Columns.Item: [1, 1, 1, 2, 3, 1, 1, 2, 3, 4], + } + ) + + @pytest.mark.parametrize( + "metric", + ( + PartialAUC(k=1, insufficient_handling=InsufficientHandling.IGNORE), + PartialAUC(k=3, insufficient_handling=InsufficientHandling.IGNORE), + PartialAUC(k=1, insufficient_handling=InsufficientHandling.EXCLUDE), + PartialAUC(k=1, insufficient_handling=InsufficientHandling.EXCLUDE), + PAP(k=1, insufficient_handling=InsufficientHandling.IGNORE), + PAP(k=3, insufficient_handling=InsufficientHandling.IGNORE), + PAP(k=1, insufficient_handling=InsufficientHandling.EXCLUDE), + PAP(k=3, insufficient_handling=InsufficientHandling.EXCLUDE), + ), + ) + def test_calc(self, metric: tp.Union[PartialAUC, PAP]) -> None: + debiased_metric = copy(metric) + debiased_metric.debias_config = DEBIAS_CONFIG + + debiased_interactions = debias_interactions(self.interactions, config=DEBIAS_CONFIG) + expected_metric_per_user = metric.calc_per_user(self.reco, debiased_interactions) + + actual_metric_per_user = debiased_metric.calc_per_user(self.reco, self.interactions) + actual_metric = debiased_metric.calc(self.reco, self.interactions) + + pd.testing.assert_series_equal(actual_metric_per_user, expected_metric_per_user) + assert actual_metric == expected_metric_per_user.mean() + + @pytest.mark.parametrize( + "debiased_metric", + ( + PartialAUC(k=3, insufficient_handling=InsufficientHandling.IGNORE, debias_config=DEBIAS_CONFIG), + PartialAUC(k=3, insufficient_handling=InsufficientHandling.EXCLUDE, debias_config=DEBIAS_CONFIG), + PAP(k=3, insufficient_handling=InsufficientHandling.IGNORE, debias_config=DEBIAS_CONFIG), + PAP(k=3, insufficient_handling=InsufficientHandling.EXCLUDE, debias_config=DEBIAS_CONFIG), + ), + ) + def test_when_no_interactions(self, debiased_metric: tp.Union[PartialAUC, PAP]) -> None: + expected_metric_per_user = pd.Series(index=pd.Series(name=Columns.User, dtype=int), dtype=np.float64) + + 
pd.testing.assert_series_equal( + debiased_metric.calc_per_user(self.reco, EMPTY_INTERACTIONS), + expected_metric_per_user, + ) + assert np.isnan(debiased_metric.calc(self.reco, EMPTY_INTERACTIONS)) + + @pytest.mark.parametrize( + "metric", + ( + PartialAUC(k=3), + PAP(k=3), + PartialAUC(k=3, debias_config=DEBIAS_CONFIG), + PAP(k=3, debias_config=DEBIAS_CONFIG), + ), + ) + def test_raise_when_correct_is_debias(self, metric: tp.Union[PartialAUC, PAP]) -> None: + fitted = metric.fit( + self.reco, self.interactions, metric.k, metric.insufficient_handling != InsufficientHandling.IGNORE + ) + result = metric.calc_from_fitted(fitted, is_debiased=metric.debias_config is not None) + assert isinstance(result, float) + + @pytest.mark.parametrize( + "metric", + ( + PartialAUC(k=3, debias_config=DEBIAS_CONFIG), + PAP(k=3, debias_config=DEBIAS_CONFIG), + ), + ) + def test_raise_when_incorrect_is_debias(self, metric: tp.Union[PartialAUC, PAP]) -> None: + fitted = metric.fit( + self.reco, self.interactions, metric.k, metric.insufficient_handling != InsufficientHandling.IGNORE + ) + with pytest.raises(ValueError): + metric.calc_from_fitted(fitted) diff --git a/tests/metrics/test_classification.py b/tests/metrics/test_classification.py index f76ad13a..93824690 100644 --- a/tests/metrics/test_classification.py +++ b/tests/metrics/test_classification.py @@ -14,14 +14,21 @@ # pylint: disable=attribute-defined-outside-init +from copy import copy + import numpy as np import pandas as pd import pytest from rectools import Columns -from rectools.metrics import MCC, Accuracy, F1Beta, HitRate, Precision, Recall -from rectools.metrics.base import MetricAtK -from rectools.metrics.classification import ClassificationMetric, calc_classification_metrics +from rectools.metrics import MCC, Accuracy, DebiasConfig, F1Beta, HitRate, Precision, Recall, debias_interactions +from rectools.metrics.base import merge_reco +from rectools.metrics.classification import ( + ClassificationMetric, + SimpleClassificationMetric, + calc_classification_metrics, + calc_confusions, +) RECO = pd.DataFrame( { @@ -38,6 +45,7 @@ ) CATALOG = list(range(10)) EMPTY_INTERACTIONS = pd.DataFrame(columns=[Columns.User, Columns.Item], dtype=int) +DEBIAS_CONFIG = DebiasConfig(iqr_coef=1.5, random_state=32) class TestPrecision: @@ -112,14 +120,6 @@ def test_when_no_interactions(self) -> None: class TestCalcClassificationMetrics: - def test_raises_when_unexpected_metric_type(self) -> None: - metric = MetricAtK(k=1) - with pytest.raises(TypeError): - calc_classification_metrics( - {"m": metric}, # type: ignore - pd.DataFrame(columns=[Columns.User, Columns.Item, Columns.Rank]), - ) - def test_raises_when_no_catalog_set_when_needed(self) -> None: metric = ClassificationMetric(k=1) with pytest.raises(ValueError): @@ -195,3 +195,153 @@ def test_when_no_interactions(self) -> None: expected_metric_per_user, ) assert np.isnan(self.metric.calc(RECO, EMPTY_INTERACTIONS)) + + +class TestDebiasableClassificationMetric: + @pytest.mark.parametrize( + "metric", + ( + Accuracy(k=2), + MCC(k=2), + ), + ) + def test_calc(self, metric: ClassificationMetric) -> None: + debiased_metric = copy(metric) + debiased_metric.debias_config = DEBIAS_CONFIG + + debiased_interactions = debias_interactions(INTERACTIONS, config=DEBIAS_CONFIG) + expected_metric_per_user = metric.calc_per_user(RECO, debiased_interactions, CATALOG) + + actual_metric_per_user = debiased_metric.calc_per_user(RECO, INTERACTIONS, CATALOG) + actual_metric = debiased_metric.calc(RECO, INTERACTIONS, CATALOG) + + 
pd.testing.assert_series_equal(actual_metric_per_user, expected_metric_per_user) + assert actual_metric == expected_metric_per_user.mean() + + @pytest.mark.parametrize( + "debiased_metric", + ( + Accuracy(k=2, debias_config=DEBIAS_CONFIG), + MCC(k=2, debias_config=DEBIAS_CONFIG), + ), + ) + def test_when_no_interactions(self, debiased_metric: ClassificationMetric) -> None: + expected_metric_per_user = pd.Series(index=pd.Series(name=Columns.User, dtype=int), dtype=np.float64) + + calc_per_user_result = debiased_metric.calc_per_user(RECO, EMPTY_INTERACTIONS, CATALOG) + calc_result = debiased_metric.calc(RECO, EMPTY_INTERACTIONS, CATALOG) + + pd.testing.assert_series_equal(calc_per_user_result, expected_metric_per_user) + assert np.isnan(calc_result) + + @pytest.mark.parametrize( + "metric", + ( + Accuracy(k=3), + Accuracy(k=3, debias_config=DEBIAS_CONFIG), + MCC(k=3), + MCC(k=3, debias_config=DEBIAS_CONFIG), + ), + ) + def test_raise_when_correct_is_debias(self, metric: ClassificationMetric) -> None: + merged = merge_reco(RECO, INTERACTIONS) + confusion_df = calc_confusions(merged, k=metric.k) + result = metric.calc_from_confusion_df(confusion_df, CATALOG, is_debiased=metric.debias_config is not None) + assert isinstance(result, float) + + @pytest.mark.parametrize( + "metric", + ( + Accuracy(k=3, debias_config=DEBIAS_CONFIG), + MCC(k=3, debias_config=DEBIAS_CONFIG), + ), + ) + def test_raise_when_incorrect_is_debias(self, metric: ClassificationMetric) -> None: + merged = merge_reco(RECO, INTERACTIONS) + confusion_df = calc_confusions(merged, k=metric.k) + with pytest.raises(ValueError): + metric.calc_from_confusion_df(confusion_df, CATALOG) + + +class TestDebiasableSimpleClassificationMetric: + @pytest.mark.parametrize( + "metric", + ( + Precision(k=2), + Precision(k=2, r_precision=True), + Recall(k=2), + F1Beta(k=2), + HitRate(k=2), + ), + ) + def test_calc( + self, + metric: SimpleClassificationMetric, + ) -> None: + debiased_metric = copy(metric) + debiased_metric.debias_config = DEBIAS_CONFIG + + debiased_interactions = debias_interactions(INTERACTIONS, config=DEBIAS_CONFIG) + expected_metric_per_user = metric.calc_per_user(RECO, debiased_interactions) + + actual_metric_per_user = debiased_metric.calc_per_user(RECO, INTERACTIONS) + actual_metric = debiased_metric.calc(RECO, INTERACTIONS) + + pd.testing.assert_series_equal(actual_metric_per_user, expected_metric_per_user) + assert actual_metric == expected_metric_per_user.mean() + + @pytest.mark.parametrize( + "debiased_metric", + ( + Precision(k=2, debias_config=DEBIAS_CONFIG), + Precision(k=2, r_precision=True, debias_config=DEBIAS_CONFIG), + Recall(k=2, debias_config=DEBIAS_CONFIG), + F1Beta(k=2, debias_config=DEBIAS_CONFIG), + HitRate(k=2, debias_config=DEBIAS_CONFIG), + ), + ) + def test_when_no_interactions(self, debiased_metric: SimpleClassificationMetric) -> None: + expected_metric_per_user = pd.Series(index=pd.Series(name=Columns.User, dtype=int), dtype=np.float64) + + calc_per_user_result = debiased_metric.calc_per_user(RECO, EMPTY_INTERACTIONS) + calc_result = debiased_metric.calc(RECO, EMPTY_INTERACTIONS) + + pd.testing.assert_series_equal(calc_per_user_result, expected_metric_per_user) + assert np.isnan(calc_result) + + @pytest.mark.parametrize( + "metric", + ( + Precision(k=3), + Precision(k=3, debias_config=DEBIAS_CONFIG), + Precision(k=3, r_precision=True), + Precision(k=3, r_precision=True, debias_config=DEBIAS_CONFIG), + Recall(k=3), + Recall(k=3, debias_config=DEBIAS_CONFIG), + F1Beta(k=3), + F1Beta(k=3, 
debias_config=DEBIAS_CONFIG), + HitRate(k=3), + HitRate(k=3, debias_config=DEBIAS_CONFIG), + ), + ) + def test_raise_when_correct_is_debias(self, metric: SimpleClassificationMetric) -> None: + merged = merge_reco(RECO, INTERACTIONS) + confusion_df = calc_confusions(merged, k=metric.k) + result = metric.calc_from_confusion_df(confusion_df, is_debiased=metric.debias_config is not None) + assert isinstance(result, float) + + @pytest.mark.parametrize( + "metric", + ( + Precision(k=3, debias_config=DEBIAS_CONFIG), + Precision(k=3, r_precision=True, debias_config=DEBIAS_CONFIG), + Recall(k=3, debias_config=DEBIAS_CONFIG), + F1Beta(k=3, debias_config=DEBIAS_CONFIG), + HitRate(k=3, debias_config=DEBIAS_CONFIG), + ), + ) + def test_raise_when_incorrect_is_debias(self, metric: SimpleClassificationMetric) -> None: + merged = merge_reco(RECO, INTERACTIONS) + confusion_df = calc_confusions(merged, k=metric.k) + with pytest.raises(ValueError): + metric.calc_from_confusion_df(confusion_df) diff --git a/tests/metrics/test_debias.py b/tests/metrics/test_debias.py new file mode 100644 index 00000000..d1267b35 --- /dev/null +++ b/tests/metrics/test_debias.py @@ -0,0 +1,230 @@ +# Copyright 2024 MTS (Mobile Telesystems) +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import typing as tp +from collections import defaultdict + +import pandas as pd +import pytest + +from rectools import Columns +from rectools.metrics import MAP, MCC, MRR, NDCG, PAP, Accuracy, F1Beta, HitRate, PartialAUC, Precision, Recall +from rectools.metrics.base import merge_reco +from rectools.metrics.debias import ( + DebiasConfig, + DebiasableMetrikAtK, + calc_debiased_fit_task, + debias_for_metric_configs, + debias_interactions, +) + +DEBIAS_CONFIG_DEFAULT = DebiasConfig(iqr_coef=1.5, random_state=32) + + +class TestDebias: + @pytest.fixture + def interactions(self) -> pd.DataFrame: + interactions_df = pd.DataFrame( + { + Columns.User: [1, 1, 2, 3, 3, 3, 3, 3, 3, 4, 5, 5, 5, 6, 7, 8], + Columns.Item: [1, 2, 1, 1, 2, 3, 4, 5, 6, 1, 1, 2, 3, 1, 1, 1], + } + ) + return interactions_df + + @pytest.fixture + def recommendations(self) -> pd.DataFrame: + reco_df = pd.DataFrame( + { + Columns.User: [1, 1, 2, 3, 3, 3, 3, 3, 4, 5, 5, 5, 7, 8, 9], + Columns.Item: [1, 3, 1, 1, 2, 3, 4, 5, 1, 1, 2, 3, 1, 2, 1], + Columns.Rank: [9, 1, 3, 1, 3, 5, 7, 9, 1, 1, 2, 3, 2, 1, 1], + } + ) + return reco_df + + @pytest.fixture + def empty_interactions(self) -> pd.DataFrame: + return pd.DataFrame(columns=[Columns.User, Columns.Item], dtype=int) + + def test_debias_interactions(self, interactions: pd.DataFrame, recommendations: pd.DataFrame) -> None: + merged = merge_reco(recommendations, interactions) + + expected_result = pd.DataFrame( + { + Columns.User: [1, 1, 2, 3, 3, 3, 3, 3, 3, 5, 5, 5, 7], + Columns.Item: [1, 2, 1, 1, 2, 3, 4, 5, 6, 1, 2, 3, 1], + } + ) + expected_result = pd.merge( + expected_result, + recommendations, + how="left", + on=Columns.UserItem, + ) + + interactions_downsampling = debias_interactions(interactions, config=DEBIAS_CONFIG_DEFAULT) + 
merged_downsampling = debias_interactions(merged, config=DEBIAS_CONFIG_DEFAULT) + + pd.testing.assert_frame_equal( + interactions_downsampling.sort_values(Columns.UserItem, ignore_index=True), + expected_result[Columns.UserItem], + ) + pd.testing.assert_frame_equal( + merged_downsampling.sort_values(Columns.UserItem, ignore_index=True), expected_result + ) + + def test_debias_interactions_when_no_interactions(self, empty_interactions: pd.DataFrame) -> None: + interactions_downsampling = debias_interactions(empty_interactions, config=DEBIAS_CONFIG_DEFAULT) + pd.testing.assert_frame_equal(interactions_downsampling, empty_interactions, check_like=True) + + @pytest.mark.parametrize( + "metrics, prev_metrics", + ( + ( + { + "dMAP@1": MAP(k=1, debias_config=DEBIAS_CONFIG_DEFAULT), + "dMAP@3": MAP(k=3, debias_config=DEBIAS_CONFIG_DEFAULT), + "dMAP@2": MAP(k=2, debias_config=DebiasConfig(iqr_coef=1.6, random_state=32)), + "dMAP@4": MAP(k=4, debias_config=DebiasConfig(iqr_coef=1.6, random_state=10)), + "dMAP@5": MAP(k=5, debias_config=DebiasConfig(iqr_coef=1, random_state=10)), + "MAP@1": MAP(k=1), + "MAP@5": MAP(k=5), + }, + None, + ), + ( + { + "dPartialAUC@1": PartialAUC(k=1, debias_config=DEBIAS_CONFIG_DEFAULT), + "dPartialAUC@3": PartialAUC(k=3, debias_config=DEBIAS_CONFIG_DEFAULT), + "dPartialAUC@2": PartialAUC(k=2, debias_config=DebiasConfig(iqr_coef=1.6, random_state=32)), + "dPartialAUC@4": PartialAUC(k=4, debias_config=DebiasConfig(iqr_coef=1.6, random_state=10)), + "dPartialAUC@5": PartialAUC(k=5, debias_config=DebiasConfig(iqr_coef=1, random_state=10)), + "PartialAUC@1": PartialAUC(k=1), + "PartialAUC@5": PartialAUC(k=5), + }, + None, + ), + ( + { + "dPAP@1": PAP(k=1, debias_config=DEBIAS_CONFIG_DEFAULT), + "dPAP@3": PAP(k=3, debias_config=DEBIAS_CONFIG_DEFAULT), + "dPAP@2": PAP(k=2, debias_config=DebiasConfig(iqr_coef=1.6, random_state=32)), + "dPAP@4": PAP(k=4, debias_config=DebiasConfig(iqr_coef=1.6, random_state=10)), + "dPAP@5": PAP(k=5, debias_config=DebiasConfig(iqr_coef=1, random_state=10)), + "PAP@1": PAP(k=1), + "PAP@5": PAP(k=5), + }, + { + "dPAP@3": PAP(k=3, debias_config=DEBIAS_CONFIG_DEFAULT), + "dPAP@2": PAP(k=2, debias_config=DebiasConfig(iqr_coef=1.6, random_state=32)), + }, + ), + ), + ) + def test_calc_debiased_fit_task( + self, + metrics: tp.Dict[str, DebiasableMetrikAtK], + prev_metrics: tp.Optional[tp.Dict[str, DebiasableMetrikAtK]], + interactions: pd.DataFrame, + ) -> None: + prev_debiased_interactions = None + if prev_metrics is not None: + prev_debiased_interactions = debias_for_metric_configs( + metrics=prev_metrics.values(), interactions=interactions + ) + + debiased_fit_task = calc_debiased_fit_task( + metrics=metrics.values(), interactions=interactions, prev_debiased_interactions=prev_debiased_interactions + ) + + unique_debias_config_expected = set() + k_max_expected: tp.Dict[DebiasConfig, int] = defaultdict(int) + for metric in metrics.values(): + unique_debias_config_expected.add(metric.debias_config) + k_max_expected[metric.debias_config] = max(k_max_expected[metric.debias_config], metric.k) + + assert set(debiased_fit_task.keys()) == unique_debias_config_expected + for value in k_max_expected: + assert debiased_fit_task[value][0] == k_max_expected[value] + + @pytest.mark.parametrize( + "metrics, prev_metrics", + ( + ( + { + "dMCC@1": MCC(k=1, debias_config=DEBIAS_CONFIG_DEFAULT), + "MCC@3": MCC(k=3), + "dAccuracy@5": Accuracy(k=5, debias_config=DEBIAS_CONFIG_DEFAULT), + "Accuracy@2": Accuracy(k=2), + "dPrecision@4": Precision(k=4, 
debias_config=DebiasConfig(iqr_coef=1, random_state=10)),
+                    "Precision@1": Precision(k=1),
+                    "dRecall@4": Recall(k=4, debias_config=DebiasConfig(iqr_coef=1, random_state=10)),
+                    "Recall@1": Recall(k=1),
+                    "dF1Beta@10": F1Beta(k=10, debias_config=DEBIAS_CONFIG_DEFAULT),
+                    "F1Beta@9": F1Beta(k=9),
+                    "dHitRate@4": HitRate(k=4, debias_config=DebiasConfig(iqr_coef=1.1, random_state=10)),
+                    "HitRate@6": HitRate(k=6),
+                },
+                None,
+            ),
+            (
+                {
+                    "dNDCG@1": NDCG(k=1, debias_config=DEBIAS_CONFIG_DEFAULT),
+                    "NDCG@3": NDCG(k=3),
+                    "dMRR@5": MRR(k=5, debias_config=DebiasConfig(iqr_coef=2, random_state=10)),
+                    "MRR@2": MRR(k=2),
+                },
+                None,
+            ),
+            (
+                {
+                    "dMCC@1": MCC(k=1, debias_config=DEBIAS_CONFIG_DEFAULT),
+                    "MCC@3": MCC(k=3),
+                    "dAccuracy@5": Accuracy(k=5, debias_config=DEBIAS_CONFIG_DEFAULT),
+                    "Accuracy@2": Accuracy(k=2),
+                    "dPrecision@4": Precision(k=4, debias_config=DebiasConfig(iqr_coef=1, random_state=10)),
+                    "Precision@1": Precision(k=1),
+                    "dRecall@4": Recall(k=4, debias_config=DebiasConfig(iqr_coef=1, random_state=10)),
+                    "Recall@1": Recall(k=1),
+                    "dF1Beta@10": F1Beta(k=10, debias_config=DEBIAS_CONFIG_DEFAULT),
+                    "F1Beta@9": F1Beta(k=9),
+                    "dHitRate@4": HitRate(k=4, debias_config=DebiasConfig(iqr_coef=1.1, random_state=10)),
+                    "HitRate@6": HitRate(k=6),
+                },
+                {
+                    "MCC@3": MCC(k=3),
+                    "dAccuracy@5": Accuracy(k=5, debias_config=DEBIAS_CONFIG_DEFAULT),
+                    "dRecall@4": Recall(k=4, debias_config=DebiasConfig(iqr_coef=1, random_state=10)),
+                },
+            ),
+        ),
+    )
+    def test_debias_for_metric_configs(
+        self,
+        metrics: tp.Dict[str, DebiasableMetrikAtK],
+        prev_metrics: tp.Optional[tp.Dict[str, DebiasableMetrikAtK]],
+        interactions: pd.DataFrame,
+    ) -> None:
+        prev_debiased_interactions = None
+        if prev_metrics is not None:
+            prev_debiased_interactions = debias_for_metric_configs(
+                metrics=prev_metrics.values(), interactions=interactions
+            )
+
+        debiased_interactions = debias_for_metric_configs(
+            metrics=metrics.values(), interactions=interactions, prev_debiased_interactions=prev_debiased_interactions
+        )
+        unique_debias_config_expected = set(metric.debias_config for metric in metrics.values())
+        assert set(debiased_interactions.keys()) == unique_debias_config_expected
diff --git a/tests/metrics/test_ranking.py b/tests/metrics/test_ranking.py
index ae48d01f..644d4e3e 100644
--- a/tests/metrics/test_ranking.py
+++ b/tests/metrics/test_ranking.py
@@ -15,15 +15,19 @@
 # pylint: disable=attribute-defined-outside-init
 
 import typing as tp
+from copy import copy
 
 import numpy as np
 import pandas as pd
 import pytest
 
 from rectools import Columns
-from rectools.metrics.ranking import MAP, MRR, NDCG
+from rectools.metrics import DebiasConfig, debias_interactions
+from rectools.metrics.base import merge_reco
+from rectools.metrics.ranking import MAP, MRR, NDCG, RankingMetric
 
 EMPTY_INTERACTIONS = pd.DataFrame(columns=[Columns.User, Columns.Item], dtype=int)
+DEBIAS_CONFIG = DebiasConfig(iqr_coef=1.5, random_state=32)
 
 
 class TestMAP:
@@ -193,3 +197,80 @@ def test_when_duplicates_in_interactions(self) -> None:
             dtype=float,
         )
         pd.testing.assert_series_equal(metric.calc_per_user(reco, interactions), expected_metric_per_user)
+
+
+class TestDebiasableRankingMetric:
+    def setup_method(self) -> None:
+        self.reco = pd.DataFrame(
+            {
+                Columns.User: [1, 1, 2, 3, 3, 3, 3, 3, 4, 5, 5, 5, 7, 8, 9],
+                Columns.Item: [1, 3, 1, 1, 2, 3, 4, 5, 1, 1, 2, 3, 1, 2, 1],
+                Columns.Rank: [9, 1, 3, 1, 3, 5, 7, 9, 1, 1, 2, 3, 2, 1, 1],
+            }
+        )
+        self.interactions = pd.DataFrame(
+            {
+                Columns.User: [1, 1, 2, 3, 3, 3, 3, 3, 3, 4, 5, 5, 5, 6, 7, 8],
+                Columns.Item: [1, 2, 1, 1, 2, 3, 4, 5, 6, 1, 1, 2, 3, 1, 1, 1],
+            }
+        )
+        self.merged = merge_reco(self.reco, self.interactions)
+
+    @pytest.mark.parametrize(
+        "metric",
+        (
+            MAP(k=3),
+            NDCG(k=3),
+            MRR(k=3),
+        ),
+    )
+    def test_calc(self, metric: RankingMetric) -> None:
+        debiased_metric = copy(metric)
+        debiased_metric.debias_config = DEBIAS_CONFIG
+
+        debiased_interactions = debias_interactions(self.interactions, config=DEBIAS_CONFIG)
+        expected_metric_per_user = metric.calc_per_user(self.reco, debiased_interactions)
+
+        actual_metric_per_user = debiased_metric.calc_per_user(self.reco, self.interactions)
+        actual_metric = debiased_metric.calc(self.reco, self.interactions)
+
+        pd.testing.assert_series_equal(actual_metric_per_user, expected_metric_per_user)
+        assert actual_metric == expected_metric_per_user.mean()
+
+    @pytest.mark.parametrize(
+        "debiased_metric",
+        (
+            MAP(k=3, debias_config=DEBIAS_CONFIG),
+            NDCG(k=3, debias_config=DEBIAS_CONFIG),
+            MRR(k=3, debias_config=DEBIAS_CONFIG),
+        ),
+    )
+    def test_when_no_interactions(self, debiased_metric: RankingMetric) -> None:
+        expected_metric_per_user = pd.Series(index=pd.Series(name=Columns.User, dtype=int), dtype=np.float64)
+
+        pd.testing.assert_series_equal(
+            debiased_metric.calc_per_user(self.reco, EMPTY_INTERACTIONS),
+            expected_metric_per_user,
+        )
+        assert np.isnan(debiased_metric.calc(self.reco, EMPTY_INTERACTIONS))
+
+    @pytest.mark.parametrize(
+        "metric",
+        (
+            MAP(k=3),
+            MAP(k=3, debias_config=DEBIAS_CONFIG),
+        ),
+    )
+    def test_not_raise_when_correct_is_debias(self, metric: MAP) -> None:
+        fitted = metric.fit(self.merged, metric.k)
+        result = metric.calc_from_fitted(fitted, is_debiased=metric.debias_config is not None)
+        assert isinstance(result, float)
+
+    @pytest.mark.parametrize(
+        "metric",
+        (MAP(k=3, debias_config=DEBIAS_CONFIG),),
+    )
+    def test_raise_when_incorrect_is_debias(self, metric: MAP) -> None:
+        fitted = metric.fit(self.merged, metric.k)
+        with pytest.raises(ValueError):
+            metric.calc_from_fitted(fitted)
diff --git a/tests/metrics/test_scoring.py b/tests/metrics/test_scoring.py
index f4378443..5fe3c932 100644
--- a/tests/metrics/test_scoring.py
+++ b/tests/metrics/test_scoring.py
@@ -20,12 +20,15 @@
 from rectools import Columns
 from rectools.metrics import (
     MAP,
+    MCC,
     MRR,
     NDCG,
     PAP,
     Accuracy,
     AvgRecPopularity,
     CoveredUsers,
+    DebiasConfig,
+    F1Beta,
     HitRate,
     Intersection,
     IntraListDiversity,
@@ -38,6 +41,7 @@
     SufficientReco,
     UnrepeatedReco,
     calc_metrics,
+    debias_interactions,
 )
 from rectools.metrics.base import MetricAtK
 
@@ -166,3 +170,52 @@ def test_raises(self, metric: MetricAtK, arg_names: tp.List[str]) -> None:
         kwargs = {name: getattr(self, name) for name in arg_names}
         with pytest.raises(ValueError):
             calc_metrics({"m": metric}, **kwargs)
+
+    def test_success_debias(self) -> None:
+        debias_config = DebiasConfig(iqr_coef=1.5, random_state=32)
+        debiased_metrics = {
+            "debiased_precision@3": Precision(k=3, debias_config=debias_config),
+            "debiased_rprecision@3": Precision(k=3, r_precision=True, debias_config=debias_config),
+            "debiased_recall@3": Recall(k=3, debias_config=debias_config),
+            "debiased_f1beta@3": F1Beta(k=3, debias_config=debias_config),
+            "debiased_accuracy@3": Accuracy(k=3, debias_config=debias_config),
+            "debiased_mcc@3": MCC(k=3, debias_config=debias_config),
+            "debiased_hitrate@3": HitRate(k=3, debias_config=debias_config),
+            "debiased_map@1": MAP(k=1, debias_config=debias_config),
+            "debiased_map@3": MAP(k=3, debias_config=debias_config),
+            "debiased_ndcg@3": NDCG(k=3, debias_config=debias_config),
+            "debiased_mrr@3": MRR(k=3, debias_config=debias_config),
+            "debiased_pap@3": PAP(k=3, debias_config=debias_config),
+            "debiased_partauc@3": PartialAUC(k=3, debias_config=debias_config),
+        }
+        metrics = {
+            "debiased_precision@3": Precision(k=3),
+            "debiased_rprecision@3": Precision(k=3, r_precision=True),
+            "debiased_recall@3": Recall(k=3),
+            "debiased_f1beta@3": F1Beta(k=3),
+            "debiased_accuracy@3": Accuracy(k=3),
+            "debiased_mcc@3": MCC(k=3),
+            "debiased_hitrate@3": HitRate(k=3),
+            "debiased_map@1": MAP(k=1),
+            "debiased_map@3": MAP(k=3),
+            "debiased_ndcg@3": NDCG(k=3),
+            "debiased_mrr@3": MRR(k=3),
+            "debiased_pap@3": PAP(k=3),
+            "debiased_partauc@3": PartialAUC(k=3),
+        }
+
+        debiased_interactions = debias_interactions(self.interactions, config=debias_config)
+
+        actual = calc_metrics(
+            metrics=debiased_metrics,
+            reco=self.reco,
+            interactions=self.interactions,
+            catalog=self.catalog,
+        )
+        expected = calc_metrics(
+            metrics=metrics,
+            reco=self.reco,
+            interactions=debiased_interactions,
+            catalog=self.catalog,
+        )
+        assert actual == expected
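Below is a minimal usage sketch of the debias mechanism exercised by these tests. It is not part of the patch: the toy reco/interactions frames, the metric names ("map@2", "precision@2") and variable names are illustrative assumptions. It demonstrates the invariant that test_success_debias asserts: a metric constructed with debias_config and applied to raw interactions gives the same result as the plain metric applied to interactions pre-processed with debias_interactions, because both paths use the same DebiasConfig (including random_state).

import pandas as pd

from rectools import Columns
from rectools.metrics import MAP, DebiasConfig, Precision, calc_metrics, debias_interactions

# Toy data, illustrative only (not taken from the tests above).
reco = pd.DataFrame(
    {
        Columns.User: [1, 1, 2, 2, 3],
        Columns.Item: [10, 20, 10, 30, 10],
        Columns.Rank: [1, 2, 1, 2, 1],
    }
)
interactions = pd.DataFrame(
    {
        Columns.User: [1, 2, 3, 3],
        Columns.Item: [10, 30, 10, 20],
    }
)

config = DebiasConfig(iqr_coef=1.5, random_state=32)

# Path 1: metrics carry the debias config and debias the raw interactions internally.
debiased = calc_metrics(
    {"map@2": MAP(k=2, debias_config=config), "precision@2": Precision(k=2, debias_config=config)},
    reco=reco,
    interactions=interactions,
)

# Path 2: debias the interactions explicitly, then apply plain metrics.
plain = calc_metrics(
    {"map@2": MAP(k=2), "precision@2": Precision(k=2)},
    reco=reco,
    interactions=debias_interactions(interactions, config=config),
)

assert debiased == plain  # same DebiasConfig (incl. random_state) => identical values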