optimization of calc_per_user method; tests added
Соколов Михаил committed Jan 17, 2024
1 parent 491a4bb commit efe60cd
Showing 5 changed files with 67 additions and 25 deletions.
2 changes: 1 addition & 1 deletion CHANGELOG.md
@@ -10,7 +10,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0

### Added
- Methods for conversion `Interactions` to raw form and for getting raw interactions from `Dataset` ([#69](https://github.com/MobileTeleSystems/RecTools/pull/69))
- `ARP (Average Recommendation Popularity)` to `metrics`
- `AvgRecPopularity (Average Recommendation Popularity)` to `metrics` ([#81](https://github.com/MobileTeleSystems/RecTools/pull/81))

### Changed
- Loosened `pandas`, `torch` and `torch-light` versions for `python >= 3.8` ([#58](https://github.com/MobileTeleSystems/RecTools/pull/58))
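For quick orientation, here is a minimal usage sketch of the renamed metric. The toy data are hypothetical; the calls themselves (`AvgRecPopularity(k=...)`, `calc_per_user(reco, prev_interactions)`, `calc(reco, prev_interactions)`) are the ones exercised in the diff below.

```python
import pandas as pd

from rectools import Columns
from rectools.metrics import AvgRecPopularity

# Hypothetical toy data: recommendations with ranks and previous interactions.
reco = pd.DataFrame(
    {
        Columns.User: [1, 1, 2, 2, 3, 3],
        Columns.Item: [3, 7, 3, 8, 7, 8],
        Columns.Rank: [1, 2, 1, 2, 1, 2],
    }
)
prev_interactions = pd.DataFrame(
    {
        Columns.User: [1, 2, 1, 3, 1, 2],
        Columns.Item: [3, 7, 3, 8, 3, 7],
    }
)

arp = AvgRecPopularity(k=2)
per_user = arp.calc_per_user(reco, prev_interactions)  # pd.Series, one value per user
overall = arp.calc(reco, prev_interactions)            # float, mean over users
```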
6 changes: 3 additions & 3 deletions rectools/metrics/__init__.py
@@ -31,7 +31,7 @@
`metrics.MRR`
`metrics.MeanInvUserFreq`
`metrics.IntraListDiversity`
`metrics.ARP`
`metrics.AvgRecPopularity`
`metrics.Serendipity`
Tools
@@ -50,7 +50,7 @@
)
from .diversity import IntraListDiversity
from .novelty import MeanInvUserFreq
from .popularity import ARP
from .popularity import AvgRecPopularity
from .ranking import MAP, MRR, NDCG
from .scoring import calc_metrics
from .serendipity import Serendipity
@@ -66,7 +66,7 @@
"MRR",
"MeanInvUserFreq",
"IntraListDiversity",
"ARP",
"AvgRecPopularity",
"Serendipity",
"calc_metrics",
"PairwiseDistanceCalculator",
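Since the class is re-exported from `rectools.metrics` and picked up by `calc_metrics` (see the scoring tests at the end of this diff), it can be mixed with other metrics in a single call. A hedged sketch, reusing the DataFrames from the sketch above; the keyword names `reco` and `prev_interactions` are assumed to match the ones used throughout the package rather than verified here.

```python
from rectools.metrics import AvgRecPopularity, calc_metrics

# `reco` and `prev_interactions` as constructed in the sketch above.
# Keyword names are assumed; check the calc_metrics signature in rectools.metrics.scoring.
metric_values = calc_metrics(
    metrics={"arp@2": AvgRecPopularity(k=2)},
    reco=reco,
    prev_interactions=prev_interactions,
)
print(metric_values)  # e.g. {"arp@2": <float>}
```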
29 changes: 14 additions & 15 deletions rectools/metrics/popularity.py
@@ -15,18 +15,15 @@
"""Popularity metrics."""

import typing as tp
from collections import Counter

import attr
import pandas as pd

from rectools import Columns
from rectools.metrics.base import MetricAtK
from rectools.utils import select_by_type


@attr.s
class ARP(MetricAtK):
class AvgRecPopularity(MetricAtK):
r"""
Average Recommendations Popularity metric.
@@ -35,7 +32,7 @@ class ARP(MetricAtK):
for this item.
.. math::
ARP@k = \frac{1}{\left|U_{t}\right|}\sum_{u\in U_{t}^{}}\frac{\sum_{i\in L_{u}}\phi (i)}{\left | L_{u} \right |}
ARP@k = \frac{1}{\left|U_{t}\right|}\sum_{u\in U_{t}^{}}\frac{\sum_{i\in L_{u}}\phi (i)}{\left| L_{u} \right |}
where
:math:`\phi (i)` is the number of times item i has been rated in the training set.
@@ -62,9 +59,9 @@ class ARP(MetricAtK):
... Columns.Item: [1, 2, 1, 3, 1, 2],
... }
... )
>>> ARP(k=1).calc_per_user(reco, prev_interactions).values
>>> AvgRecPopularity(k=1).calc_per_user(reco, prev_interactions).values
array([3., 1., 1.])
>>> ARP(k=3).calc_per_user(reco, prev_interactions).values
>>> AvgRecPopularity(k=3).calc_per_user(reco, prev_interactions).values
array([2.5, 2. , 1.5])
"""

@@ -110,12 +107,13 @@ def calc_per_user(
Values of metric (index - user id, values - metric value for every user).
"""
pop_items = self.get_pop(prev_interactions)
arp = reco.groupby(Columns.User).apply(
lambda x: sum(pop_items[i] for i in x[Columns.Item][: self.k]) / len(x[Columns.Item][: self.k])
reco_prepared = reco.query(f"{Columns.Rank} <= @self.k")
arp = reco_prepared.groupby(Columns.User)[Columns.Item].agg(
lambda x: sum(pop_items[i] if i in pop_items else 0 for i in x) / x.nunique()
)
return arp
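The optimization above replaces the old per-group slicing (`x[Columns.Item][: self.k]`) with a single `query` on the rank column followed by one groupby aggregation over item ids. Below is a self-contained sketch of the same pattern on plain pandas; the column names and toy values are stand-ins, not the library's test data.

```python
import pandas as pd

# Item popularities as returned by get_pop (a value_counts-style Series).
pop_items = pd.Series({"i1": 3, "i2": 2, "i3": 1})

# Recommendations with explicit ranks; "i4" never occurred in training.
reco = pd.DataFrame(
    {
        "user_id": ["u1", "u1", "u2", "u2", "u2"],
        "item_id": ["i1", "i2", "i3", "i1", "i4"],
        "rank": [1, 2, 1, 2, 3],
    }
)
k = 2

# Filter to the top-k ranks once, then aggregate per user; items unseen
# in training contribute 0, mirroring the committed lambda.
top_k = reco.query("rank <= @k")
arp = top_k.groupby("user_id")["item_id"].agg(
    lambda items: sum(pop_items[i] if i in pop_items else 0 for i in items) / items.nunique()
)
print(arp)  # u1 -> (3 + 2) / 2 = 2.5, u2 -> (1 + 3) / 2 = 2.0
```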

def get_pop(self, prev_interactions: pd.DataFrame) -> tp.Dict[int, int]:
def get_pop(self, prev_interactions: pd.DataFrame) -> pd.Series:
"""
Calculate rating for each item in train set.
@@ -127,13 +125,14 @@ def get_pop(self, prev_interactions: pd.DataFrame) -> tp.Dict[int, int]:
Returns
-------
dict(int->int)
Set with items' popularity rating (key - item id, value - number of interactions with item in training set).
pd.Series
Series with items' popularity rating (index - item id,
value - number of interactions with item in training set).
"""
return Counter(prev_interactions[Columns.Item])
return prev_interactions[Columns.Item].value_counts()
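`get_pop` now returns `value_counts()` (a `pd.Series`) instead of a `Counter`. Lookups by item id behave identically for items seen in training; the difference is that a missing key raises `KeyError` on a Series, whereas a `Counter` returns 0, which is why `calc_per_user` keeps its `if i in pop_items else 0` guard. A small sketch:

```python
from collections import Counter

import pandas as pd

items = pd.Series(["i1", "i2", "i1", "i3", "i1", "i2"])

as_counter = Counter(items)       # Counter({"i1": 3, "i2": 2, "i3": 1})
as_series = items.value_counts()  # i1 -> 3, i2 -> 2, i3 -> 1

assert as_counter["i1"] == as_series["i1"] == 3
assert as_counter["i999"] == 0    # Counter defaults to 0 for unseen items
assert "i999" not in as_series    # the Series simply has no such index entry
```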


PopularityMetric = ARP
PopularityMetric = AvgRecPopularity


def calc_popularity_metrics(
@@ -167,7 +166,7 @@ def calc_popularity_metrics(
results = {}

# ARP
pop_metrics: tp.Dict[str, ARP] = select_by_type(metrics, ARP)
pop_metrics: tp.Dict[str, AvgRecPopularity] = select_by_type(metrics, AvgRecPopularity)
if pop_metrics:
for name, metric in pop_metrics.items():
results[name] = metric.calc(reco, prev_interactions)
49 changes: 46 additions & 3 deletions tests/metrics/test_popularity.py
@@ -12,14 +12,15 @@
# See the License for the specific language governing permissions and
# limitations under the License.

import numpy as np
import pandas as pd
import pytest

from rectools import Columns
from rectools.metrics.popularity import ARP
from rectools.metrics.popularity import AvgRecPopularity


class TestARP:
class TestAvgRecPopularity:
@pytest.fixture
def interactions(self) -> pd.DataFrame:
interactions = pd.DataFrame(
@@ -54,10 +55,52 @@ def recommendations(self) -> pd.DataFrame:
def test_correct_arp_values(
self, recommendations: pd.DataFrame, interactions: pd.DataFrame, k: int, expected: pd.Series
) -> None:
arp = ARP(k)
arp = AvgRecPopularity(k)

actual = arp.calc_per_user(recommendations, interactions)
pd.testing.assert_series_equal(actual, expected, check_names=False)

actual_mean = arp.calc(recommendations, interactions)
assert actual_mean == expected.mean()

def test_when_no_interactions(
self,
recommendations: pd.DataFrame,
) -> None:
expected = pd.Series(index=recommendations[Columns.User].unique(), data=[0.0, 0.0, 0.0])
empty_interactions = pd.DataFrame(columns=[Columns.User, Columns.Item], dtype=int)
arp = AvgRecPopularity(k=2)

actual = arp.calc_per_user(recommendations, empty_interactions)
pd.testing.assert_series_equal(actual, expected, check_names=False)

actual_mean = arp.calc(recommendations, empty_interactions)
assert actual_mean == expected.mean()

@pytest.mark.parametrize(
"k,expected",
(
(1, pd.Series(index=["u1", "u2", "u3"], data=[3.0, 1.0, 1.0])),
(3, pd.Series(index=["u1", "u2", "u3"], data=[2.5, np.divide(4, 3), 1.5])),
),
)
def test_when_new_item_in_reco(self, interactions: pd.DataFrame, k: int, expected: pd.Series) -> None:
reco = pd.DataFrame(
[
["u1", "i1", 1],
["u1", "i2", 2],
["u2", "i3", 1],
["u2", "i1", 2],
["u2", "i4", 3],
["u3", "i3", 1],
["u3", "i2", 2],
],
columns=[Columns.User, Columns.Item, Columns.Rank],
)
arp = AvgRecPopularity(k)

actual = arp.calc_per_user(reco, interactions)
pd.testing.assert_series_equal(actual, expected, check_names=False)

actual_mean = arp.calc(reco, interactions)
assert actual_mean == expected.mean()
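For reference, the `np.divide(4, 3)` expectation in the new-item test is consistent with fixture popularities of 3, 2 and 1 for `i1`, `i2` and `i3` (as in the docstring example): for `u2` at `k=3` the list is `i3`, `i1` and the unseen `i4`, giving (1 + 3 + 0) / 3 = 4/3, while `u1` and `u3` reduce to (3 + 2) / 2 = 2.5 and (1 + 2) / 2 = 1.5.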
6 changes: 3 additions & 3 deletions tests/metrics/test_scoring.py
@@ -19,11 +19,11 @@

from rectools import Columns
from rectools.metrics import (
ARP,
MAP,
MRR,
NDCG,
Accuracy,
AvgRecPopularity,
IntraListDiversity,
MeanInvUserFreq,
PairwiseHammingDistanceCalculator,
@@ -77,7 +77,7 @@ def test_success(self) -> None:
"ndcg@1": NDCG(k=1, log_base=3),
"mrr@1": MRR(k=1),
"miuf": MeanInvUserFreq(k=3),
"arp": ARP(k=2),
"arp": AvgRecPopularity(k=2),
"ild": IntraListDiversity(k=3, distance_calculator=self.calculator),
"serendipity": Serendipity(k=3),
"custom": MetricAtK(k=1),
@@ -106,7 +106,7 @@ def test_success(self) -> None:
(Precision(k=1), ["reco"]),
(MAP(k=1), ["reco"]),
(MeanInvUserFreq(k=1), ["reco"]),
(ARP(k=1), ["reco"]),
(AvgRecPopularity(k=1), ["reco"]),
(Serendipity(k=1), ["reco"]),
(Serendipity(k=1), ["reco", "interactions"]),
(Serendipity(k=1), ["reco", "interactions", "prev_interactions"]),