Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account-related emails.

Already on GitHub? Sign in to your account

optimization of calc_per_user method; tests added #1

Merged
merged 1 commit into from
Jan 17, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0

### Added
- Methods for conversion `Interactions` to raw form and for getting raw interactions from `Dataset` ([#69](https://github.com/MobileTeleSystems/RecTools/pull/69))
- `ARP (Average Recommendation Popularity)` to `metrics`
- `AvgRecPopularity (Average Recommendation Popularity)` to `metrics` ([#81](https://github.com/MobileTeleSystems/RecTools/pull/81))

### Changed
- Loosened `pandas`, `torch` and `torch-light` versions for `python >= 3.8` ([#58](https://github.com/MobileTeleSystems/RecTools/pull/58))
Expand Down
6 changes: 3 additions & 3 deletions rectools/metrics/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@
`metrics.MRR`
`metrics.MeanInvUserFreq`
`metrics.IntraListDiversity`
`metrics.ARP`
`metrics.AvgRecPopularity`
`metrics.Serendipity`

Tools
Expand All @@ -50,7 +50,7 @@
)
from .diversity import IntraListDiversity
from .novelty import MeanInvUserFreq
from .popularity import ARP
from .popularity import AvgRecPopularity
from .ranking import MAP, MRR, NDCG
from .scoring import calc_metrics
from .serendipity import Serendipity
Expand All @@ -66,7 +66,7 @@
"MRR",
"MeanInvUserFreq",
"IntraListDiversity",
"ARP",
"AvgRecPopularity",
"Serendipity",
"calc_metrics",
"PairwiseDistanceCalculator",
Expand Down
29 changes: 14 additions & 15 deletions rectools/metrics/popularity.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,18 +15,15 @@
"""Popularity metrics."""

import typing as tp
from collections import Counter

import attr
import pandas as pd

from rectools import Columns
from rectools.metrics.base import MetricAtK
from rectools.utils import select_by_type


@attr.s
class ARP(MetricAtK):
class AvgRecPopularity(MetricAtK):
r"""
Average Recommendations Popularity metric.

Expand All @@ -35,7 +32,7 @@ class ARP(MetricAtK):
for this item.

.. math::
ARP@k = \frac{1}{\left|U_{t}\right|}\sum_{u\in U_{t}^{}}\frac{\sum_{i\in L_{u}}\phi (i)}{\left | L_{u} \right |}
ARP@k = \frac{1}{\left|U_{t}\right|}\sum_{u\in U_{t}^{}}\frac{\sum_{i\in L_{u}}\phi (i)}{\left| L_{u} \right|}

where
:math:`\phi (i)` is the number of times item i has been rated in the training set.
Expand All @@ -62,9 +59,9 @@ class ARP(MetricAtK):
... Columns.Item: [1, 2, 1, 3, 1, 2],
... }
... )
>>> ARP(k=1).calc_per_user(reco, prev_interactions).values
>>> AvgRecPopularity(k=1).calc_per_user(reco, prev_interactions).values
array([3., 1., 1.])
>>> ARP(k=3).calc_per_user(reco, prev_interactions).values
>>> AvgRecPopularity(k=3).calc_per_user(reco, prev_interactions).values
array([2.5, 2. , 1.5])
"""

Expand Down Expand Up @@ -110,12 +107,13 @@ def calc_per_user(
Values of metric (index - user id, values - metric value for every user).
"""
pop_items = self.get_pop(prev_interactions)
arp = reco.groupby(Columns.User).apply(
lambda x: sum(pop_items[i] for i in x[Columns.Item][: self.k]) / len(x[Columns.Item][: self.k])
reco_prepared = reco.query(f"{Columns.Rank} <= @self.k")
arp = reco_prepared.groupby(Columns.User)[Columns.Item].agg(
lambda x: sum(pop_items[i] if i in pop_items else 0 for i in x) / x.nunique()
)
return arp

def get_pop(self, prev_interactions: pd.DataFrame) -> tp.Dict[int, int]:
def get_pop(self, prev_interactions: pd.DataFrame) -> pd.Series:
"""
Calculate rating for each item in train set.

Expand All @@ -127,13 +125,14 @@ def get_pop(self, prev_interactions: pd.DataFrame) -> tp.Dict[int, int]:

Returns
-------
dict(int->int)
Set with items' popularity rating (key - item id, value - number of interactions with item in training set).
pd.Series
Series with items' popularity rating (index - item id,
value - number of interactions with item in training set).
"""
return Counter(prev_interactions[Columns.Item])
return prev_interactions[Columns.Item].value_counts()


PopularityMetric = ARP
PopularityMetric = AvgRecPopularity


def calc_popularity_metrics(
Expand Down Expand Up @@ -167,7 +166,7 @@ def calc_popularity_metrics(
results = {}

# ARP
pop_metrics: tp.Dict[str, ARP] = select_by_type(metrics, ARP)
pop_metrics: tp.Dict[str, AvgRecPopularity] = select_by_type(metrics, AvgRecPopularity)
if pop_metrics:
for name, metric in pop_metrics.items():
results[name] = metric.calc(reco, prev_interactions)
Expand Down
49 changes: 46 additions & 3 deletions tests/metrics/test_popularity.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,14 +12,15 @@
# See the License for the specific language governing permissions and
# limitations under the License.

import numpy as np
import pandas as pd
import pytest

from rectools import Columns
from rectools.metrics.popularity import ARP
from rectools.metrics.popularity import AvgRecPopularity


class TestARP:
class TestAvgRecPopularity:
@pytest.fixture
def interactions(self) -> pd.DataFrame:
interactions = pd.DataFrame(
Expand Down Expand Up @@ -54,10 +55,52 @@ def recommendations(self) -> pd.DataFrame:
def test_correct_arp_values(
self, recommendations: pd.DataFrame, interactions: pd.DataFrame, k: int, expected: pd.Series
) -> None:
arp = ARP(k)
arp = AvgRecPopularity(k)

actual = arp.calc_per_user(recommendations, interactions)
pd.testing.assert_series_equal(actual, expected, check_names=False)

actual_mean = arp.calc(recommendations, interactions)
assert actual_mean == expected.mean()

def test_when_no_interactions(
self,
recommendations: pd.DataFrame,
) -> None:
expected = pd.Series(index=recommendations[Columns.User].unique(), data=[0.0, 0.0, 0.0])
empty_interactions = pd.DataFrame(columns=[Columns.User, Columns.Item], dtype=int)
arp = AvgRecPopularity(k=2)

actual = arp.calc_per_user(recommendations, empty_interactions)
pd.testing.assert_series_equal(actual, expected, check_names=False)

actual_mean = arp.calc(recommendations, empty_interactions)
assert actual_mean == expected.mean()

@pytest.mark.parametrize(
"k,expected",
(
(1, pd.Series(index=["u1", "u2", "u3"], data=[3.0, 1.0, 1.0])),
(3, pd.Series(index=["u1", "u2", "u3"], data=[2.5, np.divide(4, 3), 1.5])),
),
)
def test_when_new_item_in_reco(self, interactions: pd.DataFrame, k: int, expected: pd.Series) -> None:
reco = pd.DataFrame(
[
["u1", "i1", 1],
["u1", "i2", 2],
["u2", "i3", 1],
["u2", "i1", 2],
["u2", "i4", 3],
["u3", "i3", 1],
["u3", "i2", 2],
],
columns=[Columns.User, Columns.Item, Columns.Rank],
)
arp = AvgRecPopularity(k)

actual = arp.calc_per_user(reco, interactions)
pd.testing.assert_series_equal(actual, expected, check_names=False)

actual_mean = arp.calc(reco, interactions)
assert actual_mean == expected.mean()
6 changes: 3 additions & 3 deletions tests/metrics/test_scoring.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,11 +19,11 @@

from rectools import Columns
from rectools.metrics import (
ARP,
MAP,
MRR,
NDCG,
Accuracy,
AvgRecPopularity,
IntraListDiversity,
MeanInvUserFreq,
PairwiseHammingDistanceCalculator,
Expand Down Expand Up @@ -77,7 +77,7 @@ def test_success(self) -> None:
"ndcg@1": NDCG(k=1, log_base=3),
"mrr@1": MRR(k=1),
"miuf": MeanInvUserFreq(k=3),
"arp": ARP(k=2),
"arp": AvgRecPopularity(k=2),
"ild": IntraListDiversity(k=3, distance_calculator=self.calculator),
"serendipity": Serendipity(k=3),
"custom": MetricAtK(k=1),
Expand Down Expand Up @@ -106,7 +106,7 @@ def test_success(self) -> None:
(Precision(k=1), ["reco"]),
(MAP(k=1), ["reco"]),
(MeanInvUserFreq(k=1), ["reco"]),
(ARP(k=1), ["reco"]),
(AvgRecPopularity(k=1), ["reco"]),
(Serendipity(k=1), ["reco"]),
(Serendipity(k=1), ["reco", "interactions"]),
(Serendipity(k=1), ["reco", "interactions", "prev_interactions"]),
Expand Down
Loading