Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

RecencyWeightApproxBasket #25

Merged
merged 1 commit into from
Jan 14, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions jstark/features/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,9 @@
RecencyWeightedBasket90,
RecencyWeightedBasket95,
RecencyWeightedBasket99,
RecencyWeightedApproxBasket90,
RecencyWeightedApproxBasket95,
RecencyWeightedApproxBasket99,
)
from .average_basket import AverageBasket

Expand Down Expand Up @@ -69,5 +72,8 @@
"RecencyWeightedBasket90",
"RecencyWeightedBasket95",
"RecencyWeightedBasket99",
"RecencyWeightedApproxBasket90",
"RecencyWeightedApproxBasket95",
"RecencyWeightedApproxBasket99",
"AverageBasket",
]
102 changes: 88 additions & 14 deletions jstark/features/recency_weighted_basket.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,26 +8,31 @@

from jstark.feature_period import FeaturePeriod
from .basket_count import BasketCount
from .approx_basket_count import ApproxBasketCount


class RecencyWeightedBasket(DerivedFeature):
"""RecencyWeightedBasket feature"""
class RecencyWeightedApproxBasket(DerivedFeature):
"""RecencyWeightedApproxBasket"""

def __init__(
self, as_at: date, feature_period: FeaturePeriod, smoothing_factor: float
) -> None:
super().__init__(as_at, feature_period)
self.__smoothing_factor = smoothing_factor

@property
def smoothing_factor(self) -> float:
return self.__smoothing_factor

def column_expression(self) -> Column:
expr = f.lit(0.0)
for period in range(self.feature_period.end, self.feature_period.start + 1):
expr = expr + BasketCount(
expr = expr + ApproxBasketCount(
as_at=self.as_at,
feature_period=FeaturePeriod(
self.feature_period.period_unit_of_measure, period, period
),
).column * pow(self.__smoothing_factor, period)
).column * pow(self.smoothing_factor, period)
return expr

def default_value(self) -> Column:
Expand All @@ -37,8 +42,17 @@ def default_value(self) -> Column:
def description_subject(self) -> str:
return (
"Exponentially weighted moving average, with smoothing factor of"
+ f" {self.__smoothing_factor}, of the number of baskets per "
+ f"{self.feature_period.period_unit_of_measure.name.lower()}"
+ f" {self.smoothing_factor}, of the approximate number of baskets"
+ f" per {self.feature_period.period_unit_of_measure.name.lower()}"
)

@property
def feature_name(self) -> str:
return (
"RecencyWeightedApproxBasket"
+ f"{self.feature_period.period_unit_of_measure.name.title()}s"
+ f"{int(self.smoothing_factor*100)}"
+ f"_{self.feature_period.mnemonic}"
)

@property
Expand All @@ -49,7 +63,53 @@ def commentary(self) -> str:
+ " is an alternative to a simple moving average which"
+ " gives greater weighting to more recent observations, thus is an"
+ " exponentially weighted moving average. It uses a smoothing factor"
+ f" between 0 & 1 which for this feature is {self.__smoothing_factor}."
+ f" between 0 & 1 which for this feature is {self.smoothing_factor}."
+ " Here the approximate number of baskets per"
+ f" {self.feature_period.period_unit_of_measure.name.lower()} is smoothed."
+ " This feature is considered to be a highly effective predictor of future"
+ " purchases, if a customer has bought a product recently then there's a"
+ " relatively high probability they will buy it again."
+ f" This is less accurate than {self.feature_name.replace('Approx', '')}"
+ " though is less computationally expensive to calculate because it "
+ " does not calculate a distinct count for each"
+ f" {self.feature_period.period_unit_of_measure.name.lower()}."
)


class RecencyWeightedBasket(RecencyWeightedApproxBasket):
"""RecencyWeightedBasket feature"""

def __init__(
self, as_at: date, feature_period: FeaturePeriod, smoothing_factor: float
) -> None:
super().__init__(as_at, feature_period, smoothing_factor)

def column_expression(self) -> Column:
expr = f.lit(0.0)
for period in range(self.feature_period.end, self.feature_period.start + 1):
expr = expr + BasketCount(
as_at=self.as_at,
feature_period=FeaturePeriod(
self.feature_period.period_unit_of_measure, period, period
),
).column * pow(super().smoothing_factor, period)
return expr

@property
def description_subject(self) -> str:
"""simply RecencyWeightedApproxBasketXX_periodmnemonic's description with the
word approximate removed"""
return super().description_subject.replace("approximate ", "")

@property
def commentary(self) -> str:
return (
"Exponential smoothing "
+ "(https://en.wikipedia.org/wiki/Exponential_smoothing)"
+ " is an alternative to a simple moving average which"
+ " gives greater weighting to more recent observations, thus is an"
+ " exponentially weighted moving average. It uses a smoothing factor"
+ f" between 0 & 1 which for this feature is {self.smoothing_factor}."
+ " Here the number of baskets per"
+ f" {self.feature_period.period_unit_of_measure.name.lower()} is smoothed."
+ " This feature is considered to be a highly effective predictor of future"
Expand All @@ -67,17 +127,16 @@ def commentary(self) -> str:
+ f" ({self.feature_name}) is for"
+ f" {self.feature_period.number_of_periods}"
+ f" {self.feature_period.period_unit_of_measure.name.lower()}"
+ f"{'s' if self.feature_period.number_of_periods>1 else ''}"
+ f"{'s' if self.feature_period.number_of_periods>1 else ''}. You might"
+ f" consider using {self.feature_name.replace('Basket', 'ApproxBasket')}"
+ " instead which is less accurate but computationally cheaper."
)

@property
def feature_name(self) -> str:
return (
"RecencyWeightedBasket"
+ f"{self.feature_period.period_unit_of_measure.name.title()}s"
+ f"{int(self.__smoothing_factor*100)}"
+ f"_{self.feature_period.mnemonic}"
)
"""simply RecencyWeightedApproxBasketXX_periodmnemonic's feature name with
"Approx" removed"""
return super().feature_name.replace("Approx", "")


class RecencyWeightedBasket90(RecencyWeightedBasket):
Expand All @@ -93,3 +152,18 @@ def __init__(self, as_at: date, feature_period: FeaturePeriod) -> None:
class RecencyWeightedBasket99(RecencyWeightedBasket):
    """RecencyWeightedBasket with the smoothing factor fixed at 0.99."""

    def __init__(self, as_at: date, feature_period: FeaturePeriod) -> None:
        # The only thing this subclass contributes is the smoothing factor.
        super().__init__(as_at, feature_period, smoothing_factor=0.99)


class RecencyWeightedApproxBasket90(RecencyWeightedApproxBasket):
    """RecencyWeightedApproxBasket with the smoothing factor fixed at 0.9."""

    def __init__(self, as_at: date, feature_period: FeaturePeriod) -> None:
        # The only thing this subclass contributes is the smoothing factor.
        super().__init__(as_at, feature_period, smoothing_factor=0.9)


class RecencyWeightedApproxBasket95(RecencyWeightedApproxBasket):
    """RecencyWeightedApproxBasket with the smoothing factor fixed at 0.95."""

    def __init__(self, as_at: date, feature_period: FeaturePeriod) -> None:
        # The only thing this subclass contributes is the smoothing factor.
        super().__init__(as_at, feature_period, smoothing_factor=0.95)


class RecencyWeightedApproxBasket99(RecencyWeightedApproxBasket):
    """RecencyWeightedApproxBasket with the smoothing factor fixed at 0.99."""

    def __init__(self, as_at: date, feature_period: FeaturePeriod) -> None:
        # The only thing this subclass contributes is the smoothing factor.
        super().__init__(as_at, feature_period, smoothing_factor=0.99)
6 changes: 6 additions & 0 deletions jstark/purchasing_feature_generator.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,9 @@
RecencyWeightedBasket99,
RecencyWeightedBasket90,
RecencyWeightedBasket95,
RecencyWeightedApproxBasket90,
RecencyWeightedApproxBasket95,
RecencyWeightedApproxBasket99,
AverageBasket,
)
from jstark.feature_generator import FeatureGenerator
Expand Down Expand Up @@ -86,5 +89,8 @@ def __init__(
RecencyWeightedBasket95,
RecencyWeightedBasket90,
RecencyWeightedBasket99,
RecencyWeightedApproxBasket95,
RecencyWeightedApproxBasket90,
RecencyWeightedApproxBasket99,
AverageBasket,
]
27 changes: 16 additions & 11 deletions tests/test_purchasing_feature_generator.py
Original file line number Diff line number Diff line change
Expand Up @@ -435,7 +435,7 @@ def get_dataframes_for_perweek_feature_tests(as_at_timestamp, luke_and_leia_purc
],
)
df = luke_and_leia_purchases.groupBy().agg(*fg.features)
df2 = df.select(
weeks_df = df.select(
"BasketCount_0w0",
"BasketCount_1w1",
"BasketCount_2w2",
Expand All @@ -452,26 +452,28 @@ def get_dataframes_for_perweek_feature_tests(as_at_timestamp, luke_and_leia_purc
"BasketCount_13w13",
)
df_first = df.first()
df2_first = df2.first()
weeks_df_first = weeks_df.first()
assert df_first is not None
assert df2_first is not None
return df_first, df2_first
assert weeks_df_first is not None
return df_first, weeks_df_first


def test_recencyweightedbasketweeks_luke_and_leia(
as_at_timestamp: datetime, luke_and_leia_purchases: DataFrame
):
"""Test RecencyWeightedBasketWeeks

This test verifies the correct value of RecencyWeightedBasketsWeeksXX by calculating
This test verifies the correct value of RecencyWeightedBasketWeeksXX by calculating
the BasketCount for each individual week, calculating the smoothed value for that
week, then summing all those values. This is exactly the same calculation that is
performed by the feature generator so it might be argued that this test doesn't add
any value. I don't agree that that is the case however, it helps to demonstrate
exactly what this feature provides and given that it's not an easy one to explain, I
think that has some value.

A later addition to this test also verifies RecencyWeightedApproxBasketWeeksXX
"""
df_first, df2_first = get_dataframes_for_perweek_feature_tests(
df_first, df_weeks_first = get_dataframes_for_perweek_feature_tests(
as_at_timestamp, luke_and_leia_purchases
)
recency_weighted_basket_count_weeks_90 = 0.0
Expand All @@ -481,13 +483,13 @@ def test_recencyweightedbasketweeks_luke_and_leia(
# Loop over all the weeks, calculate the smoothed value for each,
# then sum them all up
recency_weighted_basket_count_weeks_90 += float(
df2_first[f"BasketCount_{i}w{i}"]
df_weeks_first[f"BasketCount_{i}w{i}"]
) * pow(0.9, i)
recency_weighted_basket_count_weeks_95 += float(
df2_first[f"BasketCount_{i}w{i}"]
df_weeks_first[f"BasketCount_{i}w{i}"]
) * pow(0.95, i)
recency_weighted_basket_count_weeks_99 += float(
df2_first[f"BasketCount_{i}w{i}"]
df_weeks_first[f"BasketCount_{i}w{i}"]
) * pow(0.99, i)
assert recency_weighted_basket_count_weeks_90 == float(
df_first["RecencyWeightedBasketWeeks90_13w0"]
Expand All @@ -498,16 +500,19 @@ def test_recencyweightedbasketweeks_luke_and_leia(
assert recency_weighted_basket_count_weeks_99 == float(
df_first["RecencyWeightedBasketWeeks99_13w0"]
)
assert df_first["RecencyWeightedApproxBasketWeeks90_13w0"] == 3.943520489
assert df_first["RecencyWeightedApproxBasketWeeks95_13w0"] == 4.394755659724609
assert df_first["RecencyWeightedApproxBasketWeeks99_13w0"] == 4.864113257483641


def test_averagebasketsperweek_luke_and_leia(
as_at_timestamp: datetime, luke_and_leia_purchases: DataFrame
):
"""Test AverageBasketsPerWeek"""
df_first, df2_first = get_dataframes_for_perweek_feature_tests(
df_first, df_weeks_first = get_dataframes_for_perweek_feature_tests(
as_at_timestamp, luke_and_leia_purchases
)
n = 14
total_baskets = sum(df2_first[f"BasketCount_{i}w{i}"] for i in range(n))
total_baskets = sum(df_weeks_first[f"BasketCount_{i}w{i}"] for i in range(n))
average_baskets_per_week = total_baskets / n
assert average_baskets_per_week == df_first["AverageBasketsPerWeek_13w0"]