Skip to content

Commit

Permalink
Average Baskets per period
Browse files Browse the repository at this point in the history
  • Loading branch information
jamiekt committed Jan 13, 2023
1 parent f0a394c commit a39d5e7
Show file tree
Hide file tree
Showing 5 changed files with 82 additions and 15 deletions.
2 changes: 1 addition & 1 deletion jstark/feature_generator.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@ class FeatureGenerator(metaclass=ABCMeta):
def __init__(
self,
as_at: date,
feature_periods: List[Union[FeaturePeriod, str]] = [
feature_periods: Union[List[FeaturePeriod], List[str]] = [
FeaturePeriod(PeriodUnitOfMeasure.WEEK, 52, 0),
],
) -> None:
Expand Down
2 changes: 2 additions & 0 deletions jstark/features/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@
RecencyWeightedBasket95,
RecencyWeightedBasket99,
)
from .average_basket import AverageBasket

__all__ = [
"BaseFeature",
Expand Down Expand Up @@ -68,4 +69,5 @@
"RecencyWeightedBasket90",
"RecencyWeightedBasket95",
"RecencyWeightedBasket99",
"AverageBasket",
]
43 changes: 43 additions & 0 deletions jstark/features/average_basket.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
"Average baskets per given periodunitofmeasure feature"
from .feature import DerivedFeature

from pyspark.sql import Column
import pyspark.sql.functions as f

from jstark.feature_period import FeaturePeriod
from .basket_count import BasketCount


class AverageBasket(DerivedFeature):
    """Average number of baskets per period unit of measure.

    The value is the BasketCount over this feature's period divided by that
    period's ``number_of_periods``.
    """

    def column_expression(self) -> Column:
        """Return the BasketCount column divided by the number of periods."""
        period = self.feature_period
        # BasketCount is given a fresh FeaturePeriod carrying the same unit of
        # measure, start and end as this feature's own period.
        basket_count_period = FeaturePeriod(
            period.period_unit_of_measure,
            period.start,
            period.end,
        )
        basket_count = BasketCount(
            as_at=self.as_at,
            feature_period=basket_count_period,
        )
        return basket_count.column / period.number_of_periods

    def default_value(self) -> Column:
        """Return a null literal (``f.lit(None)``)."""
        return f.lit(None)

    @property
    def description_subject(self) -> str:
        """Description text ending with the lower-cased unit of measure name."""
        unit_name = self.feature_period.period_unit_of_measure.name.lower()
        return f"Average number of baskets per {unit_name}"

    @property
    def feature_name(self) -> str:
        """Feature name of the form ``AverageBasketsPer<Unit>_<mnemonic>``."""
        period = self.feature_period
        unit_title = period.period_unit_of_measure.name.title()
        return f"AverageBasketsPer{unit_title}_{period.mnemonic}"
4 changes: 3 additions & 1 deletion jstark/purchasing_feature_generator.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,7 @@
RecencyWeightedBasket99,
RecencyWeightedBasket90,
RecencyWeightedBasket95,
AverageBasket,
)
from jstark.feature_generator import FeatureGenerator

Expand All @@ -43,7 +44,7 @@ class PurchasingFeatureGenerator(FeatureGenerator):
def __init__(
self,
as_at: date,
feature_periods: List[Union[FeaturePeriod, str]] = [
feature_periods: Union[List[FeaturePeriod], List[str]] = [
FeaturePeriod(PeriodUnitOfMeasure.DAY, 2, 0),
FeaturePeriod(PeriodUnitOfMeasure.DAY, 4, 3),
],
Expand Down Expand Up @@ -85,4 +86,5 @@ def __init__(
RecencyWeightedBasket95,
RecencyWeightedBasket90,
RecencyWeightedBasket99,
AverageBasket,
]
46 changes: 33 additions & 13 deletions tests/test_purchasing_feature_generator.py
Original file line number Diff line number Diff line change
Expand Up @@ -413,19 +413,7 @@ def test_basketweeks_commentary(
)


def test_recencyweightedbasketweeks_luke_and_leia(
as_at_timestamp: datetime, luke_and_leia_purchases: DataFrame
):
"""Test RecencyWeightedBasketWeeks
This test verifies the correct value of RecencyWeightedBasketsWeeksXX by calculating
the BasketCount for each individual week, calculating the smoothed value for that ,
week then summing all those values. This is exactly the same calculation that is
performed by the feature generator so it might be argued that this test doesn't add
any value. I don't agree that that is the case however, it helps to demonstrate
exactly what this feature provides and given that its not an easy one to explain, I
think that has some value.
"""
def get_dataframes_for_perweek_feature_tests(as_at_timestamp, luke_and_leia_purchases):
fg = PurchasingFeatureGenerator(
as_at=as_at_timestamp.date(),
feature_periods=[
Expand Down Expand Up @@ -467,6 +455,25 @@ def test_recencyweightedbasketweeks_luke_and_leia(
df2_first = df2.first()
assert df_first is not None
assert df2_first is not None
return df_first, df2_first


def test_recencyweightedbasketweeks_luke_and_leia(
as_at_timestamp: datetime, luke_and_leia_purchases: DataFrame
):
"""Test RecencyWeightedBasketWeeks
This test verifies the correct value of RecencyWeightedBasketWeeksXX by calculating
the BasketCount for each individual week, calculating the smoothed value for that
week, then summing all those values. This is exactly the same calculation that is
performed by the feature generator, so it might be argued that this test doesn't
add any value. I don't agree that that is the case, however: it helps to
demonstrate exactly what this feature provides and, given that it's not an easy
one to explain, I think that has some value.
"""
df_first, df2_first = get_dataframes_for_perweek_feature_tests(
as_at_timestamp, luke_and_leia_purchases
)
recency_weighted_basket_count_weeks_90 = 0.0
recency_weighted_basket_count_weeks_95 = 0.0
recency_weighted_basket_count_weeks_99 = 0.0
Expand All @@ -491,3 +498,16 @@ def test_recencyweightedbasketweeks_luke_and_leia(
assert recency_weighted_basket_count_weeks_99 == float(
df_first["RecencyWeightedBasketWeeks99_13w0"]
)


def test_averagebasketsperweek_luke_and_leia(
    as_at_timestamp: datetime, luke_and_leia_purchases: DataFrame
):
    """Test AverageBasketsPerWeek

    Sums the single-week BasketCount_{i}w{i} values for weeks 0 to 13 and checks
    that their mean equals AverageBasketsPerWeek_13w0.
    """
    df_first, df2_first = get_dataframes_for_perweek_feature_tests(
        as_at_timestamp, luke_and_leia_purchases
    )
    number_of_weeks = 14  # weeks 0 through 13 inclusive
    weekly_basket_counts = [
        df2_first[f"BasketCount_{week}w{week}"] for week in range(number_of_weeks)
    ]
    expected_average = sum(weekly_basket_counts) / number_of_weeks
    assert expected_average == df_first["AverageBasketsPerWeek_13w0"]

0 comments on commit a39d5e7

Please sign in to comment.