Average Baskets per period

jamiekt · Jan 13, 2023 · a39d5e7 · a39d5e7
1 parent f0a394c
commit a39d5e7
Show file tree

Hide file tree

Showing 5 changed files with 82 additions and 15 deletions.
diff --git a/jstark/feature_generator.py b/jstark/feature_generator.py
@@ -17,7 +17,7 @@ class FeatureGenerator(metaclass=ABCMeta):
     def __init__(
         self,
         as_at: date,
-        feature_periods: List[Union[FeaturePeriod, str]] = [
+        feature_periods: Union[List[FeaturePeriod], List[str]] = [
             FeaturePeriod(PeriodUnitOfMeasure.WEEK, 52, 0),
         ],
     ) -> None:

diff --git a/jstark/features/__init__.py b/jstark/features/__init__.py
@@ -33,6 +33,7 @@
     RecencyWeightedBasket95,
     RecencyWeightedBasket99,
 )
+from .average_basket import AverageBasket
 
 __all__ = [
     "BaseFeature",
@@ -68,4 +69,5 @@
     "RecencyWeightedBasket90",
     "RecencyWeightedBasket95",
     "RecencyWeightedBasket99",
+    "AverageBasket",
 ]
diff --git a/jstark/features/average_basket.py b/jstark/features/average_basket.py
@@ -0,0 +1,43 @@
+"Average baskets per given periodunitofmeasure feature"
+from .feature import DerivedFeature
+
+from pyspark.sql import Column
+import pyspark.sql.functions as f
+
+from jstark.feature_period import FeaturePeriod
+from .basket_count import BasketCount
+
+
+class AverageBasket(DerivedFeature):
+    """Mean number of baskets per period unit (BasketCount / number_of_periods)."""
+
+    def column_expression(self) -> Column:
+        # Rebuild an equivalent FeaturePeriod and ask BasketCount for its column.
+        period = self.feature_period
+        whole_period = FeaturePeriod(
+            period.period_unit_of_measure,
+            period.start,
+            period.end,
+        )
+        basket_count = BasketCount(as_at=self.as_at, feature_period=whole_period)
+        # Divide by the period's number_of_periods to get a per-unit average.
+        return basket_count.column / period.number_of_periods
+
+    def default_value(self) -> Column:
+        # The default for this feature is a null literal.
+        return f.lit(None)
+
+    @property
+    def description_subject(self) -> str:
+        # e.g. "Average number of baskets per week" for a WEEK-based period.
+        unit = self.feature_period.period_unit_of_measure.name.lower()
+        return f"Average number of baskets per {unit}"
+
+    @property
+    def feature_name(self) -> str:
+        # Title-cased unit name, then "_" and the period's mnemonic, e.g.
+        # "AverageBasketsPerWeek_13w0".
+        period = self.feature_period
+        unit = period.period_unit_of_measure.name.title()
+        mnemonic = period.mnemonic
+        return f"AverageBasketsPer{unit}_{mnemonic}"
diff --git a/jstark/purchasing_feature_generator.py b/jstark/purchasing_feature_generator.py
@@ -35,6 +35,7 @@
     RecencyWeightedBasket99,
     RecencyWeightedBasket90,
     RecencyWeightedBasket95,
+    AverageBasket,
 )
 from jstark.feature_generator import FeatureGenerator
 
@@ -43,7 +44,7 @@ class PurchasingFeatureGenerator(FeatureGenerator):
     def __init__(
         self,
         as_at: date,
-        feature_periods: List[Union[FeaturePeriod, str]] = [
+        feature_periods: Union[List[FeaturePeriod], List[str]] = [
             FeaturePeriod(PeriodUnitOfMeasure.DAY, 2, 0),
             FeaturePeriod(PeriodUnitOfMeasure.DAY, 4, 3),
         ],
@@ -85,4 +86,5 @@ def __init__(
         RecencyWeightedBasket95,
         RecencyWeightedBasket90,
         RecencyWeightedBasket99,
+        AverageBasket,
     ]
diff --git a/tests/test_purchasing_feature_generator.py b/tests/test_purchasing_feature_generator.py
@@ -413,19 +413,7 @@ def test_basketweeks_commentary(
     )
 
 
-def test_recencyweightedbasketweeks_luke_and_leia(
-    as_at_timestamp: datetime, luke_and_leia_purchases: DataFrame
-):
-    """Test RecencyWeightedBasketWeeks
-
-    This test verifies the correct value of RecencyWeightedBasketsWeeksXX by calculating
-    the BasketCount for each individual week, calculating the smoothed value for that ,
-    week then summing all those values. This is exactly the same calculation that is
-    performed by the feature generator so it might be argued that this test doesn't add
-    any value. I don't agree that that is the case however, it helps to demonstrate
-    exactly what this feature provides and given that its not an easy one to explain, I
-    think that has some value.
-    """
+def get_dataframes_for_perweek_feature_tests(as_at_timestamp, luke_and_leia_purchases):
     fg = PurchasingFeatureGenerator(
         as_at=as_at_timestamp.date(),
         feature_periods=[
@@ -467,6 +455,25 @@ def test_recencyweightedbasketweeks_luke_and_leia(
     df2_first = df2.first()
     assert df_first is not None
     assert df2_first is not None
+    return df_first, df2_first
+
+
+def test_recencyweightedbasketweeks_luke_and_leia(
+    as_at_timestamp: datetime, luke_and_leia_purchases: DataFrame
+):
+    """Test RecencyWeightedBasketWeeks
+
+    This test verifies the correct value of RecencyWeightedBasketWeeksXX by calculating
+    the BasketCount for each individual week, calculating the smoothed value for that
+    week, then summing all those values. This is exactly the same calculation that is
+    performed by the feature generator so it might be argued that this test doesn't add
+    any value. I don't agree that that is the case, however: it helps to demonstrate
+    exactly what this feature provides and, given that it's not an easy one to explain,
+    I think that has some value.
+    """
+    df_first, df2_first = get_dataframes_for_perweek_feature_tests(
+        as_at_timestamp, luke_and_leia_purchases
+    )
     recency_weighted_basket_count_weeks_90 = 0.0
     recency_weighted_basket_count_weeks_95 = 0.0
     recency_weighted_basket_count_weeks_99 = 0.0
@@ -491,3 +498,16 @@ def test_recencyweightedbasketweeks_luke_and_leia(
     assert recency_weighted_basket_count_weeks_99 == float(
         df_first["RecencyWeightedBasketWeeks99_13w0"]
     )
+
+
+def test_averagebasketsperweek_luke_and_leia(
+    as_at_timestamp: datetime, luke_and_leia_purchases: DataFrame
+):
+    """AverageBasketsPerWeek_13w0 must match the mean of 14 weekly BasketCounts."""
+    import math  # tolerant float comparison below instead of a brittle ==
+
+    args = (as_at_timestamp, luke_and_leia_purchases)
+    df_first, df2_first = get_dataframes_for_perweek_feature_tests(*args)
+    n = 14  # sums BasketCount_0w0 .. BasketCount_13w13
+    expected = sum(df2_first[f"BasketCount_{i}w{i}"] for i in range(n)) / n
+    assert math.isclose(expected, df_first["AverageBasketsPerWeek_13w0"])