jamiekt · jamiekt · Jan 14, 2023 · Jan 14, 2023
diff --git a/jstark/features/__init__.py b/jstark/features/__init__.py
@@ -32,6 +32,9 @@
     RecencyWeightedBasket90,
     RecencyWeightedBasket95,
     RecencyWeightedBasket99,
+    RecencyWeightedApproxBasket90,
+    RecencyWeightedApproxBasket95,
+    RecencyWeightedApproxBasket99,
 )
 from .average_basket import AverageBasket
 
@@ -69,5 +72,8 @@
     "RecencyWeightedBasket90",
     "RecencyWeightedBasket95",
     "RecencyWeightedBasket99",
+    "RecencyWeightedApproxBasket90",
+    "RecencyWeightedApproxBasket95",
+    "RecencyWeightedApproxBasket99",
     "AverageBasket",
 ]
diff --git a/jstark/features/recency_weighted_basket.py b/jstark/features/recency_weighted_basket.py
@@ -8,26 +8,31 @@
 
 from jstark.feature_period import FeaturePeriod
 from .basket_count import BasketCount
+from .approx_basket_count import ApproxBasketCount
 
 
-class RecencyWeightedBasket(DerivedFeature):
-    """RecencyWeightedBasket feature"""
+class RecencyWeightedApproxBasket(DerivedFeature):
+    """RecencyWeightedApproxBasket"""
 
     def __init__(
         self, as_at: date, feature_period: FeaturePeriod, smoothing_factor: float
     ) -> None:
         super().__init__(as_at, feature_period)
         self.__smoothing_factor = smoothing_factor
 
+    @property
+    def smoothing_factor(self) -> float:
+        return self.__smoothing_factor
+
     def column_expression(self) -> Column:
         expr = f.lit(0.0)
         for period in range(self.feature_period.end, self.feature_period.start + 1):
-            expr = expr + BasketCount(
+            expr = expr + ApproxBasketCount(
                 as_at=self.as_at,
                 feature_period=FeaturePeriod(
                     self.feature_period.period_unit_of_measure, period, period
                 ),
-            ).column * pow(self.__smoothing_factor, period)
+            ).column * pow(self.smoothing_factor, period)
         return expr
 
     def default_value(self) -> Column:
@@ -37,8 +42,17 @@ def default_value(self) -> Column:
     def description_subject(self) -> str:
         return (
             "Exponentially weighted moving average, with smoothing factor of"
-            + f" {self.__smoothing_factor}, of the number of baskets per "
-            + f"{self.feature_period.period_unit_of_measure.name.lower()}"
+            + f" {self.smoothing_factor}, of the approximate number of baskets"
+            + f" per {self.feature_period.period_unit_of_measure.name.lower()}"
+        )
+
+    @property
+    def feature_name(self) -> str:
+        return (
+            "RecencyWeightedApproxBasket"
+            + f"{self.feature_period.period_unit_of_measure.name.title()}s"
+            + f"{int(self.smoothing_factor*100)}"
+            + f"_{self.feature_period.mnemonic}"
         )
 
     @property
@@ -49,7 +63,53 @@ def commentary(self) -> str:
             + " is an alternative to a simple moving average which"
             + " gives greater weighting to more recent observations, thus is an"
             + " exponentially weighted moving average. It uses a smoothing factor"
-            + f" between 0 & 1 which for this feature is {self.__smoothing_factor}."
+            + f" between 0 & 1 which for this feature is {self.smoothing_factor}."
+            + " Here the approximate number of baskets per"
+            + f" {self.feature_period.period_unit_of_measure.name.lower()} is smoothed."
+            + " This feature is considered to be a highly effective predictor of future"
+            + " purchases, if a customer has bought a product recently then there's a"
+            + " relatively high probability they will buy it again."
+            + f" This is less accurate than {self.feature_name.replace('Approx', '')}"
+            + " though is less computationally expensive to calculate because it "
+            + " does not calculate a distinct count for each"
+            + f" {self.feature_period.period_unit_of_measure.name.lower()}."
+        )
+
+
+class RecencyWeightedBasket(RecencyWeightedApproxBasket):
+    """RecencyWeightedBasket feature"""
+
+    def __init__(
+        self, as_at: date, feature_period: FeaturePeriod, smoothing_factor: float
+    ) -> None:
+        super().__init__(as_at, feature_period, smoothing_factor)
+
+    def column_expression(self) -> Column:
+        expr = f.lit(0.0)
+        for period in range(self.feature_period.end, self.feature_period.start + 1):
+            expr = expr + BasketCount(
+                as_at=self.as_at,
+                feature_period=FeaturePeriod(
+                    self.feature_period.period_unit_of_measure, period, period
+                ),
+            ).column * pow(super().smoothing_factor, period)
+        return expr
+
+    @property
+    def description_subject(self) -> str:
+        """simply RecencyWeightedApproxBasketXX_periodmnemonic's description with the
+        word approximate removed"""
+        return super().description_subject.replace("approximate ", "")
+
+    @property
+    def commentary(self) -> str:
+        return (
+            "Exponential smoothing "
+            + "(https://en.wikipedia.org/wiki/Exponential_smoothing)"
+            + " is an alternative to a simple moving average which"
+            + " gives greater weighting to more recent observations, thus is an"
+            + " exponentially weighted moving average. It uses a smoothing factor"
+            + f" between 0 & 1 which for this feature is {self.smoothing_factor}."
             + " Here the number of baskets per"
             + f" {self.feature_period.period_unit_of_measure.name.lower()} is smoothed."
             + " This feature is considered to be a highly effective predictor of future"
@@ -67,17 +127,16 @@ def commentary(self) -> str:
             + f" ({self.feature_name}) is for"
             + f" {self.feature_period.number_of_periods}"
             + f" {self.feature_period.period_unit_of_measure.name.lower()}"
-            + f"{'s' if self.feature_period.number_of_periods>1 else ''}"
+            + f"{'s' if self.feature_period.number_of_periods>1 else ''}. You might"
+            + f" consider using {self.feature_name.replace('Basket', 'ApproxBasket')}"
+            + " instead which is less accurate but computationally cheaper."
         )
 
     @property
     def feature_name(self) -> str:
-        return (
-            "RecencyWeightedBasket"
-            + f"{self.feature_period.period_unit_of_measure.name.title()}s"
-            + f"{int(self.__smoothing_factor*100)}"
-            + f"_{self.feature_period.mnemonic}"
-        )
+        """simply RecencyWeightedApproxBasketXX_periodmnemonic's feature name with
+        the substring Approx removed"""
+        return super().feature_name.replace("Approx", "")
 
 
 class RecencyWeightedBasket90(RecencyWeightedBasket):
@@ -93,3 +152,18 @@ def __init__(self, as_at: date, feature_period: FeaturePeriod) -> None:
 class RecencyWeightedBasket99(RecencyWeightedBasket):
     def __init__(self, as_at: date, feature_period: FeaturePeriod) -> None:
         super().__init__(as_at, feature_period, 0.99)
+
+
+class RecencyWeightedApproxBasket90(RecencyWeightedApproxBasket):
+    def __init__(self, as_at: date, feature_period: FeaturePeriod) -> None:
+        super().__init__(as_at, feature_period, 0.9)
+
+
+class RecencyWeightedApproxBasket95(RecencyWeightedApproxBasket):
+    def __init__(self, as_at: date, feature_period: FeaturePeriod) -> None:
+        super().__init__(as_at, feature_period, 0.95)
+
+
+class RecencyWeightedApproxBasket99(RecencyWeightedApproxBasket):
+    def __init__(self, as_at: date, feature_period: FeaturePeriod) -> None:
+        super().__init__(as_at, feature_period, 0.99)
diff --git a/jstark/purchasing_feature_generator.py b/jstark/purchasing_feature_generator.py
@@ -35,6 +35,9 @@
     RecencyWeightedBasket99,
     RecencyWeightedBasket90,
     RecencyWeightedBasket95,
+    RecencyWeightedApproxBasket90,
+    RecencyWeightedApproxBasket95,
+    RecencyWeightedApproxBasket99,
     AverageBasket,
 )
 from jstark.feature_generator import FeatureGenerator
@@ -86,5 +89,8 @@ def __init__(
         RecencyWeightedBasket95,
         RecencyWeightedBasket90,
         RecencyWeightedBasket99,
+        RecencyWeightedApproxBasket95,
+        RecencyWeightedApproxBasket90,
+        RecencyWeightedApproxBasket99,
         AverageBasket,
     ]
diff --git a/tests/test_purchasing_feature_generator.py b/tests/test_purchasing_feature_generator.py
@@ -435,7 +435,7 @@ def get_dataframes_for_perweek_feature_tests(as_at_timestamp, luke_and_leia_purc
         ],
     )
     df = luke_and_leia_purchases.groupBy().agg(*fg.features)
-    df2 = df.select(
+    weeks_df = df.select(
         "BasketCount_0w0",
         "BasketCount_1w1",
         "BasketCount_2w2",
@@ -452,26 +452,28 @@ def get_dataframes_for_perweek_feature_tests(as_at_timestamp, luke_and_leia_purc
         "BasketCount_13w13",
     )
     df_first = df.first()
-    df2_first = df2.first()
+    weeks_df_first = weeks_df.first()
     assert df_first is not None
-    assert df2_first is not None
-    return df_first, df2_first
+    assert weeks_df_first is not None
+    return df_first, weeks_df_first
 
 
 def test_recencyweightedbasketweeks_luke_and_leia(
     as_at_timestamp: datetime, luke_and_leia_purchases: DataFrame
 ):
     """Test RecencyWeightedBasketWeeks
 
-    This test verifies the correct value of RecencyWeightedBasketsWeeksXX by calculating
+    This test verifies the correct value of RecencyWeightedBasketWeeksXX by calculating
     the BasketCount for each individual week, calculating the smoothed value for that ,
     week then summing all those values. This is exactly the same calculation that is
     performed by the feature generator so it might be argued that this test doesn't add
     any value. I don't agree that that is the case however, it helps to demonstrate
     exactly what this feature provides and given that its not an easy one to explain, I
     think that has some value.
+
+    A later addition to this test also verifies RecencyWeightedApproxBasketWeeksXX
     """
-    df_first, df2_first = get_dataframes_for_perweek_feature_tests(
+    df_first, df_weeks_first = get_dataframes_for_perweek_feature_tests(
         as_at_timestamp, luke_and_leia_purchases
     )
     recency_weighted_basket_count_weeks_90 = 0.0
@@ -481,13 +483,13 @@ def test_recencyweightedbasketweeks_luke_and_leia(
         # Loop over all the weeks, calculate the smoothed value for each,
         # then sum them all up
         recency_weighted_basket_count_weeks_90 += float(
-            df2_first[f"BasketCount_{i}w{i}"]
+            df_weeks_first[f"BasketCount_{i}w{i}"]
         ) * pow(0.9, i)
         recency_weighted_basket_count_weeks_95 += float(
-            df2_first[f"BasketCount_{i}w{i}"]
+            df_weeks_first[f"BasketCount_{i}w{i}"]
         ) * pow(0.95, i)
         recency_weighted_basket_count_weeks_99 += float(
-            df2_first[f"BasketCount_{i}w{i}"]
+            df_weeks_first[f"BasketCount_{i}w{i}"]
         ) * pow(0.99, i)
     assert recency_weighted_basket_count_weeks_90 == float(
         df_first["RecencyWeightedBasketWeeks90_13w0"]
@@ -498,16 +500,19 @@ def test_recencyweightedbasketweeks_luke_and_leia(
     assert recency_weighted_basket_count_weeks_99 == float(
         df_first["RecencyWeightedBasketWeeks99_13w0"]
     )
+    assert df_first["RecencyWeightedApproxBasketWeeks90_13w0"] == 3.943520489
+    assert df_first["RecencyWeightedApproxBasketWeeks95_13w0"] == 4.394755659724609
+    assert df_first["RecencyWeightedApproxBasketWeeks99_13w0"] == 4.864113257483641
 
 
 def test_averagebasketsperweek_luke_and_leia(
     as_at_timestamp: datetime, luke_and_leia_purchases: DataFrame
 ):
     """Test AverageBasketsPerWeek"""
-    df_first, df2_first = get_dataframes_for_perweek_feature_tests(
+    df_first, df_weeks_first = get_dataframes_for_perweek_feature_tests(
         as_at_timestamp, luke_and_leia_purchases
     )
     n = 14
-    total_baskets = sum(df2_first[f"BasketCount_{i}w{i}"] for i in range(n))
+    total_baskets = sum(df_weeks_first[f"BasketCount_{i}w{i}"] for i in range(n))
     average_baskets_per_week = total_baskets / n
     assert average_baskets_per_week == df_first["AverageBasketsPerWeek_13w0"]