From 058d85c1d46e2d87c9cbdd798e1333d60411e385 Mon Sep 17 00:00:00 2001
From: Mamadou S Diallo <me@msdiallo.io>
Date: Sun, 5 May 2024 19:46:08 -0400
Subject: [PATCH] adding raking

---
 src/samplics/weighting/adjustment.py | 34 +++++++++++++++-------
 tests/weighting/test_adjustment.py   | 43 ++++++++++++++++++++++++++--
 2 files changed, 64 insertions(+), 13 deletions(-)

diff --git a/src/samplics/weighting/adjustment.py b/src/samplics/weighting/adjustment.py
index 1696a59a..8dba703e 100644
--- a/src/samplics/weighting/adjustment.py
+++ b/src/samplics/weighting/adjustment.py
@@ -358,6 +358,7 @@ def poststratify(
             raise AssertionError("control or factor must be specified.")
 
         if isinstance(control, dict):
+            # breakpoint()
             if (np.unique(domain) != np.unique(list(control.keys()))).any():
                 raise ValueError("control dictionary keys do not match domain values.")
 
@@ -396,6 +397,7 @@ def rake(
         ll_bound: Optional[Union[DictStrNum, Number]] = None,
         up_bound: Optional[Union[DictStrNum, Number]] = None,
         tol: float = 1e-4,
+        ctrl_tol: float = 1e-4,
         max_iter: int = 100,
         display_iter: bool = False,
     ) -> np.ndarray:
@@ -413,49 +415,58 @@ def rake(
                 print(f"\nIteration {iter + 1}")
 
             if iter == 0:
+                rk_wgt = samp_weight
                 wgt_prev = samp_weight
 
             for margin in margins:
                 domain = formats.numpy_array(margins[margin])
                 if control is not None:
-                    wgt = self.poststratify(samp_weight=wgt_prev, control=control[margin], domain=domain)
+                    rk_wgt = self.poststratify(samp_weight=rk_wgt, control=control[margin], domain=domain)
                 elif factor is not None:
-                    wgt = self.poststratify(samp_weight=wgt_prev, factor=factor[margin], domain=domain)
+                    rk_wgt = self.poststratify(samp_weight=rk_wgt, factor=factor[margin], domain=domain)
                 else:
                     raise AssertionError("control or factor must be specified!")
 
-                wgt_prev = wgt
-
             sum_wgt = {}
+            sum_prev_wgt = {}
             for margin in margins:
                 domain = formats.numpy_array(margins[margin])
                 sum_wgt_domain = {}
+                sum_prev_wgt_domain = {}
                 for d in control[margin]:
-                    sum_wgt_domain[d] = np.sum(wgt[domain == d])
+                    sum_wgt_domain[d] = np.sum(rk_wgt[domain == d])
+                    sum_prev_wgt_domain[d] = np.sum(wgt_prev[domain == d])
                 sum_wgt[margin] = sum_wgt_domain
+                sum_prev_wgt[margin] = sum_prev_wgt_domain
 
             # diff = {}
             max_diff = 0
+            max_ctrl_diff = 0
             for margin in margins:
                 if display_iter:
                     print(f"  Margin: {margin}")
 
                 diff_margin = {}
+                diff_ctrl_margin = {}
                 for d in control[margin]:
-                    diff_margin[d] = np.abs(control[margin][d] - sum_wgt[margin][d])
+                    diff_margin[d] = np.abs(sum_wgt[margin][d] - sum_prev_wgt[margin][d]) / sum_prev_wgt[margin][d]
+                    diff_ctrl_margin[d] = np.abs(sum_wgt[margin][d] - control[margin][d]) / control[margin][d]
                     if display_iter:
-                        print(f"    Difference for '{d}': {diff_margin[d]}")
+                        print(f"    Difference against previous value for '{d}': {diff_margin[d]}")
+                        print(f"    Difference against control value for '{d}': {diff_ctrl_margin[d]}")
 
                 # diff[margin] = diff_margin
                 max_diff = max(max_diff, max(diff_margin.values()))
+                max_ctrl_diff = max(max_ctrl_diff, max(diff_ctrl_margin.values()))
 
             obs_tol = max_diff
+            obs_ctrl_tol = max_ctrl_diff
 
-            if obs_tol <= tol:
+            if obs_tol <= tol and obs_ctrl_tol <= ctrl_tol:
                 converged = True
 
                 if ll_bound is not None or up_bound is not None:
-                    wgt_ratios = wgt / samp_weight
+                    wgt_ratios = rk_wgt / samp_weight
                     min_ratio = np.min(wgt_ratios)
                     max_ratio = np.max(wgt_ratios)
 
@@ -474,9 +485,12 @@ def rake(
                 else:
                     bounded = True
 
+            wgt_prev = rk_wgt
             iter += 1
 
-        return wgt
+        self.adj_method = "raking"
+
+        return rk_wgt
 
     @staticmethod
     def _calib_covariates(
diff --git a/tests/weighting/test_adjustment.py b/tests/weighting/test_adjustment.py
index 53087c36..3886fcd5 100644
--- a/tests/weighting/test_adjustment.py
+++ b/tests/weighting/test_adjustment.py
@@ -4,6 +4,40 @@
 from samplics.weighting import SampleWeight
 
 
+# stata example
+
+# nhis_sam = pl.read_csv("~/Downloads/nhis_sam.csv").with_columns(
+#     pl.when(pl.col("hisp") == 4).then(pl.lit(3)).otherwise(pl.col("hisp")).alias("hisp")
+# )
+
+# age_grp = {
+#     "<18": 5991,
+#     "18-24": 2014,
+#     "25-44": 6124,
+#     "45-64": 5011,
+#     "65+": 2448,
+# }
+# hisp_race = {1: 5031, 2: 12637, 3: 3920}
+# control = {"age_grp": age_grp, "hisp": hisp_race}
+
+# # breakpoint()
+
+# ll = 0.8
+# ul = 1.2
+
+# margins = {
+#     "age_grp": nhis_sam["age_grp"].to_list(),
+#     "hisp": nhis_sam["hisp"].to_list(),
+# }
+
+# nhis_sam_rk = SampleWeight()
+
+# nhis_sam = nhis_sam.with_columns(
+#     rake_wt_2=nhis_sam_rk.rake(
+#         samp_weight=nhis_sam["wt"], control=control, margins=margins, display_iter=True, tol=1e-6
+#     )
+# ).with_columns(diff=pl.col("rake_wt_2") - pl.col("rake_wt"))
+
 # synthetic data for testing
 
 wgt = np.random.uniform(0, 1, 1000)
@@ -275,9 +309,12 @@ def test_ps_wgt_with_class():
 
 sample_wgt_rk_not_bound = SampleWeight()
 
-rk_wgt_not_bound = sample_wgt_rk_not_bound.rake(
-    samp_weight=income_sample2["design_wgt"], control=control, margins=margins,  display_iter=True
-)
+# rk_wgt_not_bound = sample_wgt_rk_not_bound.rake(
+#     samp_weight=income_sample2["design_wgt"], control=control, margins=margins, display_iter=True, tol=1e-4
+# )
+
+
+
 # breakpoint()
 
 # age_grp = {"<18": 21588, age}