From 9b91397993d70b4d01c676cecbda45815646150d Mon Sep 17 00:00:00 2001
From: Duc Minh La <dminh.bob.la@gmail.com>
Date: Tue, 23 Jul 2024 00:08:37 +1000
Subject: [PATCH] quick process to handle

---
 PopSynthesis/DataProcessor/DataProcessor.py   |  5 +-
 .../utils/seed/pp/process_relationships.py    | 70 ++++++++-----------
 2 files changed, 34 insertions(+), 41 deletions(-)

diff --git a/PopSynthesis/DataProcessor/DataProcessor.py b/PopSynthesis/DataProcessor/DataProcessor.py
index 8fa01aa..5a08cd2 100644
--- a/PopSynthesis/DataProcessor/DataProcessor.py
+++ b/PopSynthesis/DataProcessor/DataProcessor.py
@@ -24,7 +24,7 @@
     convert_hh_dwell,
     convert_hh_inc,
 )
-from PopSynthesis.DataProcessor.utils.seed.pp.process_relationships import process_rela
+from PopSynthesis.DataProcessor.utils.seed.pp.process_relationships import process_rela, process_not_accept_values
 from PopSynthesis.DataProcessor.utils.seed.pp.process_main_others import process_main_other
 from PopSynthesis.DataProcessor.utils.seed.pp.convert_age import convert_pp_age_gr, get_main_max_age
 from PopSynthesis.DataProcessor.utils.seed.pp.convert_inc import add_converted_inc
@@ -61,7 +61,8 @@ def process_persons_seed(self):
         pp_file = find_file(base_path=self.raw_data_path, filename=pp_seed_file)
         raw_hh_seed = pl.read_csv(pp_file)
         pp_df = raw_hh_seed[PP_ATTS]
-        pp_df = process_rela(pp_df)
+        pp_df = process_not_accept_values(pp_df)
+        # pp_df = process_rela(pp_df)
         # print(pp_df)
         # pp_df = get_main_max_age(pp_df)
         # pp_df = convert_pp_age_gr(pp_df)
diff --git a/PopSynthesis/DataProcessor/utils/seed/pp/process_relationships.py b/PopSynthesis/DataProcessor/utils/seed/pp/process_relationships.py
index 6b1d33c..756d04f 100644
--- a/PopSynthesis/DataProcessor/utils/seed/pp/process_relationships.py
+++ b/PopSynthesis/DataProcessor/utils/seed/pp/process_relationships.py
@@ -13,47 +13,39 @@ def check_rela_gb(gb_df):
         elif check_dict["Self"] > 1:
             print("NOOOOOOOOOO", hhid, rela_gr)
 
+    
+def process_not_accept_values(pp_df):
+    # Remove records with unacceptable values
+    # At the moment we drop rows containing nulls and whole households with a Missing/Refused income
+    pp_df = pp_df.drop_nulls()
+    pp_df_missing = pp_df.filter(pl.col("persinc")=="Missing/Refused")
+    to_rm_hhid = list(pp_df_missing["hhid"].unique())
+    pp_df = pp_df.filter(~pl.col("hhid").is_in(to_rm_hhid))
+    return pp_df
+
+
+def convert_simple_income(income_str):
+    if "Negative" in income_str:
+        return -1
+    elif "Missing" in income_str:
+        # This should not happen, as missing incomes are expected to be filtered out beforehand
+        return -2
+    elif "Zero" in income_str:
+        return 0
+    elif "-" in income_str:
+        return int(income_str.split("-")[0].replace("$", ""))
+    elif "+" in income_str:
+        return 2000
+    else:
+        raise ValueError("Weird")
+
 
 def process_rela(pp_df: pl.DataFrame):
-    # We will have 4 groups: spouse, child, grandchild and others
+    # To handle relationships, we generally rely on income, age and gender
     # First we need to make sure each HH has 1 Self
-    gb_df = pp_df.groupby("hhid").agg(pl.col("relationship"))
-    # check_rela_gb(gb_df)
-    print(gb_df.filter(pl.col("hhid")=="Y16H2080218"))
-
-    # There are various cases, requires some manual works
-    # In order of replacement: 1 person, 2 spouses, 1 spouse, no spouse then pick the oldest
-    # Thus we have 2 way of replacement: oldest (apply for 1 person and others) and spouse
-    ls_to_replace = []
-    for hhid, rela_gr in gb_df.rows():
-        check_dict = defaultdict(lambda: 0)
-        for i in rela_gr:
-            check_dict[i] += 1
-        if check_dict["Self"] == 0:
-            # There are actual cases of missing the Self person
-            replace_method = "oldest" if check_dict["Spouse"] == 0 else "spouse"
-            ls_to_replace.append((hhid, replace_method))
-
-    # start to replace to fix errors of no Self
-    for hhid, replace_method in ls_to_replace:
-        sub_df = pp_df[pp_df["hhid"] == hhid]
-        idx_to_replace = None
-        if replace_method == "spouse":
-            sub_sub_df = sub_df[sub_df["relationship"] == "Spouse"]
-            idx_to_replace = sub_sub_df.index[0]
-        elif replace_method == "oldest":
-            idx_to_replace = sub_df["age"].idxmax()
-        assert idx_to_replace is not None
-        pp_df.at[idx_to_replace, "relationship"] = "Self"
-
-    # # check again
-    # gb_df_2 = pp_df.groupby("hhid")["relationship"].apply(lambda x: list(x))
-    # check_rela_gb(gb_df_2)  # Should print nothing
-
-    # # replace values in columns
-    # pp_df.loc[
-    #     ~pp_df["relationship"].isin(LS_GR_RELA), "relationship"
-    # ] = HANDLE_THE_REST_RELA
-    # # print(pp_df["relationship"].unique())
+    income_col = pl.col("persinc")
+    pp_df.with_columns(pl.when)
+    gb_df_rela_list = pp_df.groupby("hhid").agg(pl.col("relationship"))
+    # First, set the first person to be Main; there should be no Self left
 
     # return pp_df