From 9b91397993d70b4d01c676cecbda45815646150d Mon Sep 17 00:00:00 2001 From: Duc Minh La Date: Tue, 23 Jul 2024 00:08:37 +1000 Subject: [PATCH] quick process to handle --- PopSynthesis/DataProcessor/DataProcessor.py | 5 +- .../utils/seed/pp/process_relationships.py | 70 ++++++++----------- 2 files changed, 34 insertions(+), 41 deletions(-) diff --git a/PopSynthesis/DataProcessor/DataProcessor.py b/PopSynthesis/DataProcessor/DataProcessor.py index 8fa01aa..5a08cd2 100644 --- a/PopSynthesis/DataProcessor/DataProcessor.py +++ b/PopSynthesis/DataProcessor/DataProcessor.py @@ -24,7 +24,7 @@ convert_hh_dwell, convert_hh_inc, ) -from PopSynthesis.DataProcessor.utils.seed.pp.process_relationships import process_rela +from PopSynthesis.DataProcessor.utils.seed.pp.process_relationships import process_rela, process_not_accept_values from PopSynthesis.DataProcessor.utils.seed.pp.process_main_others import process_main_other from PopSynthesis.DataProcessor.utils.seed.pp.convert_age import convert_pp_age_gr, get_main_max_age from PopSynthesis.DataProcessor.utils.seed.pp.convert_inc import add_converted_inc @@ -61,7 +61,8 @@ def process_persons_seed(self): pp_file = find_file(base_path=self.raw_data_path, filename=pp_seed_file) raw_hh_seed = pl.read_csv(pp_file) pp_df = raw_hh_seed[PP_ATTS] - pp_df = process_rela(pp_df) + pp_df = process_not_accept_values(pp_df) + # pp_df = process_rela(pp_df) # print(pp_df) # pp_df = get_main_max_age(pp_df) # pp_df = convert_pp_age_gr(pp_df) diff --git a/PopSynthesis/DataProcessor/utils/seed/pp/process_relationships.py b/PopSynthesis/DataProcessor/utils/seed/pp/process_relationships.py index 6b1d33c..756d04f 100644 --- a/PopSynthesis/DataProcessor/utils/seed/pp/process_relationships.py +++ b/PopSynthesis/DataProcessor/utils/seed/pp/process_relationships.py @@ -13,47 +13,39 @@ def check_rela_gb(gb_df): elif check_dict["Self"] > 1: print("NOOOOOOOOOO", hhid, rela_gr) + +def process_not_accept_values(pp_df): + # Remove not accept value + # At the moment we remove Null and Missing for income + pp_df = pp_df.drop_nulls() + pp_df_missing = pp_df.filter(pl.col("persinc")=="Missing/Refused") + to_rm_hhid = list(pp_df_missing["hhid"].unique()) + pp_df = pp_df.filter(~pl.col("hhid").is_in(to_rm_hhid)) + return pp_df + + +def convert_simple_income(income_str): + if "Negative" in income_str: + return -1 + elif "Missing" in income_str: + # This should not happen as we will filter no income + return -2 + elif "Zero" in income_str: + return 0 + elif "-" in income_str: + return int(income_str.split("-")[0].replace("$", "")) + elif "+" in income_str: + return 2000 + else: + raise ValueError("Weird") + def process_rela(pp_df: pl.DataFrame): - # We will have 4 groups: spouse, child, grandchild and others + # To handle relationship, generally we based on income, age and gender # First we need to make sure each HH has 1 Self - gb_df = pp_df.groupby("hhid").agg(pl.col("relationship")) - # check_rela_gb(gb_df) - print(gb_df.filter(pl.col("hhid")=="Y16H2080218")) - - # There are various cases, requires some manual works - # In order of replacement: 1 person, 2 spouses, 1 spouse, no spouse then pick the oldest - # Thus we have 2 way of replacement: oldest (apply for 1 person and others) and spouse - ls_to_replace = [] - for hhid, rela_gr in gb_df.rows(): - check_dict = defaultdict(lambda: 0) - for i in rela_gr: - check_dict[i] += 1 - if check_dict["Self"] == 0: - # There are actual cases of missing the Self person - replace_method = "oldest" if check_dict["Spouse"] == 0 else "spouse" - ls_to_replace.append((hhid, replace_method)) - - # start to replace to fix errors of no Self - for hhid, replace_method in ls_to_replace: - sub_df = pp_df[pp_df["hhid"] == hhid] - idx_to_replace = None - if replace_method == "spouse": - sub_sub_df = sub_df[sub_df["relationship"] == "Spouse"] - idx_to_replace = sub_sub_df.index[0] - elif replace_method == "oldest": - idx_to_replace = sub_df["age"].idxmax() - assert idx_to_replace is not None - pp_df.at[idx_to_replace, "relationship"] = "Self" - - # # check again - # gb_df_2 = pp_df.groupby("hhid")["relationship"].apply(lambda x: list(x)) - # check_rela_gb(gb_df_2) # Should print nothing - - # # replace values in columns - # pp_df.loc[ - # ~pp_df["relationship"].isin(LS_GR_RELA), "relationship" - # ] = HANDLE_THE_REST_RELA - # # print(pp_df["relationship"].unique()) + income_col = pl.col("persinc") + pp_df.with_columns(pl.when) + gb_df_rela_list = pp_df.groupby("hhid").agg(pl.col("relationship")) + # First replace the first person to be Main, there should be no Self left # return pp_df