diff --git a/PopSynthesis/DataProcessor/DataProcessor.py b/PopSynthesis/DataProcessor/DataProcessor.py index 286c845..8fa01aa 100644 --- a/PopSynthesis/DataProcessor/DataProcessor.py +++ b/PopSynthesis/DataProcessor/DataProcessor.py @@ -13,6 +13,7 @@ ) from PopSynthesis.DataProcessor.utils.const_process import ( HH_ATTS, + PP_ATTS, LS_GR_RELA, LS_HH_INC, ) @@ -23,6 +24,10 @@ convert_hh_dwell, convert_hh_inc, ) +from PopSynthesis.DataProcessor.utils.seed.pp.process_relationships import process_rela +from PopSynthesis.DataProcessor.utils.seed.pp.process_main_others import process_main_other +from PopSynthesis.DataProcessor.utils.seed.pp.convert_age import convert_pp_age_gr, get_main_max_age +from PopSynthesis.DataProcessor.utils.seed.pp.convert_inc import add_converted_inc import polars as pl @@ -38,7 +43,8 @@ def __init__( self.output_data_path = Path(output_data_src) def process_all_seed(self): - NotImplemented + hh_df = self.process_households_seed() + pp_df = self.process_persons_seed() def process_households_seed(self): # Import the hh seed data @@ -52,7 +58,15 @@ def process_households_seed(self): return hh_df def process_persons_seed(self): - NotImplemented + pp_file = find_file(base_path=self.raw_data_path, filename=pp_seed_file) + raw_hh_seed = pl.read_csv(pp_file) + pp_df = raw_hh_seed[PP_ATTS] + pp_df = process_rela(pp_df) + # print(pp_df) + # pp_df = get_main_max_age(pp_df) + # pp_df = convert_pp_age_gr(pp_df) + # pp_df = add_converted_inc(pp_df) + def process_all_census(self): NotImplemented @@ -66,4 +80,4 @@ def process_persons_census(self): if __name__ == "__main__": a = DataProcessorGeneric(raw_data_dir, processed_data_dir, output_dir) - a.process_households_seed() + a.process_persons_seed() diff --git a/PopSynthesis/DataProcessor/utils/seed/pp/process_relationships.py b/PopSynthesis/DataProcessor/utils/seed/pp/process_relationships.py index a305a98..6b1d33c 100644 --- a/PopSynthesis/DataProcessor/utils/seed/pp/process_relationships.py +++ b/PopSynthesis/DataProcessor/utils/seed/pp/process_relationships.py @@ -1,6 +1,6 @@ from collections import defaultdict -from PopSynthesis.Methods.connect_HH_PP.scripts.const import * - +from PopSynthesis.Methods.connect_HH_PP.scripts.const import LS_GR_RELA, HANDLE_THE_REST_RELA +import polars as pl def check_rela_gb(gb_df): for hhid, rela_gr in zip(gb_df.index, gb_df): @@ -14,26 +14,27 @@ def check_rela_gb(gb_df): print("NOOOOOOOOOO", hhid, rela_gr) -def process_rela(pp_df): +def process_rela(pp_df: pl.DataFrame): # We will have 4 groups: spouse, child, grandchild and others # First we need to make sure each HH has 1 Self - - gb_df = pp_df.groupby("hhid")["relationship"].apply(lambda x: list(x)) + gb_df = pp_df.groupby("hhid").agg(pl.col("relationship")) # check_rela_gb(gb_df) + print(gb_df.filter(pl.col("hhid")=="Y16H2080218")) # There are various cases, requires some manual works # In order of replacement: 1 person, 2 spouses, 1 spouse, no spouse then pick the oldest # Thus we have 2 way of replacement: oldest (apply for 1 person and others) and spouse ls_to_replace = [] - for hhid, rela_gr in zip(gb_df.index, gb_df): + for hhid, rela_gr in gb_df.rows(): check_dict = defaultdict(lambda: 0) for i in rela_gr: check_dict[i] += 1 if check_dict["Self"] == 0: + # There are actual cases of missing the Self person replace_method = "oldest" if check_dict["Spouse"] == 0 else "spouse" ls_to_replace.append((hhid, replace_method)) - # start to replace to fix errors + # start to replace to fix errors of no Self for hhid, replace_method in ls_to_replace: sub_df = pp_df[pp_df["hhid"] == hhid] idx_to_replace = None @@ -45,14 +46,14 @@ def process_rela(pp_df): assert idx_to_replace is not None pp_df.at[idx_to_replace, "relationship"] = "Self" - # check again - gb_df_2 = pp_df.groupby("hhid")["relationship"].apply(lambda x: list(x)) - check_rela_gb(gb_df_2) # Should print nothing + # # check again + # gb_df_2 = pp_df.groupby("hhid")["relationship"].apply(lambda x: list(x)) + # check_rela_gb(gb_df_2) # Should print nothing - # replace values in columns - pp_df.loc[ - ~pp_df["relationship"].isin(LS_GR_RELA), "relationship" - ] = HANDLE_THE_REST_RELA - # print(pp_df["relationship"].unique()) + # # replace values in columns + # pp_df.loc[ + # ~pp_df["relationship"].isin(LS_GR_RELA), "relationship" + # ] = HANDLE_THE_REST_RELA + # # print(pp_df["relationship"].unique()) - return pp_df + # return pp_df diff --git a/PopSynthesis/Methods/connect_HH_PP/scripts/01_process_data.py b/PopSynthesis/Methods/connect_HH_PP/scripts/01_process_data.py index 9b6ff76..1f3b6e9 100644 --- a/PopSynthesis/Methods/connect_HH_PP/scripts/01_process_data.py +++ b/PopSynthesis/Methods/connect_HH_PP/scripts/01_process_data.py @@ -8,13 +8,6 @@ from PopSynthesis.Methods.connect_HH_PP.paras_dir import data_dir, processed_data from PopSynthesis.Methods.connect_HH_PP.scripts.const import * -def convert_all_hh_atts(hh_df, pp_df): - hh_df = adding_pp_related_atts(hh_df, pp_df) - hh_df = convert_hh_totvehs(hh_df) - hh_df = convert_hh_inc(hh_df, check_states=LS_HH_INC) - hh_df = convert_hh_dwell(hh_df) - hh_df = convert_hh_size(hh_df) - return hh_df def main():