Skip to content

Commit

Permalink
middle processing the pp
Browse files Browse the repository at this point in the history
  • Loading branch information
bobkatla committed Jul 15, 2024
1 parent 9e26c54 commit 9dc8f97
Show file tree
Hide file tree
Showing 3 changed files with 34 additions and 26 deletions.
20 changes: 17 additions & 3 deletions PopSynthesis/DataProcessor/DataProcessor.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
)
from PopSynthesis.DataProcessor.utils.const_process import (
HH_ATTS,
PP_ATTS,
LS_GR_RELA,
LS_HH_INC,
)
Expand All @@ -23,6 +24,10 @@
convert_hh_dwell,
convert_hh_inc,
)
from PopSynthesis.DataProcessor.utils.seed.pp.process_relationships import process_rela
from PopSynthesis.DataProcessor.utils.seed.pp.process_main_others import process_main_other
from PopSynthesis.DataProcessor.utils.seed.pp.convert_age import convert_pp_age_gr, get_main_max_age
from PopSynthesis.DataProcessor.utils.seed.pp.convert_inc import add_converted_inc
import polars as pl


Expand All @@ -38,7 +43,8 @@ def __init__(
self.output_data_path = Path(output_data_src)

def process_all_seed(self):
NotImplemented
hh_df = self.process_households_seed()
pp_df = self.process_persons_seed()

def process_households_seed(self):
# Import the hh seed data
Expand All @@ -52,7 +58,15 @@ def process_households_seed(self):
return hh_df

def process_persons_seed(self):
NotImplemented
pp_file = find_file(base_path=self.raw_data_path, filename=pp_seed_file)
raw_hh_seed = pl.read_csv(pp_file)
pp_df = raw_hh_seed[PP_ATTS]
pp_df = process_rela(pp_df)
# print(pp_df)
# pp_df = get_main_max_age(pp_df)
# pp_df = convert_pp_age_gr(pp_df)
# pp_df = add_converted_inc(pp_df)


def process_all_census(self):
NotImplemented
Expand All @@ -66,4 +80,4 @@ def process_persons_census(self):

if __name__ == "__main__":
a = DataProcessorGeneric(raw_data_dir, processed_data_dir, output_dir)
a.process_households_seed()
a.process_persons_seed()
33 changes: 17 additions & 16 deletions PopSynthesis/DataProcessor/utils/seed/pp/process_relationships.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
from collections import defaultdict
from PopSynthesis.Methods.connect_HH_PP.scripts.const import *

from PopSynthesis.Methods.connect_HH_PP.scripts.const import LS_GR_RELA, HANDLE_THE_REST_RELA
import polars as pl

def check_rela_gb(gb_df):
for hhid, rela_gr in zip(gb_df.index, gb_df):
Expand All @@ -14,26 +14,27 @@ def check_rela_gb(gb_df):
print("NOOOOOOOOOO", hhid, rela_gr)


def process_rela(pp_df):
def process_rela(pp_df: pl.DataFrame):
# We will have 4 groups: spouse, child, grandchild and others
# First we need to make sure each HH has 1 Self

gb_df = pp_df.groupby("hhid")["relationship"].apply(lambda x: list(x))
gb_df = pp_df.groupby("hhid").agg(pl.col("relationship"))
# check_rela_gb(gb_df)
print(gb_df.filter(pl.col("hhid")=="Y16H2080218"))

# There are various cases, so this requires some manual work
# In order of replacement: 1 person, 2 spouses, 1 spouse, no spouse then pick the oldest
# Thus we have 2 ways of replacement: oldest (applies to 1 person and others) and spouse
ls_to_replace = []
for hhid, rela_gr in zip(gb_df.index, gb_df):
for hhid, rela_gr in gb_df.rows():
check_dict = defaultdict(lambda: 0)
for i in rela_gr:
check_dict[i] += 1
if check_dict["Self"] == 0:
# There are actual cases of missing the Self person
replace_method = "oldest" if check_dict["Spouse"] == 0 else "spouse"
ls_to_replace.append((hhid, replace_method))

# start to replace to fix errors
# start to replace to fix errors of no Self
for hhid, replace_method in ls_to_replace:
sub_df = pp_df[pp_df["hhid"] == hhid]
idx_to_replace = None
Expand All @@ -45,14 +46,14 @@ def process_rela(pp_df):
assert idx_to_replace is not None
pp_df.at[idx_to_replace, "relationship"] = "Self"

# check again
gb_df_2 = pp_df.groupby("hhid")["relationship"].apply(lambda x: list(x))
check_rela_gb(gb_df_2) # Should print nothing
# # check again
# gb_df_2 = pp_df.groupby("hhid")["relationship"].apply(lambda x: list(x))
# check_rela_gb(gb_df_2) # Should print nothing

# replace values in columns
pp_df.loc[
~pp_df["relationship"].isin(LS_GR_RELA), "relationship"
] = HANDLE_THE_REST_RELA
# print(pp_df["relationship"].unique())
# # replace values in columns
# pp_df.loc[
# ~pp_df["relationship"].isin(LS_GR_RELA), "relationship"
# ] = HANDLE_THE_REST_RELA
# # print(pp_df["relationship"].unique())

return pp_df
# return pp_df
7 changes: 0 additions & 7 deletions PopSynthesis/Methods/connect_HH_PP/scripts/01_process_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,13 +8,6 @@
from PopSynthesis.Methods.connect_HH_PP.paras_dir import data_dir, processed_data
from PopSynthesis.Methods.connect_HH_PP.scripts.const import *

def convert_all_hh_atts(hh_df, pp_df):
hh_df = adding_pp_related_atts(hh_df, pp_df)
hh_df = convert_hh_totvehs(hh_df)
hh_df = convert_hh_inc(hh_df, check_states=LS_HH_INC)
hh_df = convert_hh_dwell(hh_df)
hh_df = convert_hh_size(hh_df)
return hh_df


def main():
Expand Down

0 comments on commit 9dc8f97

Please sign in to comment.