Skip to content

Commit

Permalink
quick process to handle
Browse files Browse the repository at this point in the history
  • Loading branch information
bobkatla committed Jul 22, 2024
1 parent 10c37de commit 9b91397
Show file tree
Hide file tree
Showing 2 changed files with 34 additions and 41 deletions.
5 changes: 3 additions & 2 deletions PopSynthesis/DataProcessor/DataProcessor.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@
convert_hh_dwell,
convert_hh_inc,
)
from PopSynthesis.DataProcessor.utils.seed.pp.process_relationships import process_rela
from PopSynthesis.DataProcessor.utils.seed.pp.process_relationships import process_rela, process_not_accept_values
from PopSynthesis.DataProcessor.utils.seed.pp.process_main_others import process_main_other
from PopSynthesis.DataProcessor.utils.seed.pp.convert_age import convert_pp_age_gr, get_main_max_age
from PopSynthesis.DataProcessor.utils.seed.pp.convert_inc import add_converted_inc
Expand Down Expand Up @@ -61,7 +61,8 @@ def process_persons_seed(self):
pp_file = find_file(base_path=self.raw_data_path, filename=pp_seed_file)
raw_hh_seed = pl.read_csv(pp_file)
pp_df = raw_hh_seed[PP_ATTS]
pp_df = process_rela(pp_df)
pp_df = process_not_accept_values(pp_df)
# pp_df = process_rela(pp_df)
# print(pp_df)
# pp_df = get_main_max_age(pp_df)
# pp_df = convert_pp_age_gr(pp_df)
Expand Down
70 changes: 31 additions & 39 deletions PopSynthesis/DataProcessor/utils/seed/pp/process_relationships.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,47 +13,39 @@ def check_rela_gb(gb_df):
elif check_dict["Self"] > 1:
print("NOOOOOOOOOO", hhid, rela_gr)


def process_not_accept_values(pp_df):
    """Remove person records holding values that are not accepted.

    Two filters are applied in order:

    1. Every row containing a null is dropped.
    2. Every household (``hhid``) that still contains at least one person
       whose ``persinc`` equals ``"Missing/Refused"`` is removed entirely.

    Returns the filtered frame; the input frame is not modified in place.
    """
    without_nulls = pp_df.drop_nulls()

    # Households are removed as a whole when any member refused/misses income.
    refused_income = pl.col("persinc") == "Missing/Refused"
    bad_households = list(without_nulls.filter(refused_income)["hhid"].unique())

    keep_household = ~pl.col("hhid").is_in(bad_households)
    return without_nulls.filter(keep_household)


def convert_simple_income(income_str):
    """Convert an income-bracket label into a single integer.

    The label is matched by substring, in this order:

    * contains ``"Negative"`` -> ``-1``
    * contains ``"Missing"``  -> ``-2``
    * contains ``"Zero"``     -> ``0``
    * contains ``"-"``        -> lower bound of the range, e.g. ``"$200-$299"``
      gives ``200``
    * contains ``"+"``        -> ``2000`` (open-ended top bracket)

    Args:
        income_str: The income label to convert.

    Returns:
        The integer code or lower bound described above.

    Raises:
        ValueError: If the label matches none of the patterns, or the lower
            bound of a range cannot be parsed as an integer.
    """
    # "Negative" must be tested before "-" so it is never treated as a range.
    if "Negative" in income_str:
        return -1
    if "Missing" in income_str:
        # Per the original note, this should not occur because records
        # without income are expected to be filtered out beforehand.
        return -2
    if "Zero" in income_str:
        return 0
    if "-" in income_str:
        lower_bound = income_str.split("-")[0]
        # Strip currency formatting; thousands separators would otherwise
        # make int() fail on labels such as "$1,000-$1,249".
        return int(lower_bound.replace("$", "").replace(",", ""))
    if "+" in income_str:
        return 2000
    raise ValueError(f"Unrecognised income value: {income_str!r}")


def process_rela(pp_df: pl.DataFrame):
# Work-in-progress routine that inspects/repairs the "relationship" column
# per household ("hhid").
# NOTE(review): this text comes from a rendered commit diff in which
# indentation and +/- markers were lost; removed and added lines appear to be
# interleaved (pandas-style calls such as .index / .idxmax / .at sit next to
# Polars expressions). Confirm against the repository before relying on it.
# We will have 4 groups: spouse, child, grandchild and others
# To handle relationship, generally we based on income, age and gender
# First we need to make sure each HH has 1 Self
gb_df = pp_df.groupby("hhid").agg(pl.col("relationship"))
# check_rela_gb(gb_df)
# Debug output: prints the grouped relationships of one hard-coded household.
print(gb_df.filter(pl.col("hhid")=="Y16H2080218"))

# There are various cases, requires some manual works
# In order of replacement: 1 person, 2 spouses, 1 spouse, no spouse then pick the oldest
# Thus we have 2 way of replacement: oldest (apply for 1 person and others) and spouse
ls_to_replace = []
for hhid, rela_gr in gb_df.rows():
# Count how many times each relationship label occurs in this household.
check_dict = defaultdict(lambda: 0)
for i in rela_gr:
check_dict[i] += 1
if check_dict["Self"] == 0:
# There are actual cases of missing the Self person
replace_method = "oldest" if check_dict["Spouse"] == 0 else "spouse"
ls_to_replace.append((hhid, replace_method))

# start to replace to fix errors of no Self
# NOTE(review): the loop below uses boolean-mask indexing, .index, .idxmax()
# and .at[...] — presumably left over from a pandas implementation; verify
# whether these lines still exist in the committed file.
for hhid, replace_method in ls_to_replace:
sub_df = pp_df[pp_df["hhid"] == hhid]
idx_to_replace = None
if replace_method == "spouse":
sub_sub_df = sub_df[sub_df["relationship"] == "Spouse"]
idx_to_replace = sub_sub_df.index[0]
elif replace_method == "oldest":
idx_to_replace = sub_df["age"].idxmax()
assert idx_to_replace is not None
pp_df.at[idx_to_replace, "relationship"] = "Self"

# # check again
# gb_df_2 = pp_df.groupby("hhid")["relationship"].apply(lambda x: list(x))
# check_rela_gb(gb_df_2) # Should print nothing

# # replace values in columns
# pp_df.loc[
# ~pp_df["relationship"].isin(LS_GR_RELA), "relationship"
# ] = HANDLE_THE_REST_RELA
# # print(pp_df["relationship"].unique())
# NOTE(review): income_col is assigned but not used in the visible lines.
income_col = pl.col("persinc")
# NOTE(review): the result of with_columns is not assigned and pl.when is
# passed without being called — looks like an unfinished placeholder.
pp_df.with_columns(pl.when)
gb_df_rela_list = pp_df.groupby("hhid").agg(pl.col("relationship"))
# First replace the first person to be Main, there should be no Self left

# NOTE(review): the return is commented out, so as shown nothing is returned
# explicitly.
# return pp_df

0 comments on commit 9b91397

Please sign in to comment.