Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Adding the poverty dataset #75

Merged
merged 15 commits
Nov 16, 2023
42 changes: 32 additions & 10 deletions cities/utils/clean_variable.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,4 @@
import os
import pickle
from pathlib import Path

import numpy as np
Expand Down Expand Up @@ -28,22 +27,45 @@ def clean_variable(variable_name, path_to_raw_csv, YearOrCategory="Year"):
# drop nans
variable_db = variable_db.dropna()

# check if there are any counties that are missing from unempl but in gdp
# if so, add them to exclusions, and re-run gdp with new exclusions
# Check if there are any counties that are missing from variable_db but in exclusions_df
# If so, add them to exclusions, and re-run variable_db with new exclusions

if len(np.setdiff1d(gdp["GeoFIPS"].unique(), variable_db["GeoFIPS"].unique())) > 0:
# add new exclusions

new_exclusions = np.setdiff1d(
gdp["GeoFIPS"].unique(), variable_db["GeoFIPS"].unique()
)
print("Adding new exclusions to exclusions.pkl: " + str(new_exclusions))

print("Adding new exclusions to exclusions.csv: " + str(new_exclusions))

# open exclusions file
with open("../data/raw/exclusions.pkl", "rb") as file:
exclusions = pickle.load(file)
exclusions["transport"] = np.append(exclusions["transport"], new_exclusions)
exclusions["transport"] = np.unique(exclusions["transport"])
with open("../data/raw/exclusions.pkl", "wb") as file:
pickle.dump(exclusions, file)

exclusions = pd.read_csv(os.path.join(path, "../../data/raw/exclusions.csv"))

new_rows = pd.DataFrame(
{
"dataset": [variable_name] * len(new_exclusions),
"exclusions": new_exclusions,
}
)

# Concatenate the new rows to the existing DataFrame
exclusions = pd.concat([exclusions, new_rows], ignore_index=True)

# Remove duplicates
exclusions = exclusions.drop_duplicates()

exclusions = exclusions.sort_values(by=["dataset", "exclusions"]).reset_index(
drop=True
)

exclusions.to_csv(
os.path.join(path, "../../data/raw/exclusions.csv"), index=False
)

print("Rerunning gdp cleaning with new exclusions")

# rerun gdp cleaning
clean_gdp()
clean_variable(variable_name, path_to_raw_csv)
Expand Down
3 changes: 3 additions & 0 deletions cities/utils/cleaning_pipeline.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,9 @@
from cities.utils.clean_transport import clean_transport
from cities.utils.clean_unemployment import clean_unemployment
from cities.utils.clean_urbanization import clean_urbanization
from cities.utils.cleaning_poverty import clean_poverty

clean_poverty()

clean_unemployment()

Expand Down
22 changes: 22 additions & 0 deletions cities/utils/cleaning_poverty.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
import os
from pathlib import Path

from cities.utils.clean_variable import clean_variable

path = Path(__file__).parent.absolute()  # absolute directory of this module; raw-data paths are resolved relative to it


# Names of the poverty/income datasets to clean. Each entry is expected to
# have a matching raw file ``data/raw/<name>_wide.csv`` (see clean_poverty).
poverty_variables = [
    "povertyAll",
    "povertyAllprct",
    "povertyUnder18",
    "povertyUnder18prct",
    "medianHouseholdIncome",
]


def clean_poverty():
    """Run the shared cleaning step over every poverty/income dataset.

    For each name in ``poverty_variables``, locates the raw wide-format CSV
    under ``data/raw`` (relative to this module) and hands it to
    ``clean_variable``. Side effects (reading/writing files) happen inside
    ``clean_variable``; nothing is returned.
    """
    # Hoist the invariant part of the path; only the dataset name varies.
    raw_csv_template = os.path.join(path, "../../data/raw/{}_wide.csv")
    for variable in poverty_variables:
        clean_variable(variable, raw_csv_template.format(variable))
Empty file added data/model_guides/.gitkeep
Empty file.
Binary file not shown.
Loading