
Commit

Merge pull request #29 from BasisResearch/nl-add-transportation-spendings

Cleaning spending datasets from US spending data
rfl-urbaniak authored Oct 23, 2023
2 parents af5a62c + d0c3922 commit 40808fa
Showing 44 changed files with 377,105 additions and 162,612 deletions.
4 changes: 2 additions & 2 deletions cities/utils/clean_population.py
@@ -7,8 +7,8 @@

 def clean_population():
     data = DataGrabber()
-    data.get_gdp_wide()
-    gdp = data.gdp_wide
+    data.get_features_wide(["gdp"])
+    gdp = data.wide["gdp"]

     cainc30 = pd.read_csv("../data/raw/CAINC30_1969_2021.csv", encoding="ISO-8859-1")

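For context, this change swaps the per-feature getter (get_gdp_wide) for a batched interface. A minimal sketch of the new access pattern, assuming get_features_wide populates a wide dict keyed by feature name, which is what the diff suggests; any feature name beyond "gdp" would be an assumption:

from cities.utils.data_grabber import DataGrabber

# Fetch one or more features in a single call; each result is stored
# in the `wide` dict under its feature name (pattern inferred from this diff).
data = DataGrabber()
data.get_features_wide(["gdp"])
gdp = data.wide["gdp"]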
133 changes: 133 additions & 0 deletions cities/utils/clean_spending_HHS.py
@@ -0,0 +1,133 @@
import numpy as np
import pandas as pd

from cities.utils.cleaning_utils import standardize_and_scale
from cities.utils.data_grabber import DataGrabber


def clean_spending_HHS():
    data = DataGrabber()
    data.get_features_wide(["gdp"])
    gdp = data.wide["gdp"]

    spending_HHS = pd.read_csv("../data/raw/spending_HHS.csv")

    # dropping rows whose obligation amount is missing, 0, or 1
    transportUnwanted = spending_HHS[
        (
            pd.isna(spending_HHS["total_obligated_amount"])
            | (spending_HHS["total_obligated_amount"] == 1)
            | (spending_HHS["total_obligated_amount"] == 0)
        )
    ]

    exclude_mask = spending_HHS["total_obligated_amount"].isin(
        transportUnwanted["total_obligated_amount"]
    )
    spending_HHS = spending_HHS[~exclude_mask]  # 95 observations deleted

    assert spending_HHS.isna().sum().sum() == 0, "NaN values detected"

    # loading alternative names and repairing shortened FIPS codes

    names_HHS = pd.read_csv("../data/raw/spending_HHS_names.csv")

    spending_only_fips = np.setdiff1d(spending_HHS["GeoFIPS"], gdp["GeoFIPS"])

    fips4_to_repair = [fip for fip in spending_only_fips if (fip < 10000 and fip > 999)]
    # .copy() so the column assignments below do not modify views of spending_HHS
    short4_fips = spending_HHS[spending_HHS["GeoFIPS"].isin(fips4_to_repair)].copy()

    full_geofipsLIST = [fip for fip in spending_only_fips if fip > 9999]
    full_geofips = spending_HHS[spending_HHS["GeoFIPS"].isin(full_geofipsLIST)].copy()

    cleaningLIST = [full_geofips, short4_fips]  # no 3-digit FIPS present

    # replacing damaged FIPS: code -> alternative name -> canonical code

    for badFIPS in cleaningLIST:
        geofips_to_geonamealt = dict(zip(names_HHS["GeoFIPS"], names_HHS["GeoNameALT"]))

        badFIPS["GeoNameALT"] = badFIPS["GeoFIPS"].map(geofips_to_geonamealt)
        badFIPS = badFIPS.rename(columns={"GeoFIPS": "damagedFIPS"})

        badFIPSmapping_dict = dict(zip(gdp["GeoName"], gdp["GeoFIPS"]))

        badFIPS["repairedFIPS"] = badFIPS["GeoNameALT"].apply(
            lambda x: badFIPSmapping_dict.get(x)
        )
        repaired_geofips = badFIPS[badFIPS["repairedFIPS"].notna()]

        repair_ratio = repaired_geofips.shape[0] / badFIPS.shape[0]
        print(f"Ratio of repaired FIPS: {round(repair_ratio, 2)}")

        # assert repair_ratio > 0.9, "Less than 0.9 of FIPS were successfully repaired!"

        # (in this dataset no FIPS ended up being repaired)
        spending_HHS["GeoFIPS"] = spending_HHS["GeoFIPS"].replace(
            dict(zip(repaired_geofips["damagedFIPS"], repaired_geofips["repairedFIPS"]))
        )

    common_fips = np.intersect1d(
        gdp["GeoFIPS"].unique(), spending_HHS["GeoFIPS"].unique()
    )

    all_FIPS_spending_HHS = spending_HHS.copy()

    spending_HHS = spending_HHS[
        spending_HHS["GeoFIPS"].isin(common_fips)
    ]  # 99 FIPS deleted
    assert (
        spending_HHS.shape[0] / all_FIPS_spending_HHS.shape[0] > 0.9
    ), "Less than 0.9 of FIPS are common!"

    # summing duplicate (GeoFIPS, year) rows
    # (duplicates appear when a repaired FIPS collides with one already present)

    spending_HHS = (
        spending_HHS.groupby(["GeoFIPS", "year"])["total_obligated_amount"]
        .sum()
        .reset_index()
    )
    spending_HHS.reset_index(drop=True, inplace=True)

    # adding GeoNames
    spending_HHS = spending_HHS.merge(
        gdp[["GeoFIPS", "GeoName"]], on="GeoFIPS", how="left"
    )[["GeoFIPS", "GeoName", "year", "total_obligated_amount"]]

    # padding with zero-spending rows so every GDP county is represented
    unique_gdp = gdp[["GeoFIPS", "GeoName"]].drop_duplicates(
        subset=["GeoFIPS", "GeoName"], keep="first"
    )
    exclude_geofips = set(spending_HHS["GeoFIPS"])
    unique_gdp = unique_gdp[~unique_gdp["GeoFIPS"].isin(exclude_geofips)]

    unique_gdp["year"] = np.repeat(2018, unique_gdp.shape[0])
    unique_gdp["total_obligated_amount"] = np.repeat(0, unique_gdp.shape[0])
    spending_HHS = pd.concat([spending_HHS, unique_gdp], ignore_index=True)
    spending_HHS = spending_HHS.sort_values(by=["GeoFIPS", "GeoName", "year"])

    assert spending_HHS["GeoFIPS"].nunique() == spending_HHS["GeoName"].nunique()
    assert spending_HHS["GeoFIPS"].nunique() == gdp["GeoFIPS"].nunique()

    # standardizing and saving
    spending_HHS_long = spending_HHS.copy()

    spending_HHS_wide = spending_HHS.pivot_table(
        index=["GeoFIPS", "GeoName"], columns="year", values="total_obligated_amount"
    )
    spending_HHS_wide.reset_index(inplace=True)
    spending_HHS_wide.columns.name = None
    spending_HHS_wide = spending_HHS_wide.fillna(0)

    spending_HHS_std_long = standardize_and_scale(spending_HHS)
    spending_HHS_std_wide = standardize_and_scale(spending_HHS_wide)

    spending_HHS_wide.to_csv("../data/processed/spending_HHS_wide.csv", index=False)
    spending_HHS_long.to_csv("../data/processed/spending_HHS_long.csv", index=False)
    spending_HHS_std_wide.to_csv(
        "../data/processed/spending_HHS_std_wide.csv", index=False
    )
    spending_HHS_std_long.to_csv(
        "../data/processed/spending_HHS_std_long.csv", index=False
    )
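The FIPS repair loop above is the subtlest step: a damaged code is mapped to its alternative name, and the name is then mapped back to the canonical code from the GDP table. A self-contained sketch of the same two mappings, with made-up codes and a made-up county name (nothing below comes from the actual data):

import pandas as pd

# Illustrative inputs: a damaged 4-digit code (1234) whose canonical
# form in the GDP table is 41234. All values here are hypothetical.
names = pd.DataFrame({"GeoFIPS": [1234], "GeoNameALT": ["Sample County, XX"]})
gdp = pd.DataFrame({"GeoFIPS": [41234], "GeoName": ["Sample County, XX"]})
damaged = pd.DataFrame({"GeoFIPS": [1234], "total_obligated_amount": [1000.0]})

# Step 1: damaged code -> alternative name.
damaged["GeoNameALT"] = damaged["GeoFIPS"].map(
    dict(zip(names["GeoFIPS"], names["GeoNameALT"]))
)
# Step 2: alternative name -> canonical code.
damaged["repairedFIPS"] = damaged["GeoNameALT"].map(
    dict(zip(gdp["GeoName"], gdp["GeoFIPS"]))
)
print(damaged[["GeoFIPS", "repairedFIPS"]])  # 1234 repaired to 41234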
143 changes: 143 additions & 0 deletions cities/utils/clean_spending_commerce.py
@@ -0,0 +1,143 @@
import numpy as np
import pandas as pd

from cities.utils.cleaning_utils import standardize_and_scale
from cities.utils.data_grabber import DataGrabber


def clean_spending_commerce():
    data = DataGrabber()
    data.get_features_wide(["gdp"])
    gdp = data.wide["gdp"]

    spending_commerce = pd.read_csv("../data/raw/spending_commerce.csv")

    # dropping rows whose obligation amount is missing, 0, or 1
    transportUnwanted = spending_commerce[
        (
            pd.isna(spending_commerce["total_obligated_amount"])
            | (spending_commerce["total_obligated_amount"] == 1)
            | (spending_commerce["total_obligated_amount"] == 0)
        )
    ]

    exclude_mask = spending_commerce["total_obligated_amount"].isin(
        transportUnwanted["total_obligated_amount"]
    )
    spending_commerce = spending_commerce[~exclude_mask]  # 24 values lost

    assert spending_commerce.isna().sum().sum() == 0, "NaN values detected"

    # loading alternative names and repairing shortened FIPS codes

    names_commerce = pd.read_csv("../data/raw/spending_commerce_names.csv")

    spending_only_fips = np.setdiff1d(spending_commerce["GeoFIPS"], gdp["GeoFIPS"])

    fips4_to_repair = [fip for fip in spending_only_fips if (fip < 10000 and fip > 999)]
    # .copy() so the column assignments below do not modify views of spending_commerce
    short4_fips = spending_commerce[
        spending_commerce["GeoFIPS"].isin(fips4_to_repair)
    ].copy()

    full_geofipsLIST = [fip for fip in spending_only_fips if fip > 9999]
    full_geofips = spending_commerce[
        spending_commerce["GeoFIPS"].isin(full_geofipsLIST)
    ].copy()

    cleaningLIST = [full_geofips, short4_fips]  # no 3-digit FIPS present

    # replacing damaged FIPS: code -> alternative name -> canonical code

    for badFIPS in cleaningLIST:
        geofips_to_geonamealt = dict(
            zip(names_commerce["GeoFIPS"], names_commerce["GeoNameALT"])
        )

        badFIPS["GeoNameALT"] = badFIPS["GeoFIPS"].map(geofips_to_geonamealt)
        badFIPS = badFIPS.rename(columns={"GeoFIPS": "damagedFIPS"})

        badFIPSmapping_dict = dict(zip(gdp["GeoName"], gdp["GeoFIPS"]))

        badFIPS["repairedFIPS"] = badFIPS["GeoNameALT"].apply(
            lambda x: badFIPSmapping_dict.get(x)
        )
        repaired_geofips = badFIPS[badFIPS["repairedFIPS"].notna()]

        repair_ratio = repaired_geofips.shape[0] / badFIPS.shape[0]
        print(f"Ratio of repaired FIPS: {round(repair_ratio, 2)}")

        # assert repair_ratio > 0.9, "Less than 0.9 of FIPS were successfully repaired!"

        spending_commerce["GeoFIPS"] = spending_commerce["GeoFIPS"].replace(
            dict(zip(repaired_geofips["damagedFIPS"], repaired_geofips["repairedFIPS"]))
        )

    # dropping FIPS not present in the GDP data (including unrepaired short codes)

    common_fips = np.intersect1d(
        gdp["GeoFIPS"].unique(), spending_commerce["GeoFIPS"].unique()
    )

    all_FIPS_spending_commerce = spending_commerce.copy()

    spending_commerce = spending_commerce[
        spending_commerce["GeoFIPS"].isin(common_fips)
    ]  # 67 FIPS deleted
    assert (
        spending_commerce.shape[0] / all_FIPS_spending_commerce.shape[0] > 0.9
    ), "Less than 0.9 of FIPS are common!"

    # summing duplicate (GeoFIPS, year) rows
    # (duplicates appear when a repaired FIPS collides with one already present)

    spending_commerce = (
        spending_commerce.groupby(["GeoFIPS", "year"])["total_obligated_amount"]
        .sum()
        .reset_index()
    )
    spending_commerce.reset_index(drop=True, inplace=True)

    # adding GeoNames
    spending_commerce = spending_commerce.merge(
        gdp[["GeoFIPS", "GeoName"]], on="GeoFIPS", how="left"
    )[["GeoFIPS", "GeoName", "year", "total_obligated_amount"]]

    # padding with zero-spending rows so every GDP county is represented
    unique_gdp = gdp[["GeoFIPS", "GeoName"]].drop_duplicates(
        subset=["GeoFIPS", "GeoName"], keep="first"
    )
    exclude_geofips = set(spending_commerce["GeoFIPS"])
    unique_gdp = unique_gdp[~unique_gdp["GeoFIPS"].isin(exclude_geofips)]

    unique_gdp["year"] = np.repeat(2018, unique_gdp.shape[0])
    unique_gdp["total_obligated_amount"] = np.repeat(0, unique_gdp.shape[0])
    spending_commerce = pd.concat([spending_commerce, unique_gdp], ignore_index=True)
    spending_commerce = spending_commerce.sort_values(by=["GeoFIPS", "GeoName", "year"])

    assert (
        spending_commerce["GeoFIPS"].nunique() == spending_commerce["GeoName"].nunique()
    )
    assert spending_commerce["GeoFIPS"].nunique() == gdp["GeoFIPS"].nunique()

    # standardizing and saving
    spending_commerce_long = spending_commerce.copy()

    spending_commerce_wide = spending_commerce.pivot_table(
        index=["GeoFIPS", "GeoName"], columns="year", values="total_obligated_amount"
    )
    spending_commerce_wide.reset_index(inplace=True)
    spending_commerce_wide.columns.name = None
    spending_commerce_wide = spending_commerce_wide.fillna(0)

    spending_commerce_std_long = standardize_and_scale(spending_commerce)
    spending_commerce_std_wide = standardize_and_scale(spending_commerce_wide)

    spending_commerce_wide.to_csv(
        "../data/processed/spending_commerce_wide.csv", index=False
    )
    spending_commerce_long.to_csv(
        "../data/processed/spending_commerce_long.csv", index=False
    )
    spending_commerce_std_wide.to_csv(
        "../data/processed/spending_commerce_std_wide.csv", index=False
    )
    spending_commerce_std_long.to_csv(
        "../data/processed/spending_commerce_std_long.csv", index=False
    )
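Both cleaners read from ../data/raw and write to ../data/processed via relative paths, so they must be launched from a directory where those paths resolve. A hypothetical runner, not part of this commit (the entry-point file itself is an assumption; only the module paths and function names come from the diff headers above):

# Hypothetical runner script, not part of this commit.
# Assumes a working directory from which "../data/..." resolves.
from cities.utils.clean_spending_HHS import clean_spending_HHS
from cities.utils.clean_spending_commerce import clean_spending_commerce

if __name__ == "__main__":
    clean_spending_HHS()
    clean_spending_commerce()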