Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Cleaning spending datasets from US spending #29

Merged
merged 12 commits into from
Oct 23, 2023
107 changes: 107 additions & 0 deletions cities/utils/clean_spending_HHS.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,107 @@
import numpy as np
import pandas as pd

from cities.utils.cleaning_utils import standardize_and_scale
from cities.utils.data_grabber import DataGrabber



def clean_spending_HHS():
    """Clean the raw HHS spending dataset and write long/wide (raw and
    standardized) CSVs to ``../data/processed/``.

    Steps: drop degenerate obligation amounts, repair damaged GeoFIPS codes
    using the alternative-names file and the GDP dataset, keep only FIPS
    common with the GDP data, aggregate duplicates, attach GeoNames, and
    pivot/standardize.
    """
    data = DataGrabber()
    data.get_features_wide(["gdp"])
    gdp = data.wide.get("gdp")

    spending_HHS = pd.read_csv("../data/raw/spending_HHS.csv")

    # Drop rows whose obligated amount is missing or a degenerate 0/1 value.
    # (Direct mask; the original value-based isin() round trip selected the
    # same rows, since the unwanted value set is exactly {NaN, 0, 1}.)
    amount = spending_HHS['total_obligated_amount']
    unwanted_mask = amount.isna() | (amount == 0) | (amount == 1)
    spending_HHS = spending_HHS[~unwanted_mask]  # 95 observations deleted

    assert spending_HHS.isna().sum().sum() == 0, 'Na values detected'

    # Loading alternative names and repairing FIPS codes (4-digit and
    # full-length only; this dataset has no 3-digit FIPS).
    names_HHS = pd.read_csv("../data/raw/spending_HHS_names.csv")

    # FIPS present in spending data but absent from the GDP reference.
    spending_only_fips = np.setdiff1d(spending_HHS['GeoFIPS'], gdp['GeoFIPS'])

    fips4_to_repair = [fip for fip in spending_only_fips if 999 < fip < 10000]
    # .copy(): these are slices of spending_HHS; copying avoids
    # SettingWithCopyWarning when we add columns below.
    short4_fips = spending_HHS[spending_HHS['GeoFIPS'].isin(fips4_to_repair)].copy()

    full_geofips_list = [fip for fip in spending_only_fips if fip > 9999]
    full_geofips = spending_HHS[spending_HHS['GeoFIPS'].isin(full_geofips_list)].copy()

    # Mapping dicts are loop-invariant: build them once, outside the loop.
    geofips_to_geonamealt = dict(zip(names_HHS['GeoFIPS'], names_HHS['GeoNameALT']))
    name_to_good_fips = dict(zip(gdp['GeoName'], gdp['GeoFIPS']))

    # Replacing damaged FIPS: map damaged FIPS -> alt name -> correct FIPS.
    for bad_fips in (full_geofips, short4_fips):
        if bad_fips.empty:
            # Guard: the original divided by shape[0] unconditionally.
            continue

        bad_fips['GeoNameALT'] = bad_fips['GeoFIPS'].map(geofips_to_geonamealt)
        bad_fips = bad_fips.rename(columns={'GeoFIPS': 'damagedFIPS'})
        bad_fips['repairedFIPS'] = bad_fips['GeoNameALT'].map(name_to_good_fips)
        repaired_geofips = bad_fips[bad_fips['repairedFIPS'].notna()]

        repair_ratio = repaired_geofips.shape[0] / bad_fips.shape[0]
        print(f'Ratio of repaired FIPS: {round(repair_ratio, 2)}')

        # NOTE(review): per the original comment, no FIPS were actually
        # repaired for this dataset, so this replace is a no-op in practice.
        spending_HHS['GeoFIPS'] = spending_HHS['GeoFIPS'].replace(
            dict(zip(repaired_geofips['damagedFIPS'], repaired_geofips['repairedFIPS']))
        )

    # Keep only FIPS that also appear in the GDP reference.
    common_fips = np.intersect1d(gdp['GeoFIPS'].unique(),
                                 spending_HHS['GeoFIPS'].unique())

    all_FIPS_spending_HHS = spending_HHS.copy()
    spending_HHS = spending_HHS[spending_HHS["GeoFIPS"].isin(common_fips)]  # 99 FIPS deleted
    assert spending_HHS.shape[0] / all_FIPS_spending_HHS.shape[0] > 0.9, 'Less than 0.9 of FIPS are common!'

    # Grouping duplicate (GeoFIPS, year) rows: repairs can make a FIPS
    # collide with one that is already present, so sum their amounts.
    spending_HHS = (
        spending_HHS.groupby(['GeoFIPS', 'year'])['total_obligated_amount']
        .sum()
        .reset_index()
    )

    # Adding GeoNames from the GDP reference.
    spending_HHS = spending_HHS.merge(gdp[["GeoFIPS", "GeoName"]], on="GeoFIPS", how="left")[
        ["GeoFIPS", "GeoName", "year", "total_obligated_amount"]
    ]

    # Standardizing and saving.
    spending_HHS_long = spending_HHS.copy()

    spending_HHS_wide = spending_HHS.pivot_table(
        index=['GeoFIPS', 'GeoName'], columns='year', values='total_obligated_amount'
    )
    spending_HHS_wide.reset_index(inplace=True)
    spending_HHS_wide.columns.name = None
    spending_HHS_wide = spending_HHS_wide.fillna(0)

    spending_HHS_std_long = standardize_and_scale(spending_HHS)
    spending_HHS_std_wide = standardize_and_scale(spending_HHS_wide)

    spending_HHS_wide.to_csv("../data/processed/spending_HHS_wide.csv", index=False)
    spending_HHS_long.to_csv("../data/processed/spending_HHS_long.csv", index=False)
    spending_HHS_std_wide.to_csv("../data/processed/spending_HHS_std_wide.csv", index=False)
    spending_HHS_std_long.to_csv("../data/processed/spending_HHS_std_long.csv", index=False)


109 changes: 109 additions & 0 deletions cities/utils/clean_spending_commerce.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,109 @@
import numpy as np
import pandas as pd

from cities.utils.cleaning_utils import standardize_and_scale
from cities.utils.data_grabber import DataGrabber



def clean_spending_commerce():
    """Clean the raw Commerce spending dataset and write long/wide (raw and
    standardized) CSVs to ``data/processed/``.

    Steps: drop degenerate obligation amounts, repair damaged GeoFIPS codes
    using the alternative-names file and the GDP dataset, keep only FIPS
    common with the GDP data, aggregate duplicates, attach GeoNames, and
    pivot/standardize.
    """
    data = DataGrabber()
    data.get_features_wide(["gdp"])
    gdp = data.wide.get("gdp")

    # NOTE(review): paths here have no "../" prefix, unlike the sibling
    # HHS/transportation cleaners — confirm which working directory is intended.
    spending_commerce = pd.read_csv("data/raw/spending_commerce.csv")

    # Drop rows whose obligated amount is missing or a degenerate 0/1 value.
    # (Direct mask; the original value-based isin() round trip selected the
    # same rows, since the unwanted value set is exactly {NaN, 0, 1}.)
    amount = spending_commerce['total_obligated_amount']
    unwanted_mask = amount.isna() | (amount == 0) | (amount == 1)
    spending_commerce = spending_commerce[~unwanted_mask]  # 24 values lost

    assert spending_commerce.isna().sum().sum() == 0, 'Na values detected'

    # Loading alternative names and repairing FIPS codes (4-digit and
    # full-length only; this dataset has no short FIPS).
    names_commerce = pd.read_csv("data/raw/spending_commerce_names.csv")

    # FIPS present in spending data but absent from the GDP reference.
    spending_only_fips = np.setdiff1d(spending_commerce['GeoFIPS'], gdp['GeoFIPS'])

    fips4_to_repair = [fip for fip in spending_only_fips if 999 < fip < 10000]
    # .copy(): these are slices of spending_commerce; copying avoids
    # SettingWithCopyWarning when we add columns below.
    short4_fips = spending_commerce[spending_commerce['GeoFIPS'].isin(fips4_to_repair)].copy()

    full_geofips_list = [fip for fip in spending_only_fips if fip > 9999]
    full_geofips = spending_commerce[spending_commerce['GeoFIPS'].isin(full_geofips_list)].copy()

    # Mapping dicts are loop-invariant: build them once, outside the loop.
    geofips_to_geonamealt = dict(zip(names_commerce['GeoFIPS'], names_commerce['GeoNameALT']))
    name_to_good_fips = dict(zip(gdp['GeoName'], gdp['GeoFIPS']))

    # Replacing damaged FIPS: map damaged FIPS -> alt name -> correct FIPS.
    for bad_fips in (full_geofips, short4_fips):
        if bad_fips.empty:
            # Guard: the original divided by shape[0] unconditionally.
            continue

        bad_fips['GeoNameALT'] = bad_fips['GeoFIPS'].map(geofips_to_geonamealt)
        bad_fips = bad_fips.rename(columns={'GeoFIPS': 'damagedFIPS'})
        bad_fips['repairedFIPS'] = bad_fips['GeoNameALT'].map(name_to_good_fips)
        repaired_geofips = bad_fips[bad_fips['repairedFIPS'].notna()]

        repair_ratio = repaired_geofips.shape[0] / bad_fips.shape[0]
        print(f'Ratio of repaired FIPS: {round(repair_ratio, 2)}')

        spending_commerce['GeoFIPS'] = spending_commerce['GeoFIPS'].replace(
            dict(zip(repaired_geofips['damagedFIPS'], repaired_geofips['repairedFIPS']))
        )

    # Keep only FIPS that also appear in the GDP reference.
    common_fips = np.intersect1d(gdp['GeoFIPS'].unique(),
                                 spending_commerce['GeoFIPS'].unique())

    all_FIPS_spending_commerce = spending_commerce.copy()
    spending_commerce = spending_commerce[spending_commerce["GeoFIPS"].isin(common_fips)]  # 67 FIPS deleted
    assert spending_commerce.shape[0] / all_FIPS_spending_commerce.shape[0] > 0.9, 'Less than 0.9 of FIPS are common!'

    # Grouping duplicate (GeoFIPS, year) rows: repairs can make a FIPS
    # collide with one that is already present, so sum their amounts.
    spending_commerce = (
        spending_commerce.groupby(['GeoFIPS', 'year'])['total_obligated_amount']
        .sum()
        .reset_index()
    )

    # Adding GeoNames from the GDP reference.
    spending_commerce = spending_commerce.merge(gdp[["GeoFIPS", "GeoName"]], on="GeoFIPS", how="left")[
        ["GeoFIPS", "GeoName", "year", "total_obligated_amount"]
    ]

    # Standardizing and saving.
    spending_commerce_long = spending_commerce.copy()

    spending_commerce_wide = spending_commerce.pivot_table(
        index=['GeoFIPS', 'GeoName'], columns='year', values='total_obligated_amount'
    )
    spending_commerce_wide.reset_index(inplace=True)
    spending_commerce_wide.columns.name = None
    spending_commerce_wide = spending_commerce_wide.fillna(0)

    spending_commerce_std_long = standardize_and_scale(spending_commerce)
    spending_commerce_std_wide = standardize_and_scale(spending_commerce_wide)

    spending_commerce_wide.to_csv("data/processed/spending_commerce_wide.csv", index=False)
    spending_commerce_long.to_csv("data/processed/spending_commerce_long.csv", index=False)
    spending_commerce_std_wide.to_csv("data/processed/spending_commerce_std_wide.csv", index=False)
    spending_commerce_std_long.to_csv("data/processed/spending_commerce_std_long.csv", index=False)


114 changes: 114 additions & 0 deletions cities/utils/clean_spending_transportation.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,114 @@
import numpy as np
import pandas as pd

from cities.utils.cleaning_utils import standardize_and_scale
from cities.utils.data_grabber import DataGrabber



def clean_spending_transportation():
    """Clean the raw transportation spending dataset and write long/wide
    (raw and standardized) CSVs to ``../data/processed/``.

    Steps: drop degenerate obligation amounts, repair damaged GeoFIPS codes
    (including 1–3 digit ones) using the alternative-names file and the GDP
    dataset, delete unrepairable short FIPS, keep only FIPS common with the
    GDP data, aggregate duplicates, attach GeoNames, and pivot/standardize.
    """
    data = DataGrabber()
    data.get_features_wide(["gdp"])
    gdp = data.wide.get("gdp")

    spending_transportation = pd.read_csv("../data/raw/spending_transportation.csv")

    # Drop rows whose obligated amount is missing or a degenerate 0/1 value.
    # (Direct mask; the original value-based isin() round trip selected the
    # same rows, since the unwanted value set is exactly {NaN, 0, 1}.)
    amount = spending_transportation['total_obligated_amount']
    unwanted_mask = amount.isna() | (amount == 0) | (amount == 1)
    spending_transportation = spending_transportation[~unwanted_mask]  # 66 values removed

    assert spending_transportation.isna().sum().sum() == 0, 'Na values detected'

    # Loading alternative names and repairing FIPS codes, including the
    # 1-3 digit ones unique to this dataset.
    names_transportation = pd.read_csv("../data/raw/spending_transportation_names.csv")

    # .copy(): these are slices of spending_transportation; copying avoids
    # SettingWithCopyWarning when we add columns below.
    short_geofips = spending_transportation[
        spending_transportation['GeoFIPS'].astype(str).str.len().between(1, 3)
    ].copy()

    # FIPS present in spending data but absent from the GDP reference.
    spending_only_fips = np.setdiff1d(spending_transportation['GeoFIPS'], gdp['GeoFIPS'])

    fips4_to_repair = [fip for fip in spending_only_fips if 999 < fip < 10000]
    short4_fips = spending_transportation[
        spending_transportation['GeoFIPS'].isin(fips4_to_repair)
    ].copy()

    full_geofips_list = [fip for fip in spending_only_fips if fip > 9999]
    full_geofips = spending_transportation[
        spending_transportation['GeoFIPS'].isin(full_geofips_list)
    ].copy()

    # Mapping dicts are loop-invariant: build them once, outside the loop.
    geofips_to_geonamealt = dict(zip(names_transportation['GeoFIPS'], names_transportation['GeoNameALT']))
    name_to_good_fips = dict(zip(gdp['GeoName'], gdp['GeoFIPS']))

    # Replacing damaged FIPS: map damaged FIPS -> alt name -> correct FIPS.
    for bad_fips in (full_geofips, short4_fips, short_geofips):
        if bad_fips.empty:
            # Guard: the original divided by shape[0] unconditionally.
            continue

        bad_fips['GeoNameALT'] = bad_fips['GeoFIPS'].map(geofips_to_geonamealt)
        bad_fips = bad_fips.rename(columns={'GeoFIPS': 'damagedFIPS'})
        bad_fips['repairedFIPS'] = bad_fips['GeoNameALT'].map(name_to_good_fips)
        repaired_geofips = bad_fips[bad_fips['repairedFIPS'].notna()]

        repair_ratio = repaired_geofips.shape[0] / bad_fips.shape[0]
        print(f'Ratio of repaired FIPS: {round(repair_ratio, 2)}')

        spending_transportation['GeoFIPS'] = spending_transportation['GeoFIPS'].replace(
            dict(zip(repaired_geofips['damagedFIPS'], repaired_geofips['repairedFIPS']))
        )

    # Deleting short FIPS codes that could not be repaired.
    count_short_geofips = spending_transportation[spending_transportation['GeoFIPS'] <= 999]['GeoFIPS'].count()
    assert count_short_geofips / spending_transportation.shape[0] < 0.05, 'More than 0.05 of FIPS are short and will be deleted!'

    spending_transportation = spending_transportation[spending_transportation['GeoFIPS'] > 999]

    # Keep only FIPS that also appear in the GDP reference.
    common_fips = np.intersect1d(gdp['GeoFIPS'].unique(),
                                 spending_transportation['GeoFIPS'].unique())

    all_FIPS_spending_transportation = spending_transportation.copy()
    spending_transportation = spending_transportation[
        spending_transportation["GeoFIPS"].isin(common_fips)
    ]  # 0.96 of FIPS are common
    assert spending_transportation.shape[0] / all_FIPS_spending_transportation.shape[0] > 0.9, 'Less than 0.9 of FIPS are common!'

    # Grouping duplicate (GeoFIPS, year) rows: repairs can make a FIPS
    # collide with one that is already present, so sum their amounts.
    spending_transportation = (
        spending_transportation.groupby(['GeoFIPS', 'year'])['total_obligated_amount']
        .sum()
        .reset_index()
    )

    # Adding GeoNames from the GDP reference.
    spending_transportation = spending_transportation.merge(gdp[["GeoFIPS", "GeoName"]], on="GeoFIPS", how="left")[
        ["GeoFIPS", "GeoName", "year", "total_obligated_amount"]
    ]

    # Standardizing and saving.
    spending_transportation_long = spending_transportation.copy()

    spending_transportation_wide = spending_transportation.pivot_table(
        index=['GeoFIPS', 'GeoName'], columns='year', values='total_obligated_amount'
    )
    spending_transportation_wide.reset_index(inplace=True)
    spending_transportation_wide.columns.name = None
    spending_transportation_wide = spending_transportation_wide.fillna(0)

    spending_transportation_std_long = standardize_and_scale(spending_transportation_long)
    spending_transportation_std_wide = standardize_and_scale(spending_transportation_wide)

    spending_transportation_wide.to_csv("../data/processed/spending_transportation_wide.csv", index=False)
    spending_transportation_long.to_csv("../data/processed/spending_transportation_long.csv", index=False)
    spending_transportation_std_wide.to_csv("../data/processed/spending_transportation_std_wide.csv", index=False)
    spending_transportation_std_long.to_csv("../data/processed/spending_transportation_std_long.csv", index=False)


2 changes: 1 addition & 1 deletion cities/utils/clean_transport.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,7 @@ def clean_transport():
transport = transport[transport["GeoFIPS"].isin(common_fips)]

assert len(common_fips) == len(transport["GeoFIPS"].unique())
assert len(transport) > 2800, "The number of records is lower than 3000"
Niklewa marked this conversation as resolved.
Show resolved Hide resolved
assert len(transport) > 2800, "The number of records is lower than 2800"

# adding geoname column
transport = transport.merge(gdp[["GeoFIPS", "GeoName"]], on="GeoFIPS", how="left")[
Expand Down
9 changes: 9 additions & 0 deletions cities/utils/cleaning_pipeline.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,18 @@
from cities.utils.clean_gdp import clean_gdp
from cities.utils.clean_population import clean_population
from cities.utils.clean_transport import clean_transport
from cities.utils.clean_spending_transportation import clean_spending_transportation
from cities.utils.clean_spending_commerce import clean_spending_commerce
from cities.utils.clean_spending_HHS import clean_spending_HHS

# Run each cleaning step in order. GDP runs first; the spending cleaners
# read the gdp feature through DataGrabber, so order matters.
_CLEANING_STEPS = (
    clean_gdp,
    clean_population,
    clean_transport,
    clean_spending_transportation,
    clean_spending_commerce,
    clean_spending_HHS,
)

for _step in _CLEANING_STEPS:
    _step()
Loading