
Commit

Merge pull request #5 from BasisResearch/ru-add-gdp
Added gdp data and the minimal working version
riadas authored Oct 13, 2023
2 parents 30720c5 + 0cb588d commit 428e04d
Showing 33 changed files with 143,289 additions and 1 deletion.
2 changes: 1 addition & 1 deletion .github/workflows/python-app.yml
@@ -36,4 +36,4 @@ jobs:
        flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics
    - name: Test with pytest
      run: |
-       python -m pytest tests/
+       cd tests && python -m pytest && cd ..
19 changes: 19 additions & 0 deletions .gitignore
@@ -1 +1,20 @@
venv







*.pyc
**/*.pyc
tests/__pycache__/
tests/__pycache__/test_data_grabber.cpython-310-pytest-7.2.0.pyc
tests/__pycache__/test_cleaning_utils.cpython-310-pytest-7.2.0.pyc
.vscode/settings.json
cities/utils/__pycache__/__init__.cpython-310.pyc
cities/utils/__pycache__/__init__.cpython-310.pyc
cities/utils/__pycache__/cleaning_utils.cpython-310.pyc
cities/utils/__pycache__/data_grabber.cpython-310.pyc
tests/__pycache__/test_cleaning_utils.cpython-310-pytest-7.2.0.pyc
tests/__pycache__/test_data_grabber.cpython-310-pytest-7.2.0.pyc
6 changes: 6 additions & 0 deletions .vscode/settings.json
@@ -0,0 +1,6 @@
{
    "[python]": {
        "editor.defaultFormatter": "ms-python.black-formatter"
    },
    "python.formatting.provider": "none"
}
Binary file added cities/__pycache__/__init__.cpython-310.pyc
Binary file not shown.
4 changes: 4 additions & 0 deletions cities/utils/__init__.py
@@ -0,0 +1,4 @@
from .cleaning_utils import standardize_and_scale, find_repo_root
from .data_grabber import DataGrabber
from .fips_query import FipsQuery
from .similarity_utils import slice_with_lag
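
These re-exports make the package's public helpers importable from the package root. A one-line sketch (assuming the repo root is on sys.path):

from cities.utils import DataGrabber, FipsQuery, slice_with_lag, standardize_and_scale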
Binary file added cities/utils/__pycache__/__init__.cpython-310.pyc
Binary file not shown.
Binary file not shown.
Binary file not shown.
11 changes: 11 additions & 0 deletions cities/utils/cleaning_pipeline.py
@@ -0,0 +1,11 @@
import sys

from cleaning_utils import find_repo_root

# make the repo root importable before pulling in the cleaning step
sys.path.insert(0, find_repo_root())

from cleaning_utils import clean_gdp

clean_gdp()
124 changes: 124 additions & 0 deletions cities/utils/cleaning_utils.py
@@ -0,0 +1,124 @@
import os

import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler


def find_repo_root():
    """
    Finds the repo root (the folder containing .gitignore) and returns its path.
    """
    current_dir = os.getcwd()
    while True:
        marker_file_path = os.path.join(current_dir, '.gitignore')
        if os.path.isfile(marker_file_path):
            return current_dir

        parent_dir = os.path.dirname(current_dir)
        if parent_dir == current_dir:
            break
        current_dir = parent_dir
    return current_dir


def standardize_and_scale(data: pd.DataFrame) -> pd.DataFrame:
    """
    Standardizes and scales float columns in a DataFrame to [-1,1], copying other columns. Returns a new DataFrame.
    """
    standard_scaler = StandardScaler()

    new_data = pd.DataFrame()
    for column in data.columns:
        if data.dtypes[column] != 'float64':
            new_data[column] = data[column].copy()
        else:
            new = data[column].copy().values.reshape(-1, 1)
            new = standard_scaler.fit_transform(new)

            # rescale the standardized values to [-1, 1]: positives map to [0, 1],
            # negatives to [-1, 0) (note: each sign needs at least two distinct
            # values, otherwise the min-max denominator below is zero)
            positive_mask = new >= 0
            negative_mask = new < 0

            min_positive = np.min(new[positive_mask])
            max_positive = np.max(new[positive_mask])
            scaled_positive = (new[positive_mask] - min_positive) / (max_positive - min_positive)

            min_negative = np.min(new[negative_mask])
            max_negative = np.max(new[negative_mask])
            scaled_negative = (new[negative_mask] - min_negative) / (max_negative - min_negative) - 1

            scaled_values = np.empty_like(new, dtype=float)
            scaled_values[positive_mask] = scaled_positive
            scaled_values[negative_mask] = scaled_negative

            new_data[column] = scaled_values.reshape(-1)

    return new_data


def clean_gdp():
    gdp = pd.read_csv("data/raw/CAGDP1_2001_2021.csv", encoding='ISO-8859-1')

    gdp = gdp.loc[:9533]  # drop the notes at the bottom of the file

    gdp['GeoFIPS'] = gdp['GeoFIPS'].fillna('').astype(str)
    gdp['GeoFIPS'] = gdp['GeoFIPS'].str.strip(' "').astype(int)

    # remove large regions (GeoFIPS ending in 00 are state/US aggregates)
    gdp = gdp[gdp['GeoFIPS'] % 100 != 0]

    # focus on chain-type GDP
    mask = gdp['Description'].str.startswith('Chain')
    gdp = gdp[mask]

    # drop the Region, TableName, LineCode, IndustryClassification, Description,
    # and Unit columns (IndustryClassification is empty anyway)
    gdp = gdp.drop(gdp.columns[2:8], axis=1)

    # drop 2012: it is the base year of the chain-type index, so it is 100 throughout
    gdp = gdp.drop('2012', axis=1)

    gdp.replace('(NA)', np.nan, inplace=True)
    gdp.replace('(NM)', np.nan, inplace=True)

    # nan_rows = gdp[gdp.isna().any(axis=1)]  # if inspection is needed

    gdp.dropna(axis=0, inplace=True)

    for column in gdp.columns[2:]:
        gdp[column] = gdp[column].astype(float)

    assert gdp['GeoName'].is_unique

    for column in gdp.columns[2:]:
        assert (gdp[column] > 0).all(), f"Negative values in {column}"
        assert (gdp[column].isna().sum() == 0), f"Missing values in {column}"
        assert (gdp[column].isnull().sum() == 0), f"Null values in {column}"
        assert (gdp[column] < 3000).all(), f"Values suspiciously large in {column}"

    # TODO_Nikodem investigate strange large values

    gdp_wide = gdp.copy()
    gdp_long = pd.melt(gdp.copy(), id_vars=['GeoFIPS', 'GeoName'],
                       var_name='Year',
                       value_name='Value')

    gdp_std_wide = standardize_and_scale(gdp)
    gdp_std_long = pd.melt(gdp_std_wide.copy(), id_vars=['GeoFIPS', 'GeoName'],
                           var_name='Year',
                           value_name='Value')

    gdp_wide.to_csv("data/processed/gdp_wide.csv", index=False)
    gdp_long.to_csv("data/processed/gdp_long.csv", index=False)
    gdp_std_wide.to_csv("data/processed/gdp_std_wide.csv", index=False)
    gdp_std_long.to_csv("data/processed/gdp_std_long.csv", index=False)
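
To make the scaling used by clean_gdp concrete, here is a minimal usage sketch of standardize_and_scale; the toy column names and values are illustrative only (each sign needs at least two values for the rescaling to be defined):

import pandas as pd

from cities.utils.cleaning_utils import standardize_and_scale

# toy frame: one metadata column and one float feature column
toy = pd.DataFrame({
    'GeoName': ['A', 'B', 'C', 'D'],
    '2001': [1.0, 2.0, 8.0, 9.0],
})

scaled = standardize_and_scale(toy)
print(scaled['GeoName'].tolist())  # unchanged: ['A', 'B', 'C', 'D']
print(scaled['2001'].tolist())     # approximately [-1.0, 0.0, 0.0, 1.0]

Non-float columns pass through untouched; each float column is standardized, then its positive part is mapped to [0, 1] and its negative part to [-1, 0).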


33 changes: 33 additions & 0 deletions cities/utils/data_grabber.py
@@ -0,0 +1,33 @@
import os
import sys

import pandas as pd

parent_dir = os.path.abspath(os.path.join(os.getcwd(), os.pardir))
sys.path.insert(0, parent_dir)

from cities.utils.cleaning_utils import find_repo_root


class DataGrabber:
    def __init__(self):
        self.repo_root = find_repo_root()
        sys.path.insert(0, self.repo_root)

    def get_gdp_wide(self):
        file_path = os.path.join(self.repo_root, "data/processed/gdp_wide.csv")
        self.gdp_wide = pd.read_csv(file_path)

    def get_gdp_std_wide(self):
        file_path = os.path.join(self.repo_root, "data/processed/gdp_std_wide.csv")
        self.gdp_std_wide = pd.read_csv(file_path)

    def get_gdp_long(self):
        file_path = os.path.join(self.repo_root, "data/processed/gdp_long.csv")
        self.gdp_long = pd.read_csv(file_path)

    def get_gdp_std_long(self):
        file_path = os.path.join(self.repo_root, "data/processed/gdp_std_long.csv")
        self.gdp_std_long = pd.read_csv(file_path)
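
A minimal usage sketch for DataGrabber, assuming the processed CSVs written by clean_gdp already exist under data/processed/. Each getter loads one CSV and stores it as an attribute rather than returning it:

from cities.utils.data_grabber import DataGrabber

data = DataGrabber()
data.get_gdp_std_wide()  # loads data/processed/gdp_std_wide.csv
data.get_gdp_long()      # loads data/processed/gdp_long.csv

print(data.gdp_std_wide.shape)         # wide: one row per location, one column per year
print(data.gdp_long.columns.tolist())  # ['GeoFIPS', 'GeoName', 'Year', 'Value']

Keeping the frames on the instance avoids re-reading the files across queries, at the cost of implicit state: callers must invoke the matching getter before touching the attribute.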
151 changes: 151 additions & 0 deletions cities/utils/fips_query.py
@@ -0,0 +1,151 @@
import os
import sys

parent_dir = os.path.abspath(os.path.join(os.getcwd(), os.pardir))
sys.path.insert(0, parent_dir)

import numpy as np
import pandas as pd
import plotly.graph_objects as go
from scipy.spatial import distance

from cities.utils.data_grabber import DataGrabber
from cities.utils.similarity_utils import slice_with_lag


class FipsQuery:

    def __init__(self, fips, outcome_var="gdp", feature_groups=[], weights=None, lag=0, top=5):

        # TODO add weights rescaling to init
        # TODO with a non-trivial example of feature groups

        assert outcome_var in ["gdp"], "outcome_var must be one of ['gdp']"  # TODO expand to other outcome vars

        self.data = DataGrabber()
        self.repo_root = self.data.repo_root
        self.fips = fips
        self.lag = lag
        self.top = top
        self.outcome_var = outcome_var
        self.weights = weights

        self.data.get_gdp_std_wide()
        self.name = self.data.gdp_std_wide['GeoName'][self.data.gdp_std_wide['GeoFIPS'] == self.fips].values[0]

        assert self.lag >= 0 and self.lag < 6 and isinstance(self.lag, int), "lag must be an integer between 0 and 5"
        assert (self.top > 0 and isinstance(self.top, int) and
                self.top < self.data.gdp_std_wide.shape[0]), (
            "top must be a positive integer smaller than the number of locations in the dataset"
        )

    def find_euclidean_kins(self):  # TODO_Nikodem add a test for this function

        if self.outcome_var == "gdp":
            slices = slice_with_lag(self.data.gdp_std_wide, self.fips, self.lag)

            self.my_array = np.array(slices['my_array'])
            self.other_arrays = np.array(slices['other_arrays'])
            self.other_df = slices['other_df']

        # TODO add other features here
        # TODO will need to have the same fips codes in the same order in all other datasets
        # TODO for the feature addition to be introduced here to work smoothly

        distances = []
        for vector in self.other_arrays:
            distances.append(distance.euclidean(self.my_array, vector, w=self.weights))

        assert len(distances) == self.other_arrays.shape[0], "Distance count does not match the number of other locations"

        self.other_df[f'distance to {self.fips}'] = distances

        self.euclidean_kins = self.other_df.sort_values(by=self.other_df.columns[-1])
        # TODO_Nikodem make sure this returns df with the original variable values, prior to normalization and rescaling

    def plot_kins(self):
        if self.outcome_var == "gdp":
            self.data.get_gdp_long()
            my_outcomes_long = self.data.gdp_long[self.data.gdp_long['GeoFIPS'] == self.fips].copy()

            fips_top = self.euclidean_kins['GeoFIPS'].iloc[:self.top].values

            others_outcome_long = self.data.gdp_long[self.data.gdp_long['GeoFIPS'].isin(fips_top)]

        fig = go.Figure()
        fig.add_trace(go.Scatter(x=my_outcomes_long['Year'], y=my_outcomes_long['Value'],
                                 mode='lines', name=my_outcomes_long['GeoName'].iloc[0],
                                 line=dict(color='darkred', width=3),
                                 text=my_outcomes_long['GeoName'].iloc[0],
                                 textposition='top right'
                                 ))

        # TODO_Nikodem add more shades and test on various settings of top
        shades_of_grey = ['#333333', '#444444', '#555555', '#666666', '#777777'][:self.top]
        pastel_colors = ['#FFC0CB', '#A9A9A9', '#87CEFA', '#FFD700', '#98FB98'][:self.top]

        # R: not sure which looks better

        for i, geoname in enumerate(others_outcome_long['GeoName'].unique()):
            subset = others_outcome_long[others_outcome_long['GeoName'] == geoname]
            # line_color = shades_of_grey[i % len(shades_of_grey)]
            line_color = pastel_colors[i % len(pastel_colors)]
            fig.add_trace(go.Scatter(x=subset['Year'] + self.lag, y=subset['Value'],
                                     mode='lines', name=subset['GeoName'].iloc[0],
                                     line_color=line_color,
                                     text=subset['GeoName'].iloc[0],
                                     textposition='top right'
                                     ))

        if self.lag > 0:
            fig.update_layout(
                shapes=[
                    dict(
                        type='line',
                        x0=2021,
                        x1=2021,
                        y0=0,
                        y1=1,
                        xref='x',
                        yref='paper',
                        line=dict(color='darkgray', width=2)
                    )
                ]
            )

            fig.add_annotation(
                text=f'their year {2021 - self.lag}',
                x=2021,
                y=1.05,
                xref='x',
                yref='paper',
                showarrow=False,
                font=dict(color='darkgray')
            )

        fig.update_layout(
            title=f'Top {self.top} locations whose GDP patterns up to year {2021-self.lag} are most similar to the current pattern of {self.name}',
            xaxis_title='Year',
            yaxis_title='Chain-type quantity indexes for real GDP',
            legend=dict(title='GeoName'),
            template="simple_white",
        )

        fig.show()


# TODO_Nikodem add population clustering and warning if a population is much different,
# especially if small
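
Putting it together, the query class is meant to be used roughly as follows (a sketch, assuming the processed GDP files exist; 27053 is the FIPS code for Hennepin County, MN, used here purely for illustration):

from cities.utils.fips_query import FipsQuery

# rank locations by Euclidean distance between Hennepin County's recent GDP
# trajectory and other locations' trajectories shifted back by `lag` years
query = FipsQuery(fips=27053, outcome_var="gdp", lag=2, top=5)
query.find_euclidean_kins()
query.plot_kins()

# the full ranking remains available on the instance
print(query.euclidean_kins[['GeoFIPS', 'GeoName']].head())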






33 changes: 33 additions & 0 deletions cities/utils/similarity_utils.py
@@ -0,0 +1,33 @@

from typing import Dict

import numpy as np
import pandas as pd


def slice_with_lag(df: pd.DataFrame, fips: int, lag: int) -> Dict[str, np.ndarray]:
    """
    Takes a pandas DataFrame, a location FIPS code and a lag (in years), and
    returns a dictionary with:
    - my_array: the array of features for the location with the given FIPS
    - other_arrays: the array of features for all other locations
    - other_df: the DataFrame of all other locations
    If lag > 0, drops the first `lag` feature columns from my_array and the
    last `lag` feature columns from other_arrays.
    Meant to be used prior to calculating similarity.
    """
    original_length = df.shape[0]

    # this assumes the input df has two columns of metadata, then the rest
    # are features -- obey this convention with other datasets!
    my_array = np.array(df[df['GeoFIPS'] == fips].values[0][2 + lag:].copy())
    other_df = df[df['GeoFIPS'] != fips].copy()

    if lag > 0:
        other_df_cut = other_df.iloc[:, 2:-lag]
    else:
        other_df_cut = other_df.iloc[:, 2:]
    other_arrays = np.array(other_df_cut.values)

    assert other_arrays.shape[0] + 1 == original_length, "Dataset sizes don't match"
    assert other_arrays.shape[1] == my_array.shape[0], "Lengths don't match"

    return {'my_array': my_array, 'other_arrays': other_arrays, 'other_df': other_df}
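
A small sketch of what the lag does to the two slices (toy data; the first two columns are metadata, per the convention noted above):

import pandas as pd

from cities.utils.similarity_utils import slice_with_lag

toy = pd.DataFrame({
    'GeoFIPS': [1, 2, 3],
    'GeoName': ['A', 'B', 'C'],
    '2001': [1.0, 2.0, 3.0],
    '2002': [4.0, 5.0, 6.0],
    '2003': [7.0, 8.0, 9.0],
})

slices = slice_with_lag(toy, fips=1, lag=1)
print(slices['my_array'])      # [4.0 7.0] -- location 1, its last two years
print(slices['other_arrays'])  # [[2. 5.] [3. 6.]] -- the others, their first two years

With lag = 1, location 1's 2002-2003 values are compared against the other locations' 2001-2002 values, i.e. against their trajectories one year earlier.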
Empty file added data/processed/.gitkeep
Empty file.
Binary file added data/processed/.xdp-.~gdp_std_wide.csv-KVPFNt
Binary file not shown.