Merge pull request #5 from BasisResearch/ru-add-gdp
Added gdp data and the minimal working version
Showing 33 changed files with 143,289 additions and 1 deletion.
.gitignore
@@ -1 +1,20 @@
venv

*.pyc
**/*.pyc
tests/__pycache__/
tests/__pycache__/test_data_grabber.cpython-310-pytest-7.2.0.pyc
tests/__pycache__/test_cleaning_utils.cpython-310-pytest-7.2.0.pyc
.vscode/settings.json
cities/utils/__pycache__/__init__.cpython-310.pyc
cities/utils/__pycache__/__init__.cpython-310.pyc
cities/utils/__pycache__/cleaning_utils.cpython-310.pyc
cities/utils/__pycache__/data_grabber.cpython-310.pyc
tests/__pycache__/test_cleaning_utils.cpython-310-pytest-7.2.0.pyc
tests/__pycache__/test_data_grabber.cpython-310-pytest-7.2.0.pyc
.vscode/settings.json
@@ -0,0 +1,6 @@
{
    "[python]": {
        "editor.defaultFormatter": "ms-python.black-formatter"
    },
    "python.formatting.provider": "none"
}
Binary file not shown.
cities/utils/__init__.py
@@ -0,0 +1,4 @@
from .cleaning_utils import standardize_and_scale, find_repo_root
from .data_grabber import DataGrabber
from .fips_query import FipsQuery
from .similarity_utils import slice_with_lag
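With this __init__ in place, the main entry points can be imported directly from the package. A minimal sketch, assuming the repository is installed or on sys.path:

    from cities.utils import DataGrabber, FipsQuery, standardize_and_scale, slice_with_lag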
Binary file not shown.
Binary file not shown.
Binary file not shown.
@@ -0,0 +1,11 @@
import sys
import os

from cleaning_utils import find_repo_root
sys.path.insert(0, find_repo_root())

from cleaning_utils import clean_gdp


clean_gdp()
cities/utils/cleaning_utils.py
@@ -0,0 +1,124 @@
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
import os
import sys


def find_repo_root():
    """
    Finds the repo root (the folder containing .gitignore) and returns its path.
    """
    current_dir = os.getcwd()
    while True:
        marker_file_path = os.path.join(current_dir, '.gitignore')
        if os.path.isfile(marker_file_path):
            return current_dir

        parent_dir = os.path.dirname(current_dir)
        if parent_dir == current_dir:
            break
        current_dir = parent_dir
    return current_dir


def standardize_and_scale(data: pd.DataFrame) -> pd.DataFrame:
    """
    Standardizes and scales float columns in a DataFrame to [-1, 1], copying other columns. Returns a new DataFrame.
    """
    standard_scaler = StandardScaler()

    new_data = pd.DataFrame()
    for column in data.columns:
        if data.dtypes[column] != 'float64':
            new_data[column] = data[column].copy()
        else:
            new = data[column].copy().values.reshape(-1, 1)
            new = standard_scaler.fit_transform(new)

            positive_mask = new >= 0
            negative_mask = new < 0

            min_positive = np.min(new[positive_mask])
            max_positive = np.max(new[positive_mask])
            scaled_positive = (new[positive_mask] - min_positive) / (max_positive - min_positive)

            min_negative = np.min(new[negative_mask])
            max_negative = np.max(new[negative_mask])
            scaled_negative = (new[negative_mask] - min_negative) / (max_negative - min_negative) - 1

            scaled_values = np.empty_like(new, dtype=float)
            scaled_values[positive_mask] = scaled_positive
            scaled_values[negative_mask] = scaled_negative

            new_data[column] = scaled_values.reshape(-1)

    return new_data


def clean_gdp():
    gdp = pd.read_csv("data/raw/CAGDP1_2001_2021.csv", encoding='ISO-8859-1')

    gdp = gdp.loc[:9533]  # drop notes at the bottom

    gdp['GeoFIPS'] = gdp['GeoFIPS'].fillna('').astype(str)
    gdp['GeoFIPS'] = gdp['GeoFIPS'].str.strip(' "').astype(int)

    # remove large regions
    gdp = gdp[gdp['GeoFIPS'] % 100 != 0]

    # focus on chain-type GDP
    mask = gdp['Description'].str.startswith('Chain')
    gdp = gdp[mask]

    # drop Region number, Tablename, LineCode, IndustryClassification columns (the last one is empty anyway)
    gdp = gdp.drop(gdp.columns[2:8], axis=1)

    # 2012 makes no sense, it's 100 throughout
    gdp = gdp.drop('2012', axis=1)

    gdp.replace('(NA)', np.nan, inplace=True)
    gdp.replace('(NM)', np.nan, inplace=True)

    # nan_rows = gdp[gdp.isna().any(axis=1)]  # if inspection is needed

    gdp.dropna(axis=0, inplace=True)

    for column in gdp.columns[2:]:
        gdp[column] = gdp[column].astype(float)

    assert gdp['GeoName'].is_unique

    for column in gdp.columns[2:]:
        assert (gdp[column] > 0).all(), f"Negative values in {column}"
        assert (gdp[column].isna().sum() == 0), f"Missing values in {column}"
        assert (gdp[column].isnull().sum() == 0), f"Null values in {column}"
        assert (gdp[column] < 3000).all(), f"Values suspiciously large in {column}"

    # TODO_Nikodem investigate strange large values

    gdp_wide = gdp.copy()
    gdp_long = pd.melt(gdp.copy(), id_vars=['GeoFIPS', 'GeoName'],
                       var_name='Year',
                       value_name='Value')

    gdp_std_wide = standardize_and_scale(gdp)
    gdp_std_long = pd.melt(gdp_std_wide.copy(), id_vars=['GeoFIPS', 'GeoName'],
                           var_name='Year',
                           value_name='Value')

    gdp_wide.to_csv("data/processed/gdp_wide.csv", index=False)
    gdp_long.to_csv("data/processed/gdp_long.csv", index=False)
    gdp_std_wide.to_csv("data/processed/gdp_std_wide.csv", index=False)
    gdp_std_long.to_csv("data/processed/gdp_std_long.csv", index=False)
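For reference, a minimal sketch of what standardize_and_scale does to a small frame; the column names and values below are made up for illustration:

    import pandas as pd
    from cities.utils.cleaning_utils import standardize_and_scale

    # toy frame: two metadata columns plus one float column (illustrative values)
    toy = pd.DataFrame({
        "GeoFIPS": [1001, 1003, 1005, 1007],
        "GeoName": ["A", "B", "C", "D"],
        "2020": [90.0, 95.0, 105.0, 110.0],
    })

    scaled = standardize_and_scale(toy)
    # non-float columns are copied unchanged; the float column is z-scored and then
    # squeezed so positive z-scores land in [0, 1] and negative ones in [-1, 0]
    print(scaled)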
cities/utils/data_grabber.py
@@ -0,0 +1,33 @@
import os
import sys
import pandas as pd


parent_dir = os.path.abspath(os.path.join(os.getcwd(), os.pardir))
sys.path.insert(0, parent_dir)


from cities.utils.cleaning_utils import find_repo_root


class DataGrabber:
    def __init__(self):
        self.repo_root = find_repo_root()
        sys.path.insert(0, self.repo_root)

    def get_gdp_wide(self):
        file_path = os.path.join(self.repo_root, "data/processed/gdp_wide.csv")
        self.gdp_wide = pd.read_csv(file_path)

    def get_gdp_std_wide(self):
        file_path = os.path.join(self.repo_root, "data/processed/gdp_std_wide.csv")
        self.gdp_std_wide = pd.read_csv(file_path)

    def get_gdp_long(self):
        file_path = os.path.join(self.repo_root, "data/processed/gdp_long.csv")
        self.gdp_long = pd.read_csv(file_path)

    def get_gdp_std_long(self):
        file_path = os.path.join(self.repo_root, "data/processed/gdp_std_long.csv")
        self.gdp_std_long = pd.read_csv(file_path)
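A minimal usage sketch for DataGrabber, assuming the processed CSVs from clean_gdp() already exist under data/processed/ and that the working directory is inside the repo (find_repo_root() walks up from os.getcwd()):

    from cities.utils.data_grabber import DataGrabber

    data = DataGrabber()
    data.get_gdp_wide()       # populates data.gdp_wide from data/processed/gdp_wide.csv
    data.get_gdp_std_long()   # populates data.gdp_std_long from data/processed/gdp_std_long.csv

    print(data.gdp_wide.shape)
    print(data.gdp_std_long.head())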
cities/utils/fips_query.py
@@ -0,0 +1,151 @@
import os
import sys

parent_dir = os.path.abspath(os.path.join(os.getcwd(), os.pardir))
sys.path.insert(0, parent_dir)


import pandas as pd
import numpy as np

import plotly.graph_objects as go
from scipy.spatial import distance
from cities.utils.data_grabber import DataGrabber
from cities.utils.similarity_utils import slice_with_lag


class FipsQuery:

    def __init__(self, fips, outcome_var="gdp", feature_groups=[], weights=None, lag=0, top=5):

        # TODO add weights rescaling to init
        # TODO with a non-trivial example of feature groups

        assert outcome_var in ["gdp"], "outcome_var must be one of ['gdp']"  # TODO expand to other outcome vars

        self.data = DataGrabber()
        self.repo_root = self.data.repo_root
        self.fips = fips
        self.lag = lag
        self.top = top
        self.outcome_var = outcome_var
        self.weights = weights

        self.data.get_gdp_std_wide()
        self.name = self.data.gdp_std_wide['GeoName'][self.data.gdp_std_wide['GeoFIPS'] == self.fips].values[0]

        assert self.lag >= 0 and self.lag < 6 and isinstance(self.lag, int), "lag must be an integer between 0 and 5"
        assert (self.top > 0 and isinstance(self.top, int) and
                self.top < self.data.gdp_std_wide.shape[0]), (
            "top must be a positive integer smaller than the number of locations in the dataset"
        )

    def find_euclidean_kins(self):  # TODO_Nikodem add a test for this function

        if self.outcome_var == "gdp":
            slices = slice_with_lag(self.data.gdp_std_wide, self.fips, self.lag)

            self.my_array = np.array(slices['my_array'])
            self.other_arrays = np.array(slices['other_arrays'])
            self.other_df = slices['other_df']

        # TODO add other features here
        # TODO will need to have the same fips codes in the same order in all other datasets
        # TODO for the feature addition to be introduced here to work smoothly

        distances = []
        for vector in self.other_arrays:
            distances.append(distance.euclidean(self.my_array, vector, w=self.weights))

        assert len(distances) == self.other_arrays.shape[0], "Something went wrong"

        self.other_df[f'distance to {self.fips}'] = distances

        self.euclidean_kins = self.other_df.sort_values(by=self.other_df.columns[-1])
        # TODO_Nikodem make sure this returns df with the original variable values, prior to normalization and rescaling

    def plot_kins(self):
        if self.outcome_var == "gdp":
            self.data.get_gdp_long()
            my_outcomes_long = self.data.gdp_long[self.data.gdp_long['GeoFIPS'] == self.fips].copy()

            fips_top = self.euclidean_kins['GeoFIPS'].iloc[:self.top].values

            others_outcome_long = self.data.gdp_long[self.data.gdp_long['GeoFIPS'].isin(fips_top)]

        fig = go.Figure()
        fig.add_trace(go.Scatter(x=my_outcomes_long['Year'], y=my_outcomes_long['Value'],
                                 mode='lines', name=my_outcomes_long['GeoName'].iloc[0],
                                 line=dict(color='darkred', width=3),
                                 text=my_outcomes_long['GeoName'].iloc[0],
                                 textposition='top right'
                                 ))

        # TODO_Nikodem add more shades and test on various settings of top
        shades_of_grey = ['#333333', '#444444', '#555555', '#666666', '#777777'][:self.top]
        pastel_colors = ['#FFC0CB', '#A9A9A9', '#87CEFA', '#FFD700', '#98FB98'][:self.top]

        # R: not sure which look better

        for i, geoname in enumerate(others_outcome_long['GeoName'].unique()):
            subset = others_outcome_long[others_outcome_long['GeoName'] == geoname]
            # line_color = shades_of_grey[i % len(shades_of_grey)]
            line_color = pastel_colors[i % len(pastel_colors)]
            fig.add_trace(go.Scatter(x=subset['Year'] + self.lag, y=subset['Value'],
                                     mode='lines', name=subset['GeoName'].iloc[0],
                                     line_color=line_color,
                                     text=subset['GeoName'].iloc[0],
                                     textposition='top right'
                                     ))

        if self.lag > 0:
            fig.update_layout(
                shapes=[
                    dict(
                        type='line',
                        x0=2021,
                        x1=2021,
                        y0=0,
                        y1=1,
                        xref='x',
                        yref='paper',
                        line=dict(color='darkgray', width=2)
                    )
                ]
            )

            fig.add_annotation(
                text=f'their year {2021 - self.lag}',
                x=2021.,
                y=1.05,
                xref='x',
                yref='paper',
                showarrow=False,
                font=dict(color='darkgray')
            )

        fig.update_layout(
            title=f'Top {self.top} locations whose GDP patterns up to year {2021-self.lag} are most similar to the current pattern of {self.name}',
            xaxis_title='Year',
            yaxis_title='Chain-type quantity indexes for real GDP',
            legend=dict(title='GeoName'),
            template="simple_white",
        )

        fig.show()

# TODO_Nikodem add population clustering and warning if a population is much different,
# especially if small
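A minimal sketch of the intended query workflow; the FIPS code below is only a hypothetical example and assumes that county is present in the processed GDP data:

    from cities.utils import FipsQuery

    query = FipsQuery(fips=1001, outcome_var="gdp", lag=2, top=5)  # 1001 is an example FIPS, not guaranteed to be in the data
    query.find_euclidean_kins()   # ranks all other locations by Euclidean distance on standardized GDP
    query.plot_kins()             # plots the top-5 most similar GDP trajectories, shifted by the lag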
cities/utils/similarity_utils.py
@@ -0,0 +1,33 @@

from typing import Dict, List, Union
import numpy as np
import pandas as pd


def slice_with_lag(df: pd.DataFrame, fips: int, lag: int) -> Dict[str, np.ndarray]:
    """
    Takes a pandas dataframe, a location FIPS and a lag (years),
    returns a dictionary with:
    - my_array: the array of features for the location with the given FIPS
    - other_arrays: the array of features for all other locations
    - other_df: the dataframe of all other locations
    If lag > 0, drops the first `lag` columns from my_array and the last `lag` columns from other_arrays.
    Meant to be used prior to calculating similarity.
    """
    original_length = df.shape[0]

    # this assumes the input df has two columns of metadata, then the rest are features
    # obey this convention with other datasets!
    my_array = np.array(df[df['GeoFIPS'] == fips].values[0][2 + lag:].copy())
    other_df = df[df['GeoFIPS'] != fips].copy()

    if lag > 0:
        other_df_cut = other_df.iloc[:, 2:-lag]
        other_arrays = np.array(other_df_cut.values)
    else:
        other_df_cut = other_df.iloc[:, 2:]
        other_arrays = np.array(other_df_cut.values)

    assert other_arrays.shape[0] + 1 == original_length, "Dataset sizes don't match"
    assert other_arrays.shape[1] == my_array.shape[0], "Lengths don't match"

    return {'my_array': my_array, 'other_arrays': other_arrays, 'other_df': other_df}
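To illustrate slice_with_lag on a toy frame that follows the two-metadata-columns convention (the FIPS codes and values are made up):

    import pandas as pd
    from cities.utils.similarity_utils import slice_with_lag

    toy = pd.DataFrame({
        "GeoFIPS": [1001, 1003, 1005],
        "GeoName": ["A", "B", "C"],
        "2019": [1.0, 2.0, 3.0],
        "2020": [1.5, 2.5, 3.5],
        "2021": [2.0, 3.0, 4.0],
    })

    out = slice_with_lag(toy, fips=1001, lag=1)
    print(out["my_array"])      # features of 1001 with the first year dropped
    print(out["other_arrays"])  # rows for 1003 and 1005 with the last year dropped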
Empty file.
Binary file not shown.