
Commit

Merge pull request #5 from BasisResearch/ru-add-gdp
Added gdp data and the minimal working version
riadas authored Oct 13, 2023
2 parents 30720c5 + 0cb588d commit 428e04d
Showing 33 changed files with 143,289 additions and 1 deletion.
2 changes: 1 addition & 1 deletion .github/workflows/python-app.yml
@@ -36,4 +36,4 @@ jobs:
        flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics
    - name: Test with pytest
      run: |
-       python -m pytest tests/
+       cd tests && python -m pytest && cd ..
19 changes: 19 additions & 0 deletions .gitignore
@@ -1 +1,20 @@
venv







*.pyc
**/*.pyc
tests/__pycache__/
tests/__pycache__/test_data_grabber.cpython-310-pytest-7.2.0.pyc
tests/__pycache__/test_cleaning_utils.cpython-310-pytest-7.2.0.pyc
.vscode/settings.json
cities/utils/__pycache__/__init__.cpython-310.pyc
cities/utils/__pycache__/__init__.cpython-310.pyc
cities/utils/__pycache__/cleaning_utils.cpython-310.pyc
cities/utils/__pycache__/data_grabber.cpython-310.pyc
tests/__pycache__/test_cleaning_utils.cpython-310-pytest-7.2.0.pyc
tests/__pycache__/test_data_grabber.cpython-310-pytest-7.2.0.pyc
6 changes: 6 additions & 0 deletions .vscode/settings.json
@@ -0,0 +1,6 @@
{
    "[python]": {
        "editor.defaultFormatter": "ms-python.black-formatter"
    },
    "python.formatting.provider": "none"
}
Binary file added cities/__pycache__/__init__.cpython-310.pyc
Binary file not shown.
4 changes: 4 additions & 0 deletions cities/utils/__init__.py
@@ -0,0 +1,4 @@
from .cleaning_utils import standardize_and_scale, find_repo_root
from .data_grabber import DataGrabber
from .fips_query import FipsQuery
from .similarity_utils import slice_with_lag
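
These re-exports make the package's public helpers importable from the package root. A one-line sketch (assuming the repo root is on sys.path):

from cities.utils import DataGrabber, FipsQuery, slice_with_lag, standardize_and_scale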
Binary file added cities/utils/__pycache__/__init__.cpython-310.pyc
Binary file not shown.
Binary file not shown.
Binary file not shown.
11 changes: 11 additions & 0 deletions cities/utils/cleaning_pipeline.py
@@ -0,0 +1,11 @@
import sys

from cleaning_utils import find_repo_root

# make the repo root importable before pulling in the cleaning step
sys.path.insert(0, find_repo_root())

from cleaning_utils import clean_gdp

clean_gdp()
124 changes: 124 additions & 0 deletions cities/utils/cleaning_utils.py
@@ -0,0 +1,124 @@
import os

import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler


def find_repo_root():
    """
    Finds the repo root (the folder containing .gitignore) and returns its path.
    """
    current_dir = os.getcwd()
    while True:
        marker_file_path = os.path.join(current_dir, '.gitignore')
        if os.path.isfile(marker_file_path):
            return current_dir

        parent_dir = os.path.dirname(current_dir)
        if parent_dir == current_dir:
            break
        current_dir = parent_dir
    return current_dir


def standardize_and_scale(data: pd.DataFrame) -> pd.DataFrame:
    """
    Standardizes and scales float columns in a DataFrame to [-1,1], copying other columns. Returns a new DataFrame.
    """
    standard_scaler = StandardScaler()

    new_data = pd.DataFrame()
    for column in data.columns:
        if data.dtypes[column] != 'float64':
            new_data[column] = data[column].copy()
        else:
            new = data[column].copy().values.reshape(-1, 1)
            new = standard_scaler.fit_transform(new)

            # rescale the standardized values to [-1, 1]: positives map to [0, 1],
            # negatives to [-1, 0) (note: each sign needs at least two distinct
            # values, otherwise the min-max denominator below is zero)
            positive_mask = new >= 0
            negative_mask = new < 0

            min_positive = np.min(new[positive_mask])
            max_positive = np.max(new[positive_mask])
            scaled_positive = (new[positive_mask] - min_positive) / (max_positive - min_positive)

            min_negative = np.min(new[negative_mask])
            max_negative = np.max(new[negative_mask])
            scaled_negative = (new[negative_mask] - min_negative) / (max_negative - min_negative) - 1

            scaled_values = np.empty_like(new, dtype=float)
            scaled_values[positive_mask] = scaled_positive
            scaled_values[negative_mask] = scaled_negative

            new_data[column] = scaled_values.reshape(-1)

    return new_data


def clean_gdp():
    gdp = pd.read_csv("data/raw/CAGDP1_2001_2021.csv", encoding='ISO-8859-1')

    gdp = gdp.loc[:9533]  # drop the notes at the bottom of the file

    gdp['GeoFIPS'] = gdp['GeoFIPS'].fillna('').astype(str)
    gdp['GeoFIPS'] = gdp['GeoFIPS'].str.strip(' "').astype(int)

    # remove large regions (GeoFIPS ending in 00 are state/US aggregates)
    gdp = gdp[gdp['GeoFIPS'] % 100 != 0]

    # focus on chain-type GDP
    mask = gdp['Description'].str.startswith('Chain')
    gdp = gdp[mask]

    # drop the Region, TableName, LineCode, IndustryClassification, Description,
    # and Unit columns (IndustryClassification is empty anyway)
    gdp = gdp.drop(gdp.columns[2:8], axis=1)

    # drop 2012: it is the base year of the chain-type index, so it is 100 throughout
    gdp = gdp.drop('2012', axis=1)

    gdp.replace('(NA)', np.nan, inplace=True)
    gdp.replace('(NM)', np.nan, inplace=True)

    # nan_rows = gdp[gdp.isna().any(axis=1)]  # if inspection is needed

    gdp.dropna(axis=0, inplace=True)

    for column in gdp.columns[2:]:
        gdp[column] = gdp[column].astype(float)

    assert gdp['GeoName'].is_unique

    for column in gdp.columns[2:]:
        assert (gdp[column] > 0).all(), f"Negative values in {column}"
        assert (gdp[column].isna().sum() == 0), f"Missing values in {column}"
        assert (gdp[column].isnull().sum() == 0), f"Null values in {column}"
        assert (gdp[column] < 3000).all(), f"Values suspiciously large in {column}"

    # TODO_Nikodem investigate strange large values

    gdp_wide = gdp.copy()
    gdp_long = pd.melt(gdp.copy(), id_vars=['GeoFIPS', 'GeoName'],
                       var_name='Year',
                       value_name='Value')

    gdp_std_wide = standardize_and_scale(gdp)
    gdp_std_long = pd.melt(gdp_std_wide.copy(), id_vars=['GeoFIPS', 'GeoName'],
                           var_name='Year',
                           value_name='Value')

    gdp_wide.to_csv("data/processed/gdp_wide.csv", index=False)
    gdp_long.to_csv("data/processed/gdp_long.csv", index=False)
    gdp_std_wide.to_csv("data/processed/gdp_std_wide.csv", index=False)
    gdp_std_long.to_csv("data/processed/gdp_std_long.csv", index=False)
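
To make the scaling used by clean_gdp concrete, here is a minimal usage sketch of standardize_and_scale; the toy column names and values are illustrative only (each sign needs at least two values for the rescaling to be defined):

import pandas as pd

from cities.utils.cleaning_utils import standardize_and_scale

# toy frame: one metadata column and one float feature column
toy = pd.DataFrame({
    'GeoName': ['A', 'B', 'C', 'D'],
    '2001': [1.0, 2.0, 8.0, 9.0],
})

scaled = standardize_and_scale(toy)
print(scaled['GeoName'].tolist())  # unchanged: ['A', 'B', 'C', 'D']
print(scaled['2001'].tolist())     # approximately [-1.0, 0.0, 0.0, 1.0]

Non-float columns pass through untouched; each float column is standardized, then its positive part is mapped to [0, 1] and its negative part to [-1, 0).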


33 changes: 33 additions & 0 deletions cities/utils/data_grabber.py
@@ -0,0 +1,33 @@
import os
import sys

import pandas as pd

parent_dir = os.path.abspath(os.path.join(os.getcwd(), os.pardir))
sys.path.insert(0, parent_dir)

from cities.utils.cleaning_utils import find_repo_root


class DataGrabber:
    def __init__(self):
        self.repo_root = find_repo_root()
        sys.path.insert(0, self.repo_root)

    def get_gdp_wide(self):
        file_path = os.path.join(self.repo_root, "data/processed/gdp_wide.csv")
        self.gdp_wide = pd.read_csv(file_path)

    def get_gdp_std_wide(self):
        file_path = os.path.join(self.repo_root, "data/processed/gdp_std_wide.csv")
        self.gdp_std_wide = pd.read_csv(file_path)

    def get_gdp_long(self):
        file_path = os.path.join(self.repo_root, "data/processed/gdp_long.csv")
        self.gdp_long = pd.read_csv(file_path)

    def get_gdp_std_long(self):
        file_path = os.path.join(self.repo_root, "data/processed/gdp_std_long.csv")
        self.gdp_std_long = pd.read_csv(file_path)
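
A minimal usage sketch for DataGrabber, assuming the processed CSVs written by clean_gdp already exist under data/processed/. Each getter loads one CSV and stores it as an attribute rather than returning it:

from cities.utils.data_grabber import DataGrabber

data = DataGrabber()
data.get_gdp_std_wide()  # loads data/processed/gdp_std_wide.csv
data.get_gdp_long()      # loads data/processed/gdp_long.csv

print(data.gdp_std_wide.shape)         # wide: one row per location, one column per year
print(data.gdp_long.columns.tolist())  # ['GeoFIPS', 'GeoName', 'Year', 'Value']

Keeping the frames on the instance avoids re-reading the files across queries, at the cost of implicit state: callers must invoke the matching getter before touching the attribute.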
151 changes: 151 additions & 0 deletions cities/utils/fips_query.py
@@ -0,0 +1,151 @@
import os
import sys

parent_dir = os.path.abspath(os.path.join(os.getcwd(), os.pardir))
sys.path.insert(0, parent_dir)

import numpy as np
import pandas as pd
import plotly.graph_objects as go
from scipy.spatial import distance

from cities.utils.data_grabber import DataGrabber
from cities.utils.similarity_utils import slice_with_lag


class FipsQuery:

    def __init__(self, fips, outcome_var="gdp", feature_groups=[], weights=None, lag=0, top=5):

        # TODO add weights rescaling to init
        # TODO with a non-trivial example of feature groups

        assert outcome_var in ["gdp"], "outcome_var must be one of ['gdp']"  # TODO expand to other outcome vars

        self.data = DataGrabber()
        self.repo_root = self.data.repo_root
        self.fips = fips
        self.lag = lag
        self.top = top
        self.outcome_var = outcome_var
        self.weights = weights

        self.data.get_gdp_std_wide()
        self.name = self.data.gdp_std_wide['GeoName'][self.data.gdp_std_wide['GeoFIPS'] == self.fips].values[0]

        assert self.lag >= 0 and self.lag < 6 and isinstance(self.lag, int), "lag must be an integer between 0 and 5"
        assert (self.top > 0 and isinstance(self.top, int) and
                self.top < self.data.gdp_std_wide.shape[0]), (
            "top must be a positive integer smaller than the number of locations in the dataset"
        )

    def find_euclidean_kins(self):  # TODO_Nikodem add a test for this function

        if self.outcome_var == "gdp":
            slices = slice_with_lag(self.data.gdp_std_wide, self.fips, self.lag)

            self.my_array = np.array(slices['my_array'])
            self.other_arrays = np.array(slices['other_arrays'])
            self.other_df = slices['other_df']

        # TODO add other features here
        # TODO will need to have the same fips codes in the same order in all other datasets
        # TODO for the feature addition to be introduced here to work smoothly

        distances = []
        for vector in self.other_arrays:
            distances.append(distance.euclidean(self.my_array, vector, w=self.weights))

        assert len(distances) == self.other_arrays.shape[0], "Distance count does not match the number of other locations"

        self.other_df[f'distance to {self.fips}'] = distances

        self.euclidean_kins = self.other_df.sort_values(by=self.other_df.columns[-1])
        # TODO_Nikodem make sure this returns df with the original variable values, prior to normalization and rescaling

    def plot_kins(self):
        if self.outcome_var == "gdp":
            self.data.get_gdp_long()
            my_outcomes_long = self.data.gdp_long[self.data.gdp_long['GeoFIPS'] == self.fips].copy()

            fips_top = self.euclidean_kins['GeoFIPS'].iloc[:self.top].values

            others_outcome_long = self.data.gdp_long[self.data.gdp_long['GeoFIPS'].isin(fips_top)]

        fig = go.Figure()
        fig.add_trace(go.Scatter(x=my_outcomes_long['Year'], y=my_outcomes_long['Value'],
                                 mode='lines', name=my_outcomes_long['GeoName'].iloc[0],
                                 line=dict(color='darkred', width=3),
                                 text=my_outcomes_long['GeoName'].iloc[0],
                                 textposition='top right'
                                 ))

        # TODO_Nikodem add more shades and test on various settings of top
        shades_of_grey = ['#333333', '#444444', '#555555', '#666666', '#777777'][:self.top]
        pastel_colors = ['#FFC0CB', '#A9A9A9', '#87CEFA', '#FFD700', '#98FB98'][:self.top]

        # R: not sure which looks better

        for i, geoname in enumerate(others_outcome_long['GeoName'].unique()):
            subset = others_outcome_long[others_outcome_long['GeoName'] == geoname]
            # line_color = shades_of_grey[i % len(shades_of_grey)]
            line_color = pastel_colors[i % len(pastel_colors)]
            fig.add_trace(go.Scatter(x=subset['Year'] + self.lag, y=subset['Value'],
                                     mode='lines', name=subset['GeoName'].iloc[0],
                                     line_color=line_color,
                                     text=subset['GeoName'].iloc[0],
                                     textposition='top right'
                                     ))

        if self.lag > 0:
            fig.update_layout(
                shapes=[
                    dict(
                        type='line',
                        x0=2021,
                        x1=2021,
                        y0=0,
                        y1=1,
                        xref='x',
                        yref='paper',
                        line=dict(color='darkgray', width=2)
                    )
                ]
            )

            fig.add_annotation(
                text=f'their year {2021 - self.lag}',
                x=2021,
                y=1.05,
                xref='x',
                yref='paper',
                showarrow=False,
                font=dict(color='darkgray')
            )

        fig.update_layout(
            title=f'Top {self.top} locations whose GDP patterns up to year {2021-self.lag} are most similar to the current pattern of {self.name}',
            xaxis_title='Year',
            yaxis_title='Chain-type quantity indexes for real GDP',
            legend=dict(title='GeoName'),
            template="simple_white",
        )

        fig.show()


# TODO_Nikodem add population clustering and warning if a population is much different,
# especially if small
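
Putting it together, the query class is meant to be used roughly as follows (a sketch, assuming the processed GDP files exist; 27053 is the FIPS code for Hennepin County, MN, used here purely for illustration):

from cities.utils.fips_query import FipsQuery

# rank locations by Euclidean distance between Hennepin County's recent GDP
# trajectory and other locations' trajectories shifted back by `lag` years
query = FipsQuery(fips=27053, outcome_var="gdp", lag=2, top=5)
query.find_euclidean_kins()
query.plot_kins()

# the full ranking remains available on the instance
print(query.euclidean_kins[['GeoFIPS', 'GeoName']].head())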






33 changes: 33 additions & 0 deletions cities/utils/similarity_utils.py
@@ -0,0 +1,33 @@

from typing import Dict

import numpy as np
import pandas as pd


def slice_with_lag(df: pd.DataFrame, fips: int, lag: int) -> Dict[str, np.ndarray]:
    """
    Takes a pandas DataFrame, a location FIPS code and a lag (in years), and
    returns a dictionary with:
    - my_array: the array of features for the location with the given FIPS
    - other_arrays: the array of features for all other locations
    - other_df: the DataFrame of all other locations
    If lag > 0, drops the first `lag` feature columns from my_array and the
    last `lag` feature columns from other_arrays.
    Meant to be used prior to calculating similarity.
    """
    original_length = df.shape[0]

    # this assumes the input df has two columns of metadata, then the rest
    # are features -- obey this convention with other datasets!
    my_array = np.array(df[df['GeoFIPS'] == fips].values[0][2 + lag:].copy())
    other_df = df[df['GeoFIPS'] != fips].copy()

    if lag > 0:
        other_df_cut = other_df.iloc[:, 2:-lag]
    else:
        other_df_cut = other_df.iloc[:, 2:]
    other_arrays = np.array(other_df_cut.values)

    assert other_arrays.shape[0] + 1 == original_length, "Dataset sizes don't match"
    assert other_arrays.shape[1] == my_array.shape[0], "Lengths don't match"

    return {'my_array': my_array, 'other_arrays': other_arrays, 'other_df': other_df}
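
A small sketch of what the lag does to the two slices (toy data; the first two columns are metadata, per the convention noted above):

import pandas as pd

from cities.utils.similarity_utils import slice_with_lag

toy = pd.DataFrame({
    'GeoFIPS': [1, 2, 3],
    'GeoName': ['A', 'B', 'C'],
    '2001': [1.0, 2.0, 3.0],
    '2002': [4.0, 5.0, 6.0],
    '2003': [7.0, 8.0, 9.0],
})

slices = slice_with_lag(toy, fips=1, lag=1)
print(slices['my_array'])      # [4.0 7.0] -- location 1, its last two years
print(slices['other_arrays'])  # [[2. 5.] [3. 6.]] -- the others, their first two years

With lag = 1, location 1's 2002-2003 values are compared against the other locations' 2001-2002 values, i.e. against their trajectories one year earlier.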
Empty file added data/processed/.gitkeep
Empty file.
Binary file added data/processed/.xdp-.~gdp_std_wide.csv-KVPFNt
Binary file not shown.