From 3fcf4ace44ea14a2426da02f1a4012105910ce37 Mon Sep 17 00:00:00 2001 From: Riley Hales Date: Sun, 14 Aug 2022 23:49:29 -0600 Subject: [PATCH 01/24] use parquet, streamline functions --- README.md | 8 +-- conda-environment.yml | 15 +++++ examples/example_script.py | 2 +- saber/__init__.py | 6 +- saber/_calibrate.py | 118 +++++++++++++++++++++---------------- saber/_vocab.py | 28 +++++++-- saber/_workflow.py | 5 +- saber/analysis.py | 2 +- saber/cluster.py | 11 ++-- saber/gis.py | 47 +++++++++------ saber/prep.py | 98 +++++++++++++++++------------- saber/utils.py | 15 +++-- 12 files changed, 219 insertions(+), 136 deletions(-) create mode 100644 conda-environment.yml diff --git a/README.md b/README.md index 42d10cc..dfde9bd 100755 --- a/README.md +++ b/README.md @@ -238,17 +238,17 @@ import saber as saber workdir = '/path/to/working/directory' -saber.prep.historical_simulation( +saber.prep.hindcast( workdir, - '/path/to/historical/simulation/netcdf.nc' # optional - if nc not stored in data_inputs folder + '/path/to/historical/simulation/netcdf.nc' # optional - if nc not stored in data_inputs folder ) saber.prep.hist_sim_table( workdir, - '/path/to/historical/simulation/netcdf.nc' # optional - if nc not stored in data_inputs folder + '/path/to/historical/simulation/netcdf.nc' # optional - if nc not stored in data_inputs folder ) saber.prep.observed_data( workdir, - '/path/to/obs/csv/directory' # optional - if csvs not stored in workdir/data_inputs/obs_csvs + '/path/to/obs/csv/directory' # optional - if csvs not stored in workdir/data_inputs/obs_csvs ) ``` diff --git a/conda-environment.yml b/conda-environment.yml new file mode 100644 index 0000000..ff491bd --- /dev/null +++ b/conda-environment.yml @@ -0,0 +1,15 @@ +name: saber +channels: + - conda-forge + - defaults +dependencies: + - python=3 + - geopandas + - pandas + - numpy + - contextily + - xarray + - netCDF4 + - tslearn + - pyarrow + - kneed diff --git a/examples/example_script.py b/examples/example_script.py index bedb6a4..b114050 100644 --- a/examples/example_script.py +++ b/examples/example_script.py @@ -30,7 +30,7 @@ # Prepare the observation and simulation data - Only need to do this step 1x ever print('Preparing data') -saber.prep.historical_simulation(workdir) +saber.prep.hindcast(workdir) # Generate the assignments table print('Generate Assignment Table') diff --git a/saber/__init__.py b/saber/__init__.py index 6b24826..8d13a4f 100644 --- a/saber/__init__.py +++ b/saber/__init__.py @@ -1,5 +1,5 @@ from saber._workflow import prep_region, analyze_region -from saber._calibrate import calibrate_stream, calibrate_region +from saber._calibrate import calibrate, calibrate_region import saber.table import saber.prep @@ -10,9 +10,11 @@ import saber.validate import saber.analysis +import saber._vocab + __all__ = ['prep_region', 'analyze_region', - 'calibrate_stream', 'calibrate_region', + 'calibrate', 'calibrate_region', 'table', 'prep', 'assign', 'gis', 'cluster', 'utils', 'validate', 'analysis'] __author__ = 'Riley Hales' __version__ = '0.3.0' diff --git a/saber/_calibrate.py b/saber/_calibrate.py index 2172509..7ead055 100644 --- a/saber/_calibrate.py +++ b/saber/_calibrate.py @@ -15,48 +15,67 @@ from ._vocab import asgn_gid_col from ._vocab import metric_list from ._vocab import metric_nc_name_list -from ._vocab import sim_ts_pickle +from ._vocab import hindcast_archive from ._vocab import cal_nc_name -def calibrate_stream(sim_flow_a: pd.DataFrame, obs_flow_a: pd.DataFrame, sim_flow_b: pd.DataFrame, - fix_seasonally: bool = True, 
- drop_outliers: bool = False, outlier_threshold: int or float = 2.5, - filter_scalar_fdc: bool = False, filter_range: tuple = (0, 80), - extrapolate_method: str = 'nearest', fill_value: int or float = None, - fit_gumbel: bool = False, gumbel_range: tuple = (25, 75), ) -> pd.DataFrame: +def calibrate(sim_flow_a: pd.DataFrame, obs_flow_a: pd.DataFrame, sim_flow_b: pd.DataFrame = None, + fix_seasonally: bool = True, empty_months: str = 'skip', + drop_outliers: bool = False, outlier_threshold: int or float = 2.5, + filter_scalar_fdc: bool = False, filter_range: tuple = (0, 80), + extrapolate_method: str = 'nearest', fill_value: int or float = None, + fit_gumbel: bool = False, gumbel_range: tuple = (25, 75), ) -> pd.DataFrame: """ - Given the simulated and observed stream flow at location a, attempts to the remove the bias from simulated - stream flow at point b. This + Removes the bias from simulated discharge using the SABER method. + + Given simulated and observed discharge at location A, removes bias from simulated data at point A. + Given simulated and observed discharge at location A, removes bias from simulated data at point B, if given B. Args: - sim_flow_a (pd.DataFrame): simulated hydrograph at point A - obs_flow_a (pd.DataFrame): observed hydrograph at point A - sim_flow_b (pd.DataFrame): simulated hydrograph at point B + sim_flow_a (pd.DataFrame): simulated hydrograph at point A. should contain a datetime index with daily values + and a single column of discharge values. + obs_flow_a (pd.DataFrame): observed hydrograph at point A. should contain a datetime index with daily values + and a single column of discharge values. + sim_flow_b (pd.DataFrame): (optional) simulated hydrograph at point B to correct using scalar flow duration + curve mapping and the bias relationship at point A. should contain a datetime index with daily values + and a single column of discharge values. fix_seasonally (bool): fix on a monthly (True) or annual (False) basis + empty_months (str): how to handle months in the simulated data where no observed data are available. Options: + "skip": ignore simulated data for months without drop_outliers (bool): flag to exclude outliers outlier_threshold (int or float): number of std deviations from mean to exclude from flow duration curve filter_scalar_fdc (bool): filter_range (tuple): - extrapolate_method (bool): + extrapolate_method (str): fill_value (int or float): fit_gumbel (bool): gumbel_range (tuple): Returns: - pd.DataFrame of the + pd.DataFrame with a DateTime index and columns with corrected flow, uncorrected flow, the scalar adjustment + factor applied to correct the discharge, and the percentile of the uncorrected flow (in the seasonal grouping, + if applicable). """ + if sim_flow_b is None: + sim_flow_b = sim_flow_a.copy() if fix_seasonally: # list of the unique months in the historical simulation. should always be 1->12 but just in case... monthly_results = [] for month in sorted(set(sim_flow_a.index.strftime('%m'))): - # filter data to only be current iteration's month - mon_sim_data = sim_flow_a[sim_flow_a.index.month == int(month)].dropna() + # filter data to current iteration's month mon_obs_data = obs_flow_a[obs_flow_a.index.month == int(month)].dropna() + + if mon_obs_data.empty: + if empty_months == 'skip': + continue + else: + raise ValueError(f'Invalid value for argument "empty_months". 
Given: {empty_months}.') + + mon_sim_data = sim_flow_a[sim_flow_a.index.month == int(month)].dropna() mon_cor_data = sim_flow_b[sim_flow_b.index.month == int(month)].dropna() - monthly_results.append(calibrate_stream( + monthly_results.append(calibrate( mon_sim_data, mon_obs_data, mon_cor_data, - fix_seasonally=False, + fix_seasonally=False, empty_months=empty_months, drop_outliers=drop_outliers, outlier_threshold=outlier_threshold, filter_scalar_fdc=filter_scalar_fdc, filter_range=filter_range, extrapolate_method=extrapolate_method, fill_value=fill_value, @@ -67,7 +86,7 @@ def calibrate_stream(sim_flow_a: pd.DataFrame, obs_flow_a: pd.DataFrame, sim_flo # compute the fdc for paired sim/obs data and compute scalar fdc, either with or without outliers if drop_outliers: - # drop outlier data + # drop outlier data before creating flow duration curves # https://stackoverflow.com/questions/23199796/detect-and-exclude-outliers-in-pandas-data-frame sim_fdc = compute_fdc( sim_flow_a[(np.abs(stats.zscore(sim_flow_a)) < outlier_threshold).all(axis=1)]) @@ -77,39 +96,16 @@ def calibrate_stream(sim_flow_a: pd.DataFrame, obs_flow_a: pd.DataFrame, sim_flo sim_fdc = compute_fdc(sim_flow_a) obs_fdc = compute_fdc(obs_flow_a) - scalar_fdc = compute_scalar_fdc(obs_fdc['flow'].values.flatten(), sim_fdc['flow'].values.flatten()) + scalar_fdc = compute_scalar_fdc(sim_fdc['flow'].values.flatten(), obs_fdc['flow'].values.flatten()) if filter_scalar_fdc: - scalar_fdc = scalar_fdc[scalar_fdc['Exceedance Probability'] >= filter_range[0]] - scalar_fdc = scalar_fdc[scalar_fdc['Exceedance Probability'] <= filter_range[1]] - - # Convert the percentiles - if extrapolate_method == 'nearest': - to_scalar = interpolate.interp1d(scalar_fdc.values[:, 0], scalar_fdc.values[:, 1], - fill_value='extrapolate', kind='nearest') - elif extrapolate_method == 'value': - to_scalar = interpolate.interp1d(scalar_fdc.values[:, 0], scalar_fdc.values[:, 1], - fill_value=fill_value, bounds_error=False) - elif extrapolate_method == 'linear': - to_scalar = interpolate.interp1d(scalar_fdc.values[:, 0], scalar_fdc.values[:, 1], - fill_value='extrapolate') - elif extrapolate_method == 'average': - to_scalar = interpolate.interp1d(scalar_fdc.values[:, 0], scalar_fdc.values[:, 1], - fill_value=np.mean(scalar_fdc.values[:, 1]), bounds_error=False) - elif extrapolate_method == 'max' or extrapolate_method == 'maximum': - to_scalar = interpolate.interp1d(scalar_fdc.values[:, 0], scalar_fdc.values[:, 1], - fill_value=np.max(scalar_fdc.values[:, 1]), bounds_error=False) - elif extrapolate_method == 'min' or extrapolate_method == 'minimum': - to_scalar = interpolate.interp1d(scalar_fdc.values[:, 0], scalar_fdc.values[:, 1], - fill_value=np.min(scalar_fdc.values[:, 1]), bounds_error=False) - else: - raise ValueError('Invalid extrapolation method provided') + scalar_fdc = scalar_fdc[scalar_fdc['p_exceed'].between(filter_range[0], filter_range[1])] # determine the percentile of each uncorrected flow using the monthly fdc values = sim_flow_b.values.flatten() percentiles = [stats.percentileofscore(values, a) for a in values] scalars = to_scalar(percentiles) - values = values * scalars + values = values / scalars if fit_gumbel: tmp = pd.DataFrame(np.transpose([values, percentiles]), columns=('q', 'p')) @@ -134,15 +130,37 @@ def calibrate_stream(sim_flow_a: pd.DataFrame, obs_flow_a: pd.DataFrame, sim_flo values = tmp['q'].values - return pd.DataFrame(data=np.transpose([values, scalars, percentiles]), + return pd.DataFrame(data=np.transpose([values, 
sim_flow_b.values.flatten(), scalars, percentiles]), index=sim_flow_b.index.to_list(), - columns=('flow', 'scalars', 'percentile')) + columns=('flow', 'uncorrected', 'scalars', 'percentile')) + + +def _make_interpolator(x_input: np.array, y_output: np.array, extrap_method: str = 'nearest', + const: int or float = None) -> interpolate.interp1d: + # make interpolator which converts the percentiles to scalars + if extrap_method == 'nearest': + interpolator = interpolate.interp1d(x_input, y_output, fill_value='extrapolate', kind='nearest') + elif extrap_method == 'const': + if const is None: + raise ValueError('Must provide the const kwarg when extrap_method="const"') + interpolator = interpolate.interp1d(x_input, y_output, fill_value=const, bounds_error=False) + elif extrap_method == 'linear': + interpolator = interpolate.interp1d(x_input, y_output, fill_value='extrapolate') + elif extrap_method == 'average': + interpolator = interpolate.interp1d(x_input, y_output, fill_value=np.mean(y_output), bounds_error=False) + elif extrap_method == 'max' or extrap_method == 'maximum': + interpolator = interpolate.interp1d(x_input, y_output, fill_value=np.max(y_output), bounds_error=False) + elif extrap_method == 'min' or extrap_method == 'minimum': + interpolator = interpolate.interp1d(x_input, y_output, fill_value=np.min(y_output), bounds_error=False) + else: + raise ValueError('Invalid extrapolation method provided') + return interpolator def calibrate_region(workdir: str, assign_table: pd.DataFrame, gauge_table: pd.DataFrame = None, obs_data_dir: str = None) -> None: """ - Creates a netCDF with the + Creates a netCDF of all corrected flows in a region. Args: workdir: path to project working directory @@ -159,7 +177,7 @@ def calibrate_region(workdir: str, assign_table: pd.DataFrame, obs_data_dir = os.path.join(workdir, 'data_inputs', 'obs_csvs') bcs_nc_path = os.path.join(workdir, cal_nc_name) - ts = pd.read_pickle(os.path.join(workdir, 'data_processed', sim_ts_pickle)) + ts = pd.read_pickle(os.path.join(workdir, 'data_processed', hindcast_archive)) # create the new netcdf bcs_nc = nc.Dataset(bcs_nc_path, 'w') @@ -238,7 +256,7 @@ def calibrate_region(workdir: str, assign_table: pd.DataFrame, continue try: - calibrated_df = calibrate_stream(ts[[asgn_mid]], obs_df, ts[[model_id]], ) + calibrated_df = calibrate(ts[[asgn_mid]], obs_df, ts[[model_id]], ) c_array[:, idx] = calibrated_df['flow'].values p_array[:, idx] = calibrated_df['percentile'].values s_array[:, idx] = calibrated_df['scalars'].values diff --git a/saber/_vocab.py b/saber/_vocab.py index 29a077d..dcde03e 100644 --- a/saber/_vocab.py +++ b/saber/_vocab.py @@ -1,5 +1,10 @@ +import os +import glob +import pandas as pd + # assign table and gis_input file required column names -mid_col = 'model_id' +# mid_col = 'model_id' +mid_col = 'COMID' gid_col = 'gauge_id' asgn_mid_col = 'assigned_model_id' asgn_gid_col = 'assigned_gauge_id' @@ -8,11 +13,26 @@ area_col = 'drain_area' order_col = 'stream_order' +# scaffolded folders +gis_inp = 'gis_inputs' +gis_out = 'gis_outputs' + # name of some files produced by the algorithm cluster_count_file = 'best-fit-cluster-count.json' cal_nc_name = 'calibrated_simulated_flow.nc' -sim_ts_pickle = 'sim_time_series.pickle' +hindcast_archive = 'hindcast.parquet' +hindcast_fdc_archive = 'hindcast_fdc.parquet' +drain_table = 'drain_table.parquet' +gauge_table = 'gauge_table.parquet' # metrics computed on validation sets -metric_list = ['ME', 'MAE', 'RMSE', 'NRMSE (Mean)', 'MAPE', 'NSE', 'KGE (2012)'] 
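The percentile-to-scalar mapping that `_make_interpolator` builds is the core of the correction step above: each uncorrected flow's exceedance percentile is looked up on the scalar flow duration curve and the flow is divided by the resulting ratio. Below is a minimal standalone sketch of that idea with made-up percentile, scalar, and flow values; only the `interp1d` call mirrors what the patch does.

```python
# Illustration only: a 'nearest' interpolator from exceedance percentiles to scalars,
# in the style of _make_interpolator, applied to a few hypothetical flows.
import numpy as np
from scipy import interpolate

p_exceed = np.array([0.0, 25.0, 50.0, 75.0, 100.0])  # hypothetical exceedance probabilities
scalars = np.array([1.8, 1.4, 1.1, 0.9, 0.7])        # hypothetical sim/obs FDC ratios

to_scalar = interpolate.interp1d(p_exceed, scalars, fill_value='extrapolate', kind='nearest')

flows = np.array([120.0, 35.0, 4.0])                 # hypothetical uncorrected flows
percentiles = np.array([10.0, 55.0, 95.0])           # their percentiles in the simulated record
corrected = flows / to_scalar(percentiles)           # ratios > 1 reduce the flow, < 1 increase it
```

Dividing by the scalar, rather than multiplying, matches the corrected `values = values / scalars` line introduced in this patch.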
-metric_nc_name_list = ['ME', 'MAE', 'RMSE', 'NRMSE', 'MAPE', 'NSE', 'KGE2012'] +metric_list = ['ME', 'MAE', 'RMSE', 'MAPE', 'NSE', 'KGE (2012)'] +metric_nc_name_list = ['ME', 'MAE', 'RMSE', 'MAPE', 'NSE', 'KGE2012'] + + +def read_drain_table(workdir: str) -> pd.DataFrame: + return pd.read_parquet(os.path.join(workdir, gis_inp, drain_table)) + + +def guess_hindcast_path(workdir: str) -> str: + return glob.glob(os.path.join(workdir, 'data_inputs', '*.nc*'))[0] diff --git a/saber/_workflow.py b/saber/_workflow.py index d313aba..0a97402 100644 --- a/saber/_workflow.py +++ b/saber/_workflow.py @@ -1,6 +1,6 @@ import pandas as pd -from .prep import historical_simulation +from .prep import hindcast from .table import gen, cache from .cluster import generate, summarize @@ -10,8 +10,7 @@ def prep_region(workdir: str) -> None: - historical_simulation(workdir) - # observed_data(workdir) + hindcast(workdir) return diff --git a/saber/analysis.py b/saber/analysis.py index b6ada4a..56a0f40 100644 --- a/saber/analysis.py +++ b/saber/analysis.py @@ -1,6 +1,6 @@ import os -import grids +# import grids import pandas as pd import plotly.graph_objects as go diff --git a/saber/cluster.py b/saber/cluster.py index 122e78b..cf6f5b7 100755 --- a/saber/cluster.py +++ b/saber/cluster.py @@ -17,7 +17,7 @@ def generate(workdir: str) -> None: """ - Creates trained kmeans model pickle files and plots of the results saved as png images + Trains DTW kmeans model pickle files and plots of the results saved as png images Args: workdir: path to the project directory @@ -27,9 +27,9 @@ def generate(workdir: str) -> None: """ best_fit = {} - for table in glob.glob(os.path.join(workdir, 'data_processed', '*.csv')): + for table in glob.glob(os.path.join(workdir, 'data_processed', '*_fdc_*.parquet.gzip')): # read the data and transform - time_series = pd.read_csv(table, index_col=0).dropna(axis=1) + time_series = pd.read_parquet(table) time_series = np.transpose(time_series.values) time_series = TimeSeriesScalerMeanVariance().fit_transform(time_series) @@ -37,7 +37,7 @@ def generate(workdir: str) -> None: dataset = os.path.splitext(os.path.basename(table))[0] inertia = {'number': [], 'inertia': []} - for num_cluster in range(2, 11): + for num_cluster in range(2, 16): # build the kmeans model km = TimeSeriesKMeans(n_clusters=num_cluster, random_state=0) km.fit_predict(time_series) @@ -50,6 +50,7 @@ def generate(workdir: str) -> None: # generate a plot of the clusters size = time_series.shape[1] fig = plt.figure(figsize=(30, 15), dpi=450) + fig.suptitle("Dynamic Time Warping Clustering") assigned_clusters = km.labels_ for i in range(num_cluster): plt.subplot(2, math.ceil(num_cluster / 2), i + 1) @@ -59,8 +60,6 @@ def generate(workdir: str) -> None: plt.xlim(0, size) plt.ylim(-3.5, 3.5) plt.text(0.55, 0.85, f'Cluster {i}', transform=plt.gca().transAxes) - if i == math.floor(num_cluster / 4): - plt.title("Euclidean $k$-means") plt.tight_layout() fig.savefig(os.path.join(workdir, 'kmeans_images', f'{dataset}-{num_cluster}-clusters.png')) plt.close(fig) diff --git a/saber/gis.py b/saber/gis.py index 5c494e8..40a2759 100644 --- a/saber/gis.py +++ b/saber/gis.py @@ -43,7 +43,7 @@ def generate_all(workdir: str, assign_table: pd.DataFrame, drain_shape: str, pre def clip_by_assignment(workdir: str, assign_table: pd.DataFrame, drain_shape: str, prefix: str = '', id_column: str = mid_col) -> None: """ - Creates geojsons (in workdir/gis_outputs) for each unique value in the assignment column + Creates Geopackage files in workdir/gis_outputs for 
each unique value in the assignment column Args: workdir: the path to the working directory for the project @@ -63,18 +63,18 @@ def clip_by_assignment(workdir: str, assign_table: pd.DataFrame, drain_shape: st for reason in set(assign_table[reason_col].dropna().values): ids = assign_table[assign_table[reason_col] == reason][mid_col].values subset = dl[dl[id_column].isin(ids)] - name = f'{prefix}{"_" if prefix else ""}assignments_{reason}.json' + name = f'{prefix}{"_" if prefix else ""}assignments_{reason}.gpkg' if subset.empty: continue else: - subset.to_file(os.path.join(save_dir, name), driver='GeoJSON') + subset.to_file(os.path.join(save_dir, name)) return def clip_by_cluster(workdir: str, assign_table: pd.DataFrame, drain_shape: str, prefix: str = '', id_column: str = mid_col) -> None: """ - Creates GIS files (in workdir/gis_outputs) of the drainage lines based on which fdc cluster they were assigned to + Creates Geopackage files in workdir/gis_outputs of the drainage lines based on the fdc cluster they were assigned to Args: workdir: the path to the working directory for the project @@ -90,19 +90,19 @@ def clip_by_cluster(workdir: str, assign_table: pd.DataFrame, drain_shape: str, cluster_types = [a for a in assign_table if 'cluster' in a] for ctype in cluster_types: for gnum in sorted(set(assign_table[ctype].dropna().values)): - savepath = os.path.join(workdir, 'gis_outputs', f'{prefix}{"_" if prefix else ""}{ctype}-{int(gnum)}.json') + savepath = os.path.join(workdir, 'gis_outputs', f'{prefix}{"_" if prefix else ""}{ctype}-{int(gnum)}.gpkg') ids = assign_table[assign_table[ctype] == gnum][mid_col].values if dl_gdf[dl_gdf[id_column].isin(ids)].empty: continue else: - dl_gdf[dl_gdf[id_column].isin(ids)].to_file(savepath, driver='GeoJSON') + dl_gdf[dl_gdf[id_column].isin(ids)].to_file(savepath) return def clip_by_unassigned(workdir: str, assign_table: pd.DataFrame, drain_shape: str, prefix: str = '', id_column: str = mid_col) -> None: """ - Creates geojsons (in workdir/gis_outputs) of the drainage lines which haven't been assigned a gauge yet + Creates Geopackage files in workdir/gis_outputs of the drainage lines which haven't been assigned a gauge yet Args: workdir: the path to the working directory for the project @@ -120,15 +120,15 @@ def clip_by_unassigned(workdir: str, assign_table: pd.DataFrame, drain_shape: st if subset.empty: warnings.warn('Empty filter: No streams are unassigned') return - savepath = os.path.join(workdir, 'gis_outputs', f'{prefix}{"_" if prefix else ""}assignments_unassigned.json') - subset.to_file(savepath, driver='GeoJSON') + savepath = os.path.join(workdir, 'gis_outputs', f'{prefix}{"_" if prefix else ""}assignments_unassigned.gpkg') + subset.to_file(savepath) return def clip_by_ids(workdir: str, ids: list, drain_shape: str, prefix: str = '', id_column: str = mid_col) -> None: """ - Creates geojsons (in workdir/gis_outputs) of the subset of 'drain_shape' with an ID in the specified list + Creates Geopackage files in workdir/gis_outputs of the subset of 'drain_shape' with an ID in the specified list Args: workdir: path to the project directory @@ -142,14 +142,14 @@ def clip_by_ids(workdir: str, ids: list, drain_shape: str, prefix: str = '', """ dl = gpd.read_file(drain_shape) save_dir = os.path.join(workdir, 'gis_outputs') - name = f'{prefix}{"_" if prefix else ""}id_subset.json' - dl[dl[id_column].isin(ids)].to_file(os.path.join(save_dir, name), driver='GeoJSON') + name = f'{prefix}{"_" if prefix else ""}id_subset.gpkg' + 
dl[dl[id_column].isin(ids)].to_file(os.path.join(save_dir, name)) return def validation_maps(workdir: str, gauge_shape: str, val_table: pd.DataFrame = None, prefix: str = '') -> None: """ - Creates geojsons (in workdir/gis_outputs) of subsets of the gauge_shape. + Creates Geopackage files in workdir/gis_outputs of subsets of the gauge_shape. 1 is the fill gauge shape with added attribute columns for all the computed stats. There are 2 for each of the 5 validation groups; 1 which shows the gauges included in the validation set and 1 which shows gauges that were excluded from the validation set. @@ -170,7 +170,7 @@ def validation_maps(workdir: str, gauge_shape: str, val_table: pd.DataFrame = No # merge gauge table with the validation table gdf = gpd.read_file(gauge_shape) gdf = gdf.merge(val_table, on=gid_col, how='inner') - gdf.to_file(os.path.join(save_dir, 'gauges_with_validation_stats.json'), driver='GeoJSON') + gdf.to_file(os.path.join(save_dir, 'gauges_with_validation_stats.gpkg')) core_columns = [mid_col, gid_col, 'geometry'] @@ -182,12 +182,12 @@ def validation_maps(workdir: str, gauge_shape: str, val_table: pd.DataFrame = No gdf_sub = gdf[cols_to_select] gdf_sub = gdf_sub.rename(columns={f'{metric}_{val_set}': metric}) - name = f'{prefix}{"_" if prefix else ""}valset_{val_set}_{metric}_included.json' - gdf_sub[gdf_sub[val_set] == 1].to_file(os.path.join(save_dir, name), driver='GeoJSON') + name = f'{prefix}{"_" if prefix else ""}valset_{val_set}_{metric}_included.gpkg' + gdf_sub[gdf_sub[val_set] == 1].to_file(os.path.join(save_dir, name)) - name = f'{prefix}{"_" if prefix else ""}valset_{val_set}_{metric}_excluded.json' + name = f'{prefix}{"_" if prefix else ""}valset_{val_set}_{metric}_excluded.gpkg' exc = gdf_sub[gdf_sub[val_set] == 0] - exc.to_file(os.path.join(save_dir, name), driver='GeoJSON') + exc.to_file(os.path.join(save_dir, name)) if metric == 'KGE2012': histomaps(exc, metric, val_set, workdir) @@ -195,6 +195,17 @@ def validation_maps(workdir: str, gauge_shape: str, val_table: pd.DataFrame = No def histomaps(gdf: gpd.GeoDataFrame, metric: str, prct: str, workdir: str) -> None: + """ + + Args: + gdf: + metric: + prct: + workdir: + + Returns: + + """ core_columns = [mid_col, gid_col, 'geometry'] # world = gpd.read_file(gpd.datasets.get_path('naturalearth_lowres')) # world.plot(ax=axm, color='white', edgecolor='black') diff --git a/saber/prep.py b/saber/prep.py index f08f795..3d09c6a 100755 --- a/saber/prep.py +++ b/saber/prep.py @@ -1,76 +1,92 @@ -import glob import os import grids import pandas as pd -import xarray as xr +import geopandas as gpd +import netCDF4 as nc +import numpy as np from .utils import compute_fdc from ._vocab import mid_col +from ._vocab import hindcast_archive +from ._vocab import hindcast_fdc_archive +from ._vocab import read_drain_table +from ._vocab import guess_hindcast_path +__all__ = ['gis_tables', 'hindcast', 'scaffold_workdir'] -def guess_hist_sim_path(workdir: str) -> str: - return glob.glob(os.path.join(workdir, 'data_inputs', '*.nc*'))[0] + +def gis_tables(workdir: str, gauge_gis: str = None, drain_gis: str = None) -> None: + """ + Generate copies of the drainage line attribute tables in parquet format using the Saber package vocabulary + + Args: + workdir: path to the working directory for the project + gauge_gis: path to the GIS dataset (e.g. geopackage) for the gauge locations (points) + drain_gis: path to the GIS dataset (e.g. 
geopackage) for the drainage line locations (polylines) + + Returns: + None + """ + if gauge_gis is not None: + gdf = gpd.read_file(gauge_gis).drop('geometry', axis=1) + gdf[['x', 'y']] = gdf.geometry.apply(lambda x: [x.x, x.y]) + pd.DataFrame(gdf.drop('geometry', axis=1)).to_parquet( + os.path.join(workdir, 'gis_inputs', 'gauge_table.parquet')) + if drain_gis is not None: + gdf = gpd.read_file(drain_gis).drop('geometry', axis=1) + gdf[['x', 'y']] = gdf.geometry.apply(lambda x: [x.x, x.y]) + pd.DataFrame(gdf.drop('geometry', axis=1)).to_parquet( + os.path.join(workdir, 'gis_inputs', 'drain_table.parquet')) + return -def historical_simulation(workdir: str, hist_nc_path: str = None) -> None: +def hindcast(workdir: str, hind_nc_path: str = None, ) -> None: """ - Creates sim-fdc.csv and sim_time_series.pickle in workdir/data_processed for geoglows historical simulation data + Creates sim-fdc.parquet and sim_time_series.parquet in workdir/data_processed for geoglows historical simulation data Args: workdir: path to the working directory for the project - hist_nc_path: path to the historical simulation netcdf if not located at workdir/data_simulated/*.nc + hind_nc_path: path to the hindcast or historical simulation netcdf if not located at workdir/data_simulated/*.nc Returns: None """ - if hist_nc_path is None: - hist_nc_path = guess_hist_sim_path(workdir) + if hind_nc_path is None: + hind_nc_path = guess_hindcast_path(workdir) # read the assignments table - drain_table = pd.read_csv(os.path.join(workdir, 'gis_inputs', 'drain_table.csv')) + drain_table = read_drain_table(workdir) model_ids = list(set(sorted(drain_table[mid_col].tolist()))) - # open the historical data netcdf file - hist_nc = xr.open_dataset(hist_nc_path) - - # start dataframes for the flow duration curve (fdc) and the monthly averages (ma) using the first comid in the list - first_id = model_ids.pop(0) - first_data = hist_nc.sel(rivid=first_id).Qout.to_dataframe()['Qout'] - fdc_df = compute_fdc(first_data.tolist(), col_name=first_id) - # ma_df = first_data.groupby(first_data.index.strftime('%m')).mean().to_frame(name=first_id) - - # for each remaining stream ID in the list, merge/append the fdc and ma with the previously created dataframes - for model_id in model_ids: - data = hist_nc.sel(rivid=model_id).Qout.to_dataframe()['Qout'] - fdc_df = fdc_df.merge(compute_fdc(data.tolist(), col_name=model_id), - how='outer', left_index=True, right_index=True) - - fdc_df.to_csv(os.path.join(workdir, 'data_processed', 'sim-fdc.csv')) - - # create the time series table of simulated flow values - coords = drain_table[[mid_col]] - coords.loc[:, 'time'] = None - coords = coords[['time', 'model_id']].values.tolist() - ts = grids.TimeSeries([hist_nc_path, ], 'Qout', ('time', 'rivid')) - ts = ts.multipoint(*coords) - ts.set_index('datetime', inplace=True) - ts.index = pd.to_datetime(ts.index, unit='s') - ts.columns = drain_table[mid_col].values.flatten() - ts.index = ts.index.tz_localize('UTC') - ts.to_pickle(os.path.join(workdir, 'data_processed', 'sim_time_series.pickle') -) + # read the hindcast netcdf and convert to dataframe + hnc = nc.Dataset(hind_nc_path) + ids = pd.Series(hnc['rivid'][:]) + ids_selector = ids.isin(model_ids) + df = pd.DataFrame( + hnc['Qout'][:, ids_selector], + columns=ids[ids_selector].astype(str), + index=pd.to_datetime(hnc.variables['time'][:], unit='s') + ) + df.index.name = 'datetime' + df.to_parquet(os.path.join(workdir, 'data_processed', 'hindcast_series_table.parquet.gzip'), compression='gzip') + + exceed_prob = 
np.linspace(0, 100, 401)[::-1] + df = df.apply(lambda x: np.transpose(np.nanpercentile(x, exceed_prob))) + df.index = exceed_prob + df.index.name = 'exceed_prob' + df.to_parquet(os.path.join(workdir, 'data_processed', 'hindcast_fdc_table.parquet.gzip'), compression='gzip') return def scaffold_workdir(path: str, include_validation: bool = True) -> None: """ - Creates the correct directories for an RBC project within the a specified directory + Creates the correct directories for a Saber project within the specified directory Args: path: the path to a directory where you want to create directories - include_validation: boolean, indicates whether or not to create the validation folder + include_validation: boolean, indicates whether to create the validation folder Returns: None diff --git a/saber/utils.py b/saber/utils.py index 7dd3b80..3732dc1 100644 --- a/saber/utils.py +++ b/saber/utils.py @@ -42,12 +42,15 @@ def compute_fdc(flows: np.array, steps: int = 500, exceed: bool = True, col_name return pd.DataFrame(flows, columns=[col_name, ], index=percentiles) -def compute_scalar_fdc(first_fdc, second_fdc): - first_fdc = compute_fdc(first_fdc) - second_fdc = compute_fdc(second_fdc) - ratios = np.divide(first_fdc['flow'].values.flatten(), second_fdc['flow'].values.flatten()) - columns = (first_fdc.columns[0], 'Scalars') - scalars_df = pd.DataFrame(np.transpose([first_fdc.values[:, 0], ratios]), columns=columns) +def compute_scalar_fdc(sim_fdc: pd.DataFrame, obs_fdc: pd.DataFrame): + sim_fdc = compute_fdc(sim_fdc) + obs_fdc = compute_fdc(obs_fdc) + ratios = np.divide(sim_fdc['flow'].values.flatten(), obs_fdc['flow'].values.flatten()) + columns = (sim_fdc.columns[0], 'p_exceed', 'scalars') + scalars_df = pd.DataFrame( + np.transpose([sim_fdc.values[:, 0], sim_fdc['p_exceed'].values.flatten(), ratios]), + columns=columns + ) scalars_df.replace(np.inf, np.nan, inplace=True) scalars_df.dropna(inplace=True) From 15c837e0e9e499acc005bebb32ec3db9326b1236 Mon Sep 17 00:00:00 2001 From: Riley Hales Date: Mon, 15 Aug 2022 11:57:15 -0600 Subject: [PATCH 02/24] improve paths --- saber/_calibrate.py | 4 ++-- saber/_vocab.py | 32 +++++++++++++++++++++++++------- saber/_workflow.py | 5 ----- saber/prep.py | 23 +++++++++++------------ saber/table.py | 17 ++++++++++------- 5 files changed, 48 insertions(+), 33 deletions(-) diff --git a/saber/_calibrate.py b/saber/_calibrate.py index 7ead055..4b848dd 100644 --- a/saber/_calibrate.py +++ b/saber/_calibrate.py @@ -15,7 +15,7 @@ from ._vocab import asgn_gid_col from ._vocab import metric_list from ._vocab import metric_nc_name_list -from ._vocab import hindcast_archive +from ._vocab import hindcast_table from ._vocab import cal_nc_name @@ -177,7 +177,7 @@ def calibrate_region(workdir: str, assign_table: pd.DataFrame, obs_data_dir = os.path.join(workdir, 'data_inputs', 'obs_csvs') bcs_nc_path = os.path.join(workdir, cal_nc_name) - ts = pd.read_pickle(os.path.join(workdir, 'data_processed', hindcast_archive)) + ts = pd.read_pickle(os.path.join(workdir, 'data_processed', hindcast_table)) # create the new netcdf bcs_nc = nc.Dataset(bcs_nc_path, 'w') diff --git a/saber/_vocab.py b/saber/_vocab.py index dcde03e..db471f4 100644 --- a/saber/_vocab.py +++ b/saber/_vocab.py @@ -14,16 +14,18 @@ order_col = 'stream_order' # scaffolded folders -gis_inp = 'gis_inputs' +tables = 'tables' +data_in = 'data_inputs' gis_out = 'gis_outputs' +kmeans_out = 'kmeans_outputs' # name of some files produced by the algorithm cluster_count_file = 'best-fit-cluster-count.json' cal_nc_name = 
'calibrated_simulated_flow.nc' -hindcast_archive = 'hindcast.parquet' -hindcast_fdc_archive = 'hindcast_fdc.parquet' -drain_table = 'drain_table.parquet' -gauge_table = 'gauge_table.parquet' +hindcast_table = 'hindcast_series_table.parquet.gzip' +hindcast_fdc_table = 'hindcast_fdc_table.parquet.gzip' +drain_table = 'drain_table.parquet.gzip' +gauge_table = 'gauge_table.parquet.gzip' # metrics computed on validation sets metric_list = ['ME', 'MAE', 'RMSE', 'MAPE', 'NSE', 'KGE (2012)'] @@ -31,8 +33,24 @@ def read_drain_table(workdir: str) -> pd.DataFrame: - return pd.read_parquet(os.path.join(workdir, gis_inp, drain_table)) + return pd.read_parquet(os.path.join(workdir, tables, drain_table)) + + +def read_gauge_table(workdir: str) -> pd.DataFrame: + return pd.read_parquet(os.path.join(workdir, tables, gauge_table)) + + +# todo: make everything in prep and other modules use this function for getting table paths +def get_table_path(workdir: str, table_name: str) -> str: + if table_name == 'hindcast_series': + return os.path.join(workdir, data_in, hindcast_table) + elif table_name == 'hindcast_fdc': + return os.path.join(workdir, data_in, hindcast_fdc_table) + elif table_name == 'drain_table': + return os.path.join(workdir, tables, drain_table) + elif table_name == 'gauge_table': + return os.path.join(workdir, tables, gauge_table) def guess_hindcast_path(workdir: str) -> str: - return glob.glob(os.path.join(workdir, 'data_inputs', '*.nc*'))[0] + return glob.glob(os.path.join(workdir, data_in, '*.nc*'))[0] diff --git a/saber/_workflow.py b/saber/_workflow.py index 0a97402..494b693 100644 --- a/saber/_workflow.py +++ b/saber/_workflow.py @@ -9,11 +9,6 @@ from ._calibrate import calibrate_region -def prep_region(workdir: str) -> None: - hindcast(workdir) - return - - def analyze_region(workdir: str, drain_shape: str, gauge_table: pd.DataFrame = None, obs_data_dir: str = None) -> None: # Generate the assignments table print("gen assign_table") diff --git a/saber/prep.py b/saber/prep.py index 3d09c6a..81478cd 100755 --- a/saber/prep.py +++ b/saber/prep.py @@ -9,8 +9,8 @@ from .utils import compute_fdc from ._vocab import mid_col -from ._vocab import hindcast_archive -from ._vocab import hindcast_fdc_archive +from ._vocab import hindcast_table +from ._vocab import hindcast_fdc_table from ._vocab import read_drain_table from ._vocab import guess_hindcast_path @@ -33,18 +33,19 @@ def gis_tables(workdir: str, gauge_gis: str = None, drain_gis: str = None) -> No gdf = gpd.read_file(gauge_gis).drop('geometry', axis=1) gdf[['x', 'y']] = gdf.geometry.apply(lambda x: [x.x, x.y]) pd.DataFrame(gdf.drop('geometry', axis=1)).to_parquet( - os.path.join(workdir, 'gis_inputs', 'gauge_table.parquet')) + os.path.join(workdir, 'tables', 'gauge_table.parquet.gzip'), compression='gzip') if drain_gis is not None: gdf = gpd.read_file(drain_gis).drop('geometry', axis=1) gdf[['x', 'y']] = gdf.geometry.apply(lambda x: [x.x, x.y]) pd.DataFrame(gdf.drop('geometry', axis=1)).to_parquet( - os.path.join(workdir, 'gis_inputs', 'drain_table.parquet')) + os.path.join(workdir, 'tables', 'drain_table.parquet.gzip'), compression='gzip') return def hindcast(workdir: str, hind_nc_path: str = None, ) -> None: """ - Creates sim-fdc.parquet and sim_time_series.parquet in workdir/data_processed for geoglows historical simulation data + Creates hindcast_series_table.parquet.gzip and hindcast_fdc_table.parquet.gzip in the workdir/tables directory + for the GEOGloWS hindcast data Args: workdir: path to the working directory for the project @@ 
-70,13 +71,13 @@ def hindcast(workdir: str, hind_nc_path: str = None, ) -> None: index=pd.to_datetime(hnc.variables['time'][:], unit='s') ) df.index.name = 'datetime' - df.to_parquet(os.path.join(workdir, 'data_processed', 'hindcast_series_table.parquet.gzip'), compression='gzip') + df.to_parquet(os.path.join(workdir, 'tables', 'hindcast_series_table.parquet.gzip'), compression='gzip') exceed_prob = np.linspace(0, 100, 401)[::-1] df = df.apply(lambda x: np.transpose(np.nanpercentile(x, exceed_prob))) df.index = exceed_prob df.index.name = 'exceed_prob' - df.to_parquet(os.path.join(workdir, 'data_processed', 'hindcast_fdc_table.parquet.gzip'), compression='gzip') + df.to_parquet(os.path.join(workdir, 'tables', 'hindcast_fdc_table.parquet.gzip'), compression='gzip') return @@ -93,12 +94,10 @@ def scaffold_workdir(path: str, include_validation: bool = True) -> None: """ if not os.path.isdir(path): os.mkdir(path) - os.mkdir(os.path.join(path, 'data_inputs')) - os.mkdir(os.path.join(path, 'data_processed')) - os.mkdir(os.path.join(path, 'gis_inputs')) + os.mkdir(os.path.join(path, 'tables')) + os.mkdir(os.path.join(path, 'inputs')) os.mkdir(os.path.join(path, 'gis_outputs')) - os.mkdir(os.path.join(path, 'kmeans_models')) - os.mkdir(os.path.join(path, 'kmeans_images')) + os.mkdir(os.path.join(path, 'kmeans_outputs')) if include_validation: os.mkdir(os.path.join(path, 'validation_runs')) return diff --git a/saber/table.py b/saber/table.py index 076da48..01d9c9b 100644 --- a/saber/table.py +++ b/saber/table.py @@ -21,7 +21,7 @@ def get_path(workdir: str) -> str: Returns: absolute path of the assign_table """ - return os.path.join(workdir, 'assign_table.csv') + return os.path.join(workdir, 'assign_table.parquet.gzip') def gen(workdir) -> pd.DataFrame: @@ -35,8 +35,8 @@ def gen(workdir) -> pd.DataFrame: None """ # read and merge the tables - drain_df = pd.read_csv(os.path.join(workdir, 'gis_inputs', 'drain_table.csv')) - gauge_df = pd.read_csv(os.path.join(workdir, 'gis_inputs', 'gauge_table.csv')) + drain_df = pd.read_parquet(os.path.join(workdir, 'tables', 'drain_table.parquet.gzip')) + gauge_df = pd.read_parquet(os.path.join(workdir, 'tables', 'gauge_table.parquet.gzip')) assign_table = pd.merge(drain_df, gauge_df, on=mid_col, how='outer') # create the new columns @@ -44,6 +44,9 @@ def gen(workdir) -> pd.DataFrame: assign_table[asgn_gid_col] = np.nan assign_table[reason_col] = np.nan + # cache the table + cache(workdir, assign_table) + return assign_table @@ -57,13 +60,13 @@ def read(workdir: str) -> pd.DataFrame: Returns: assign_table pandas.DataFrame """ - return pd.read_csv(get_path(workdir)) + return pd.read_parquet(get_path(workdir)) def cache(workdir: str, assign_table: pd.DataFrame) -> None: """ - Saves the pandas dataframe to a csv in the proper place in the project directory - A shortcut for pd.DataFrame.to_csv so you don't have to code it all the time + Saves the pandas dataframe to a parquet in the proper place in the project directory + A shortcut for pd.DataFrame.to_parquet so you don't have to code the paths every where Args: workdir: the project directory path @@ -72,6 +75,6 @@ def cache(workdir: str, assign_table: pd.DataFrame) -> None: Returns: None """ - assign_table.to_csv(get_path(workdir), index=False) + assign_table.to_parquet(get_path(workdir), compression='gzip') return From 8b96df341e9955dcea551d8e5a047863c4a1a14c Mon Sep 17 00:00:00 2001 From: rileyhales Date: Thu, 18 Aug 2022 14:56:42 -0600 Subject: [PATCH 03/24] misc code clean up --- conda-environment.yml | 7 +++++- 
environment.yml | 16 ------------- saber/__init__.py | 4 ++-- saber/_vocab.py | 5 ++-- saber/analysis.py | 1 - saber/cluster.py | 53 ++++++++++++++++++++++++------------------- saber/gis.py | 2 +- saber/prep.py | 13 ++++------- setup.py | 2 +- 9 files changed, 47 insertions(+), 56 deletions(-) delete mode 100644 environment.yml diff --git a/conda-environment.yml b/conda-environment.yml index ff491bd..ec661ed 100644 --- a/conda-environment.yml +++ b/conda-environment.yml @@ -7,9 +7,14 @@ dependencies: - geopandas - pandas - numpy + - numba - contextily - - xarray - netCDF4 - tslearn - pyarrow - kneed + - tslearn + - matplotlib + - geoglows + - requests + - scipy diff --git a/environment.yml b/environment.yml deleted file mode 100644 index f58ed11..0000000 --- a/environment.yml +++ /dev/null @@ -1,16 +0,0 @@ -name: bias -channels: - - conda-forge -dependencies: - - python>=3 - - pandas - - geopandas - - numpy - - xarray - - tslearn - - matplotlib - - xarray - - geoglows - - requests - - plotly - - scipy \ No newline at end of file diff --git a/saber/__init__.py b/saber/__init__.py index 8d13a4f..3024c69 100644 --- a/saber/__init__.py +++ b/saber/__init__.py @@ -1,4 +1,4 @@ -from saber._workflow import prep_region, analyze_region +from saber._workflow import analyze_region from saber._calibrate import calibrate, calibrate_region import saber.table @@ -13,7 +13,7 @@ import saber._vocab -__all__ = ['prep_region', 'analyze_region', +__all__ = ['analyze_region', 'calibrate', 'calibrate_region', 'table', 'prep', 'assign', 'gis', 'cluster', 'utils', 'validate', 'analysis'] __author__ = 'Riley Hales' diff --git a/saber/_vocab.py b/saber/_vocab.py index db471f4..44c4f3c 100644 --- a/saber/_vocab.py +++ b/saber/_vocab.py @@ -40,12 +40,11 @@ def read_gauge_table(workdir: str) -> pd.DataFrame: return pd.read_parquet(os.path.join(workdir, tables, gauge_table)) -# todo: make everything in prep and other modules use this function for getting table paths def get_table_path(workdir: str, table_name: str) -> str: if table_name == 'hindcast_series': - return os.path.join(workdir, data_in, hindcast_table) + return os.path.join(workdir, tables, hindcast_table) elif table_name == 'hindcast_fdc': - return os.path.join(workdir, data_in, hindcast_fdc_table) + return os.path.join(workdir, tables, hindcast_fdc_table) elif table_name == 'drain_table': return os.path.join(workdir, tables, drain_table) elif table_name == 'gauge_table': diff --git a/saber/analysis.py b/saber/analysis.py index 56a0f40..582fa3d 100644 --- a/saber/analysis.py +++ b/saber/analysis.py @@ -1,6 +1,5 @@ import os -# import grids import pandas as pd import plotly.graph_objects as go diff --git a/saber/cluster.py b/saber/cluster.py index cf6f5b7..2bfb125 100755 --- a/saber/cluster.py +++ b/saber/cluster.py @@ -7,7 +7,7 @@ import matplotlib.pyplot as plt import numpy as np import pandas as pd -from tslearn.clustering import TimeSeriesKMeans +from tslearn.clustering import KShape from tslearn.preprocessing import TimeSeriesScalerMeanVariance from ._vocab import cluster_count_file @@ -17,7 +17,7 @@ def generate(workdir: str) -> None: """ - Trains DTW kmeans model pickle files and plots of the results saved as png images + Trains kmeans clustering models, saves the model as pickle, generates images and supplementary files Args: workdir: path to the project directory @@ -27,51 +27,58 @@ def generate(workdir: str) -> None: """ best_fit = {} - for table in glob.glob(os.path.join(workdir, 'data_processed', '*_fdc_*.parquet.gzip')): + for table in 
glob.glob(os.path.join(workdir, 'tables', '*_fdc_*.parquet.gzip')): # read the data and transform time_series = pd.read_parquet(table) time_series = np.transpose(time_series.values) time_series = TimeSeriesScalerMeanVariance().fit_transform(time_series) # get the name of the data and start tracking stats - dataset = os.path.splitext(os.path.basename(table))[0] - inertia = {'number': [], 'inertia': []} + dataset = os.path.basename(table).split('.')[0] + inertia = {'number': [], 'inertia': [], 'n_iter': []} - for num_cluster in range(2, 16): + for n_clusters in range(1, 17): + print(n_clusters) # build the kmeans model - km = TimeSeriesKMeans(n_clusters=num_cluster, random_state=0) - km.fit_predict(time_series) - inertia['number'].append(num_cluster) - inertia['inertia'].append(km.inertia_) + ks = KShape(n_clusters=n_clusters, random_state=0, max_iter=150) + ks.fit_predict(time_series) + inertia['number'].append(n_clusters) + inertia['inertia'].append(ks.inertia_) + inertia['n_iter'].append(ks.n_iter_) # save the trained model - km.to_pickle(os.path.join(workdir, 'kmeans_models', f'{dataset}-{num_cluster}-clusters-model.pickle')) + ks.to_pickle(os.path.join(workdir, 'kmeans_outputs', f'{dataset}-kshape-{n_clusters}_clusters-model.pickle')) # generate a plot of the clusters size = time_series.shape[1] - fig = plt.figure(figsize=(30, 15), dpi=450) - fig.suptitle("Dynamic Time Warping Clustering") - assigned_clusters = km.labels_ - for i in range(num_cluster): - plt.subplot(2, math.ceil(num_cluster / 2), i + 1) + # up to 3 cols, rows determined by number of columns + n_cols = min(n_clusters, 3) + n_rows = math.ceil(n_clusters / n_cols) + img_size = 2.5 + fig = plt.figure(figsize=(img_size * n_cols, img_size * n_rows), dpi=750) + + fig.suptitle("KShape Clustering") + assigned_clusters = ks.labels_ + for i in range(n_clusters): + plt.subplot(n_rows, n_cols, i + 1) for j in time_series[assigned_clusters == i]: plt.plot(j.ravel(), "k-", alpha=.2) - plt.plot(km.cluster_centers_[i].ravel(), "r-") + plt.plot(ks.cluster_centers_[i].ravel(), "r-") plt.xlim(0, size) - plt.ylim(-3.5, 3.5) - plt.text(0.55, 0.85, f'Cluster {i}', transform=plt.gca().transAxes) + plt.ylim(-2, 4) + plt.text(0.55, 0.85, f'Cluster {i + 1}', transform=plt.gca().transAxes) plt.tight_layout() - fig.savefig(os.path.join(workdir, 'kmeans_images', f'{dataset}-{num_cluster}-clusters.png')) + fig.savefig(os.path.join(workdir, 'kmeans_outputs', f'{dataset}-kshape-{n_clusters}_clusters.png')) plt.close(fig) # save the inertia results as a csv - pd.DataFrame.from_dict(inertia).to_csv(os.path.join(workdir, 'kmeans_models', f'{dataset}-inertia.csv')) + pd.DataFrame.from_dict(inertia).to_csv(os.path.join(workdir, 'kmeans_outputs', f'{dataset}-inertia.csv')) # find the knee/elbow knee = kneed.KneeLocator(inertia['number'], inertia['inertia'], curve='convex', direction='decreasing').knee best_fit[dataset] = int(knee) # save the best fitting cluster counts to a csv - with open(os.path.join(workdir, 'kmeans_models', cluster_count_file), 'w') as f: + with open(os.path.join(workdir, 'kmeans_outputs', cluster_count_file), 'w') as f: f.write(json.dumps(best_fit)) return @@ -102,7 +109,7 @@ def summarize(workdir: str, assign_table: pd.DataFrame) -> pd.DataFrame: # open the optimal model pickle file optimal_model = os.path.join(workdir, 'kmeans_models', f'{dataset}-{cluster_count}-clusters-model.pickle') - sim_labels = TimeSeriesKMeans.from_pickle(optimal_model).labels_.tolist() + sim_labels = KShape.from_pickle(optimal_model).labels_.tolist() # create 
a dataframe of the ids and their labels (assigned groups) df = pd.DataFrame(np.transpose([sim_labels, ids]), columns=[f'{dataset}-cluster', merge_col]) diff --git a/saber/gis.py b/saber/gis.py index 40a2759..78512f1 100644 --- a/saber/gis.py +++ b/saber/gis.py @@ -28,7 +28,7 @@ def generate_all(workdir: str, assign_table: pd.DataFrame, drain_shape: str, pre workdir: the path to the working directory for the project assign_table: the assign_table dataframe drain_shape: path to a drainage line shapefile which can be clipped - prefix: a prefix for names of the outputs to distinguish between data generated at separate instances + prefix: a prefix for names of the outputs to distinguish between data generated in separate instances id_column: name of the id column in the attributes of the shape table Returns: diff --git a/saber/prep.py b/saber/prep.py index 81478cd..1de954d 100755 --- a/saber/prep.py +++ b/saber/prep.py @@ -1,18 +1,14 @@ import os -import grids import pandas as pd import geopandas as gpd import netCDF4 as nc import numpy as np -from .utils import compute_fdc - from ._vocab import mid_col -from ._vocab import hindcast_table -from ._vocab import hindcast_fdc_table from ._vocab import read_drain_table from ._vocab import guess_hindcast_path +from ._vocab import get_table_path __all__ = ['gis_tables', 'hindcast', 'scaffold_workdir'] @@ -70,14 +66,15 @@ def hindcast(workdir: str, hind_nc_path: str = None, ) -> None: columns=ids[ids_selector].astype(str), index=pd.to_datetime(hnc.variables['time'][:], unit='s') ) + df = df[df.index.year >= 1980] df.index.name = 'datetime' - df.to_parquet(os.path.join(workdir, 'tables', 'hindcast_series_table.parquet.gzip'), compression='gzip') + df.to_parquet(get_table_path(workdir, 'hindcast_series'), compression='gzip') - exceed_prob = np.linspace(0, 100, 401)[::-1] + exceed_prob = np.linspace(0, 100, 201)[::-1] df = df.apply(lambda x: np.transpose(np.nanpercentile(x, exceed_prob))) df.index = exceed_prob df.index.name = 'exceed_prob' - df.to_parquet(os.path.join(workdir, 'tables', 'hindcast_fdc_table.parquet.gzip'), compression='gzip') + df.to_parquet(get_table_path(workdir, 'hindcast_fdc'), compression='gzip') return diff --git a/setup.py b/setup.py index 4949ea8..e04a608 100644 --- a/setup.py +++ b/setup.py @@ -32,4 +32,4 @@ ], python_requires=">=3", install_requires=install_requires -) \ No newline at end of file +) From 98c0e18e8d38ba5e423463973ea86af9084dea1c Mon Sep 17 00:00:00 2001 From: rileyhales Date: Thu, 18 Aug 2022 14:57:26 -0600 Subject: [PATCH 04/24] increment version number --- saber/__init__.py | 2 +- setup.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/saber/__init__.py b/saber/__init__.py index 3024c69..0303a83 100644 --- a/saber/__init__.py +++ b/saber/__init__.py @@ -17,5 +17,5 @@ 'calibrate', 'calibrate_region', 'table', 'prep', 'assign', 'gis', 'cluster', 'utils', 'validate', 'analysis'] __author__ = 'Riley Hales' -__version__ = '0.3.0' +__version__ = '0.4.0' __license__ = 'BSD 3 Clause Clear' diff --git a/setup.py b/setup.py index e04a608..6ef3807 100644 --- a/setup.py +++ b/setup.py @@ -7,7 +7,7 @@ install_requires = req.read().splitlines() description = 'tools for hydrological bias correction on large models' -version = '0.3.0' +version = '0.4.0' setup( name='saber', From b3d028a56d010f9451b015db6cf3dfe24066284a Mon Sep 17 00:00:00 2001 From: rileyhales Date: Thu, 18 Aug 2022 15:00:12 -0600 Subject: [PATCH 05/24] rename distribution saberbc --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 
deletion(-) diff --git a/setup.py b/setup.py index 6ef3807..546cf3b 100644 --- a/setup.py +++ b/setup.py @@ -10,7 +10,7 @@ version = '0.4.0' setup( - name='saber', + name='saberbc', packages=['saber'], version=version, description=description, From 61c41f3fd2abcf9805018b156d0b9ab11f4c122a Mon Sep 17 00:00:00 2001 From: rileyhales Date: Sat, 20 Aug 2022 21:20:04 -0600 Subject: [PATCH 06/24] partial commit --- README.md | 76 +++++++++++++---------------- requirements.txt | 19 ++++---- saber/cluster.py | 122 +++++++++++++++++++++++------------------------ saber/prep.py | 20 ++++++-- setup.py | 2 +- 5 files changed, 119 insertions(+), 120 deletions(-) diff --git a/README.md b/README.md index dfde9bd..9951d5f 100755 --- a/README.md +++ b/README.md @@ -1,69 +1,57 @@ # Stream Analysis for Bias Estimation and Reduction ## Theory -Basins and streams will be used interchangeably to refer to the specific stream subunit. - -Large scale hydrologic models are generally not strongly physically based because of the extreme amounts of input data, -computing power, and computing time that would be required to run it. The model uses the results of other physically based -models. This is advantageous to us because we rely on the land surface model's behavior of mapping basins with similar -hydrologically important geophysical physical characteristics to similar discharge forecasts. - -This method relies on spatial analysis, machine learning, and statistical refinements. This is a post processing step which -makes it applicable to many models when the source data, code, or other inputs make the model inaccessible or impractical -to tweak. We do this in an attempt to avoid requiring the source model data or the ability to run the model. -Both of those conditions are not always available or practical when dealing with large scale models or datasets. +Large scale hydrological models are often biased despite the best calibration efforts. Bias correction is a pre- or +post-processing step applied to the model's inputs or outputs to empirically alter datasets and improve model performance. +SABER is a bias correction post processor for discharge predictions. SABER builds on frequency matching ideas used in several +other methods that match biased predictions with observed data with corresponding exceedance probabilities; that is, +using the flow duration curve. SABER expands this approach to ungauged stream reaches using spatial analysis, clustering +(machine learning), and statistical refinements and a new concept- the scalar flow duration curve. ## Python environment See requirements.txt -- python >= 3.7 -- numpy -- pandas -- geopandas (https://github.com/geopandas/geopandas) -- scikit-learn -- tslearn (https://github.com/tslearn-team/tslearn) ## Required Data/Inputs -You need the following data to follow this procedure. The instructions list Shapefile but other vector spatial data -file formats are acceptable -1. Shapefiles for the Drainage Lines (lines) and Catchments (polygons) in the watershed as used by the hydrological model. - The attribute table should contain (at least) the following entries for each feature: - - (Both) An identifier number - - (Both) The ID of the next segment/catchment downstream - - (Drainage lines, preferably both) The stream order of each segment - - (Catchments, preferably both) Cumulative upstream drainage area - - (Drainage lines) the x coordinate of the centroid of each feature - - (Drainage lines) the y coordinate of the centroid of each feature -2. 
Shapefile of points representing the location of each of the river gauging stations available. - The attribute table should contain (at least) the following entries for each point - - A unique ID number or name assigned to the gauge, preferably numeric. Randomly generate unique numeric ID's if they don't exist. +You need the following data to follow this procedure. Geopackage format GIS data may be substituted with Shapefile or +equivalent formats that can be read by `geopandas`. +1. Geopackage drainage Lines (usually delineated center lines) and catchments/subbasins (polygons) in the watershed. The + attribute tables for both should contain (at least) the following entries for each feature: + - An identifier column (alphanumeric) labeled `model_id`. + - The ID of the next downstream reach/subbasin + - The stream order of each reach/subbasin + - Cumulative upstream drainage area + - The x coordinate of the centroid of each feature (precalculated for faster results later) + - The y coordinate of the centroid of each feature (precalculated for faster results later) +2. Geopackage points representing the location of each of the river gauging stations available. The attribute table + should contain (at least) the following entries for each point + - A unique ID number or name assigned to the gauge, preferably short alphanumeric. You can randomly generate them if convenient - The ID of the stream segment in the model which corresponds to that gauge. -3. Shapefile of the boundary (polygon) of the target region for bias calibration. -4. The Shapefiles for the Drainage Lines, Catchments, Gauges/Stations, and boundary are all in the same projection. -5. Historical simulated discharge for each stream segment and for as long (temporally) as is available. -6. Observed discharge data for as many stream reaches as possible within the target region. -7. The units of the simulation and observation data must be in the same units. -8. A working directory on the computer where the scripts are going to be run. +3. Hindcast/historical simulation discharge for each stream reach +4. Observed discharge data for each gauge + +Things to check when preparing your data +1. The units of the simulated and observed data are in the same units +2. The GIS datasets are all in the same projection. +3. The GIS datasets should only contain gauges and reaches/subbasins within the area of interest. Clip/delete anything + else. You might find it helpful to keep a watershed boundary geopackage. ## Process ### 1 Create a Working Directory ```python -import saber as saber +import saber -path_to_working_directory = '/my/file/path' -saber.prep.scaffold_workdir(path_to_working_directory) +workdir = '/my/file/path' +saber.prep.scaffold_workdir(workdir) ``` Your working directory should exactly like this. 
``` working_directory - kmeans_models/ - kmeans_images/ data_inputs/ - data_processed/ - gis_inputs/ + tables/ + kmeans_outputs/ gis_outputs/ - validation_sets/ ``` ### 2 Prepare Spatial Data (scripts not provided) diff --git a/requirements.txt b/requirements.txt index 4c71968..b833ca8 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,17 +1,14 @@ -pandas +python=3 geopandas +pandas numpy -xarray +numba +contextily netCDF4 tslearn +pyarrow +kneed +tslearn matplotlib -geoglows -grids requests -plotly -h5py -scipy -hydrostats -kneed -hydrostats -matplotlib \ No newline at end of file +scipy \ No newline at end of file diff --git a/saber/cluster.py b/saber/cluster.py index 2bfb125..0836748 100755 --- a/saber/cluster.py +++ b/saber/cluster.py @@ -8,7 +8,6 @@ import numpy as np import pandas as pd from tslearn.clustering import KShape -from tslearn.preprocessing import TimeSeriesScalerMeanVariance from ._vocab import cluster_count_file from ._vocab import mid_col @@ -25,82 +24,83 @@ def generate(workdir: str) -> None: Returns: None """ - best_fit = {} - - for table in glob.glob(os.path.join(workdir, 'tables', '*_fdc_*.parquet.gzip')): - # read the data and transform - time_series = pd.read_parquet(table) - time_series = np.transpose(time_series.values) - time_series = TimeSeriesScalerMeanVariance().fit_transform(time_series) - - # get the name of the data and start tracking stats - dataset = os.path.basename(table).split('.')[0] - inertia = {'number': [], 'inertia': [], 'n_iter': []} - - for n_clusters in range(1, 17): - print(n_clusters) - # build the kmeans model - ks = KShape(n_clusters=n_clusters, random_state=0, max_iter=150) - ks.fit_predict(time_series) - inertia['number'].append(n_clusters) - inertia['inertia'].append(ks.inertia_) - inertia['n_iter'].append(ks.n_iter_) - - # save the trained model - ks.to_pickle(os.path.join(workdir, 'kmeans_outputs', f'{dataset}-kshape-{n_clusters}_clusters-model.pickle')) - - # generate a plot of the clusters - size = time_series.shape[1] - # up to 3 cols, rows determined by number of columns - n_cols = min(n_clusters, 3) - n_rows = math.ceil(n_clusters / n_cols) - img_size = 2.5 - fig = plt.figure(figsize=(img_size * n_cols, img_size * n_rows), dpi=750) - - fig.suptitle("KShape Clustering") - assigned_clusters = ks.labels_ - for i in range(n_clusters): - plt.subplot(n_rows, n_cols, i + 1) - for j in time_series[assigned_clusters == i]: - plt.plot(j.ravel(), "k-", alpha=.2) - plt.plot(ks.cluster_centers_[i].ravel(), "r-") - plt.xlim(0, size) - plt.ylim(-2, 4) - plt.text(0.55, 0.85, f'Cluster {i + 1}', transform=plt.gca().transAxes) - plt.tight_layout() - fig.savefig(os.path.join(workdir, 'kmeans_outputs', f'{dataset}-kshape-{n_clusters}_clusters.png')) - plt.close(fig) - - # save the inertia results as a csv - pd.DataFrame.from_dict(inertia).to_csv(os.path.join(workdir, 'kmeans_outputs', f'{dataset}-inertia.csv')) - # find the knee/elbow - knee = kneed.KneeLocator(inertia['number'], inertia['inertia'], curve='convex', direction='decreasing').knee - best_fit[dataset] = int(knee) - - # save the best fitting cluster counts to a csv - with open(os.path.join(workdir, 'kmeans_outputs', cluster_count_file), 'w') as f: - f.write(json.dumps(best_fit)) + labels = [] + inertia = {'number': [], 'inertia': [], 'n_iter': []} + + # read the prepared data (array x) + x = pd.read_parquet(os.path.join(workdir, 'tables', '')) + + for n_clusters in range(1, 17): + print(n_clusters) + # build the kmeans model + ks = KShape(n_clusters=n_clusters, random_state=0, 
max_iter=150) + ks.fit_predict(x) + inertia['number'].append(n_clusters) + inertia['inertia'].append(ks.inertia_) + inertia['n_iter'].append(ks.n_iter_) + labels.append(ks.labels_) + + # save the inertia results as a csv + pd.DataFrame.from_dict(inertia).to_parquet( + os.path.join(workdir, 'kmeans_outputs', f'{dataset}-inertia.parquet.gzip'), compression='gzip') + + df = pd.DataFrame(labels, columns=pd.read_parquet( + os.path.join(workdir, 'tables', 'model_ids.parquet.gzip')).values.flatten()) + df['count'] = df.nunique(axis=1) + df = df.sort_values('count') + df = df.set_index('count') + df.to_parquet(os.path.join(workdir, 'kmeans_outputs', 'cluster_num_summary.parquet.gzip')) + + # # find the knee/elbow + # knee = kneed.KneeLocator(inertia['number'], inertia['inertia'], curve='convex', direction='decreasing').knee + # best_fit[dataset] = int(knee) + + # # save the best fitting cluster counts to a csv + # with open(os.path.join(workdir, 'kmeans_outputs', cluster_count_file), 'w') as f: + # f.write(json.dumps(best_fit)) return -def summarize(workdir: str, assign_table: pd.DataFrame) -> pd.DataFrame: +def plot(workdir: str): + # generate a plot of the clusters + size = time_series.shape[1] + # up to 3 cols, rows determined by number of columns + n_cols = min(n_clusters, 3) + n_rows = math.ceil(n_clusters / n_cols) + img_size = 2.5 + fig = plt.figure(figsize=(img_size * n_cols, img_size * n_rows), dpi=750) + + fig.suptitle("KShape Clustering") + assigned_clusters = ks.labels_ + for i in range(n_clusters): + plt.subplot(n_rows, n_cols, i + 1) + for j in time_series[assigned_clusters == i]: + plt.plot(j.ravel(), "k-", alpha=.2) + plt.plot(ks.cluster_centers_[i].ravel(), "r-") + plt.xlim(0, size) + plt.ylim(-2, 4) + plt.text(0.55, 0.85, f'Cluster {i + 1}', transform=plt.gca().transAxes) + plt.tight_layout() + fig.savefig(os.path.join(workdir, 'kmeans_outputs', f'{dataset}-kshape-{n_clusters}_clusters.png')) + plt.close(fig) + return + + +def summarize(workdir: str) -> pd.DataFrame: """ Creates a csv listing the streams assigned to each cluster in workdir/kmeans_models and also adds that information to assign_table.csv Args: workdir: path to the project directory - assign_table: the assign_table dataframe Returns: None """ # read the cluster results csv - with open(os.path.join(workdir, 'kmeans_models', cluster_count_file), 'r') as f: + with open(os.path.join(workdir, 'kmeans_outputs', cluster_count_file), 'r') as f: clusters = json.loads(f.read()) - assign_table[mid_col] = assign_table[mid_col].astype(int) - for dataset, cluster_count in clusters.items(): # read the list of simulated id's, pair them with their cluster label, save to df merge_col = mid_col if "sim" in dataset else gid_col diff --git a/saber/prep.py b/saber/prep.py index 1de954d..56df8d7 100755 --- a/saber/prep.py +++ b/saber/prep.py @@ -4,6 +4,7 @@ import geopandas as gpd import netCDF4 as nc import numpy as np +from tslearn.preprocessing import TimeSeriesScalerMeanVariance from ._vocab import mid_col from ._vocab import read_drain_table @@ -27,7 +28,6 @@ def gis_tables(workdir: str, gauge_gis: str = None, drain_gis: str = None) -> No """ if gauge_gis is not None: gdf = gpd.read_file(gauge_gis).drop('geometry', axis=1) - gdf[['x', 'y']] = gdf.geometry.apply(lambda x: [x.x, x.y]) pd.DataFrame(gdf.drop('geometry', axis=1)).to_parquet( os.path.join(workdir, 'tables', 'gauge_table.parquet.gzip'), compression='gzip') if drain_gis is not None: @@ -57,24 +57,38 @@ def hindcast(workdir: str, hind_nc_path: str = None, ) -> None: 
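# Illustrative, standalone sketch of the flow duration curve (FDC) preparation idea in prep.hindcast:
# evaluate a long discharge series on a fixed grid of percentiles, then standardize the result so
# rivers of very different sizes can be clustered together. `discharge` and all names below are
# made up for the example and are not part of the package.
import numpy as np
import pandas as pd

rng = np.random.default_rng(0)
discharge = pd.Series(rng.gamma(shape=2.0, scale=50.0, size=365 * 40))  # ~40 years of daily flow

exceed_prob = np.linspace(0, 100, 101)[::-1]                # probability grid from 100 down to 0
fdc = pd.Series(np.nanpercentile(discharge, exceed_prob),   # flow percentiles along that grid
                index=exceed_prob, name='flow')

fdc_z = (fdc - fdc.mean()) / fdc.std()                      # zero mean, unit variance for clustering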
drain_table = read_drain_table(workdir) model_ids = list(set(sorted(drain_table[mid_col].tolist()))) - # read the hindcast netcdf and convert to dataframe + # read the hindcast netcdf, convert to dataframe, store as parquet hnc = nc.Dataset(hind_nc_path) ids = pd.Series(hnc['rivid'][:]) ids_selector = ids.isin(model_ids) + ids = ids[ids_selector].astype(str).values.flatten() + + # save the model ids to table for reference + pd.DataFrame(ids, columns=['model_id', ]).to_parquet( + os.path.join(workdir, 'tables', 'model_ids.parquet.gzip'), compression='gzip') + + # save the hindcast series to parquet df = pd.DataFrame( hnc['Qout'][:, ids_selector], - columns=ids[ids_selector].astype(str), + columns=ids, index=pd.to_datetime(hnc.variables['time'][:], unit='s') ) df = df[df.index.year >= 1980] df.index.name = 'datetime' df.to_parquet(get_table_path(workdir, 'hindcast_series'), compression='gzip') + # calculate the FDC and save to parquet exceed_prob = np.linspace(0, 100, 201)[::-1] df = df.apply(lambda x: np.transpose(np.nanpercentile(x, exceed_prob))) df.index = exceed_prob df.index.name = 'exceed_prob' df.to_parquet(get_table_path(workdir, 'hindcast_fdc'), compression='gzip') + + # transform and prepare for clustering + df = pd.DataFrame(TimeSeriesScalerMeanVariance().fit_transform(np.transpose(df.values))) + df.index = ids + df.to_parquet(os.path.join(workdir, 'tables', 'hindcast_fdc_transformed.parquet.gzip'), compression='gzip') + return diff --git a/setup.py b/setup.py index 546cf3b..6ef3807 100644 --- a/setup.py +++ b/setup.py @@ -10,7 +10,7 @@ version = '0.4.0' setup( - name='saberbc', + name='saber', packages=['saber'], version=version, description=description, From f62ee080dac1e10e61dd89bdface98d8044b9d41 Mon Sep 17 00:00:00 2001 From: Riley Hales Date: Sat, 20 Aug 2022 22:09:56 -0600 Subject: [PATCH 07/24] split clustering functions --- saber/cluster.py | 88 +++++++++++++++++++++--------------------------- saber/prep.py | 2 +- 2 files changed, 40 insertions(+), 50 deletions(-) diff --git a/saber/cluster.py b/saber/cluster.py index 0836748..4827c8b 100755 --- a/saber/cluster.py +++ b/saber/cluster.py @@ -1,4 +1,3 @@ -import glob import math import os import json @@ -7,7 +6,7 @@ import matplotlib.pyplot as plt import numpy as np import pandas as pd -from tslearn.clustering import KShape +from tslearn.clustering import TimeSeriesKMeans from ._vocab import cluster_count_file from ._vocab import mid_col @@ -24,65 +23,29 @@ def generate(workdir: str) -> None: Returns: None """ - labels = [] inertia = {'number': [], 'inertia': [], 'n_iter': []} # read the prepared data (array x) - x = pd.read_parquet(os.path.join(workdir, 'tables', '')) + x = pd.read_parquet(os.path.join(workdir, 'tables', 'hindcast_fdc_transformed.parquet.gzip')) + # build the kmeans model for a range of cluster numbers for n_clusters in range(1, 17): - print(n_clusters) - # build the kmeans model - ks = KShape(n_clusters=n_clusters, random_state=0, max_iter=150) + ks = TimeSeriesKMeans(n_clusters=n_clusters, max_iter=150) ks.fit_predict(x) + ks.to_pickle(os.path.join(workdir, 'kmeans_outputs', f'kshape-{n_clusters}.pickle')) inertia['number'].append(n_clusters) inertia['inertia'].append(ks.inertia_) inertia['n_iter'].append(ks.n_iter_) - labels.append(ks.labels_) # save the inertia results as a csv - pd.DataFrame.from_dict(inertia).to_parquet( - os.path.join(workdir, 'kmeans_outputs', f'{dataset}-inertia.parquet.gzip'), compression='gzip') - - df = pd.DataFrame(labels, columns=pd.read_parquet( - os.path.join(workdir, 
'tables', 'model_ids.parquet.gzip')).values.flatten()) - df['count'] = df.nunique(axis=1) - df = df.sort_values('count') - df = df.set_index('count') - df.to_parquet(os.path.join(workdir, 'kmeans_outputs', 'cluster_num_summary.parquet.gzip')) - - # # find the knee/elbow - # knee = kneed.KneeLocator(inertia['number'], inertia['inertia'], curve='convex', direction='decreasing').knee - # best_fit[dataset] = int(knee) - - # # save the best fitting cluster counts to a csv - # with open(os.path.join(workdir, 'kmeans_outputs', cluster_count_file), 'w') as f: - # f.write(json.dumps(best_fit)) - return - + pd.DataFrame.from_dict(inertia).to_parquet(os.path.join(workdir, 'kmeans_outputs', f'cluster-inertia.csv')) -def plot(workdir: str): - # generate a plot of the clusters - size = time_series.shape[1] - # up to 3 cols, rows determined by number of columns - n_cols = min(n_clusters, 3) - n_rows = math.ceil(n_clusters / n_cols) - img_size = 2.5 - fig = plt.figure(figsize=(img_size * n_cols, img_size * n_rows), dpi=750) + # find the knee/elbow + knee = kneed.KneeLocator(inertia['number'], inertia['inertia'], curve='convex', direction='decreasing').knee - fig.suptitle("KShape Clustering") - assigned_clusters = ks.labels_ - for i in range(n_clusters): - plt.subplot(n_rows, n_cols, i + 1) - for j in time_series[assigned_clusters == i]: - plt.plot(j.ravel(), "k-", alpha=.2) - plt.plot(ks.cluster_centers_[i].ravel(), "r-") - plt.xlim(0, size) - plt.ylim(-2, 4) - plt.text(0.55, 0.85, f'Cluster {i + 1}', transform=plt.gca().transAxes) - plt.tight_layout() - fig.savefig(os.path.join(workdir, 'kmeans_outputs', f'{dataset}-kshape-{n_clusters}_clusters.png')) - plt.close(fig) + # save the best fitting cluster counts to a csv + with open(os.path.join(workdir, 'kmeans_outputs', cluster_count_file), 'w') as f: + f.write(json.dumps({'historical': int(knee)})) return @@ -109,7 +72,7 @@ def summarize(workdir: str) -> pd.DataFrame: # open the optimal model pickle file optimal_model = os.path.join(workdir, 'kmeans_models', f'{dataset}-{cluster_count}-clusters-model.pickle') - sim_labels = KShape.from_pickle(optimal_model).labels_.tolist() + sim_labels = TimeSeriesKMeans.from_pickle(optimal_model).labels_.tolist() # create a dataframe of the ids and their labels (assigned groups) df = pd.DataFrame(np.transpose([sim_labels, ids]), columns=[f'{dataset}-cluster', merge_col]) @@ -120,3 +83,30 @@ def summarize(workdir: str) -> pd.DataFrame: assign_table = assign_table.merge(df, how='outer', on=merge_col) return assign_table + + +def plot(workdir: str): + # read the fdc's + x = pd.read_parquet(os.path.join(workdir, 'kmeans_outputs')) + # generate a plot of the clusters + size = time_series.shape[1] + # up to 3 cols, rows determined by number of columns + n_cols = min(n_clusters, 3) + n_rows = math.ceil(n_clusters / n_cols) + img_size = 2.5 + fig = plt.figure(figsize=(img_size * n_cols, img_size * n_rows), dpi=750) + + fig.suptitle("TimeSeriesKMeans Clustering") + assigned_clusters = ks.labels_ + for i in range(n_clusters): + plt.subplot(n_rows, n_cols, i + 1) + for j in time_series[assigned_clusters == i]: + plt.plot(j.ravel(), "k-", alpha=.2) + plt.plot(ks.cluster_centers_[i].ravel(), "r-") + plt.xlim(0, size) + plt.ylim(-2, 4) + plt.text(0.55, 0.85, f'Cluster {i + 1}', transform=plt.gca().transAxes) + plt.tight_layout() + fig.savefig(os.path.join(workdir, 'kmeans_outputs', f'{dataset}-kshape-{n_clusters}_clusters.png')) + plt.close(fig) + return \ No newline at end of file diff --git a/saber/prep.py b/saber/prep.py index 
56df8d7..04882c5 100755 --- a/saber/prep.py +++ b/saber/prep.py @@ -32,7 +32,7 @@ def gis_tables(workdir: str, gauge_gis: str = None, drain_gis: str = None) -> No os.path.join(workdir, 'tables', 'gauge_table.parquet.gzip'), compression='gzip') if drain_gis is not None: gdf = gpd.read_file(drain_gis).drop('geometry', axis=1) - gdf[['x', 'y']] = gdf.geometry.apply(lambda x: [x.x, x.y]) + gdf[['centroid_x', 'centroid_y']] = gdf.geometry.apply(lambda x: [x.x, x.y]) pd.DataFrame(gdf.drop('geometry', axis=1)).to_parquet( os.path.join(workdir, 'tables', 'drain_table.parquet.gzip'), compression='gzip') return From ac3392fc869b701b009dd1c063499bb00672942b Mon Sep 17 00:00:00 2001 From: rileyhales Date: Sun, 21 Aug 2022 15:40:04 -0600 Subject: [PATCH 08/24] plot cluster figures in separate function --- conda-environment.yml | 1 + requirements.txt | 3 +- saber/_vocab.py | 8 +-- saber/cluster.py | 110 +++++++++++++++++++++++++++--------------- saber/prep.py | 24 ++++----- 5 files changed, 91 insertions(+), 55 deletions(-) diff --git a/conda-environment.yml b/conda-environment.yml index ec661ed..2a9783c 100644 --- a/conda-environment.yml +++ b/conda-environment.yml @@ -18,3 +18,4 @@ dependencies: - geoglows - requests - scipy + - natsort diff --git a/requirements.txt b/requirements.txt index b833ca8..99ea19d 100644 --- a/requirements.txt +++ b/requirements.txt @@ -11,4 +11,5 @@ kneed tslearn matplotlib requests -scipy \ No newline at end of file +scipy +natsort \ No newline at end of file diff --git a/saber/_vocab.py b/saber/_vocab.py index 44c4f3c..923c3f1 100644 --- a/saber/_vocab.py +++ b/saber/_vocab.py @@ -22,10 +22,10 @@ # name of some files produced by the algorithm cluster_count_file = 'best-fit-cluster-count.json' cal_nc_name = 'calibrated_simulated_flow.nc' -hindcast_table = 'hindcast_series_table.parquet.gzip' -hindcast_fdc_table = 'hindcast_fdc_table.parquet.gzip' -drain_table = 'drain_table.parquet.gzip' -gauge_table = 'gauge_table.parquet.gzip' +hindcast_table = 'hindcast_series_table.parquet' +hindcast_fdc_table = 'hindcast_fdc_table.parquet' +drain_table = 'drain_table.parquet' +gauge_table = 'gauge_table.parquet' # metrics computed on validation sets metric_list = ['ME', 'MAE', 'RMSE', 'MAPE', 'NSE', 'KGE (2012)'] diff --git a/saber/cluster.py b/saber/cluster.py index 4827c8b..a3e1582 100755 --- a/saber/cluster.py +++ b/saber/cluster.py @@ -1,12 +1,14 @@ import math import os import json +import glob import kneed import matplotlib.pyplot as plt import numpy as np import pandas as pd -from tslearn.clustering import TimeSeriesKMeans +from tslearn.clustering import TimeSeriesKMeans as Cluster +from natsort import natsorted from ._vocab import cluster_count_file from ._vocab import mid_col @@ -26,19 +28,21 @@ def generate(workdir: str) -> None: inertia = {'number': [], 'inertia': [], 'n_iter': []} # read the prepared data (array x) - x = pd.read_parquet(os.path.join(workdir, 'tables', 'hindcast_fdc_transformed.parquet.gzip')) + x = pd.read_parquet(os.path.join(workdir, 'tables', 'hindcast_fdc_transformed.parquet')) + x = x.values # build the kmeans model for a range of cluster numbers for n_clusters in range(1, 17): - ks = TimeSeriesKMeans(n_clusters=n_clusters, max_iter=150) + print(n_clusters) + ks = Cluster(n_clusters=n_clusters, max_iter=150) ks.fit_predict(x) - ks.to_pickle(os.path.join(workdir, 'kmeans_outputs', f'kshape-{n_clusters}.pickle')) + ks.to_pickle(os.path.join(workdir, 'kmeans_outputs', f'kmeans-{n_clusters}.pickle')) inertia['number'].append(n_clusters) 
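# Quick sketch of how an inertia curve like the one collected above can be reduced to a single
# suggested cluster count with kneed; the inertia values below are invented for illustration.
import kneed

n_clusters_tried = list(range(2, 13))
inertias = [900, 510, 330, 250, 205, 185, 172, 165, 160, 157, 155]
best = kneed.KneeLocator(n_clusters_tried, inertias, curve='convex', direction='decreasing').knee
print(f'elbow suggests {best} clusters')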
inertia['inertia'].append(ks.inertia_) inertia['n_iter'].append(ks.n_iter_) # save the inertia results as a csv - pd.DataFrame.from_dict(inertia).to_parquet(os.path.join(workdir, 'kmeans_outputs', f'cluster-inertia.csv')) + pd.DataFrame.from_dict(inertia).to_csv(os.path.join(workdir, 'kmeans_outputs', f'cluster-inertia.csv')) # find the knee/elbow knee = kneed.KneeLocator(inertia['number'], inertia['inertia'], curve='convex', direction='decreasing').knee @@ -49,21 +53,78 @@ def generate(workdir: str) -> None: return -def summarize(workdir: str) -> pd.DataFrame: +def plot(workdir: str) -> None: + """ + Generate figures of the clustered FDC's + + Args: + workdir: path to the project directory + + Returns: + None + """ + # image generating params + img_width = 3 + img_height = 3 + max_cols = 3 + + # read the fdc's + x = pd.read_parquet(os.path.join(workdir, 'tables', 'hindcast_fdc_transformed.parquet')).values + size = x.shape[1] + x_values = np.linspace(0, size, 5) + x_ticks = np.linspace(0, 100, 5).astype(int) + + for model_pickle in natsorted(glob.glob(os.path.join(workdir, 'kmeans_outputs', 'kmeans-*.pickle'))): + kmeans = Cluster.from_pickle(model_pickle) + n_clusters = int(kmeans.n_clusters) + n_cols = min(n_clusters, max_cols) + n_rows = math.ceil(n_clusters / n_cols) + + fig, axs = plt.subplots(n_rows, n_cols, figsize=(img_width * n_cols + 1, img_height * n_rows + 1), dpi=800, + squeeze=False, tight_layout=True, sharey=True) + fig.suptitle("KMeans FDC Clustering") + fig.supxlabel('Exceedance Probability (%)') + fig.supylabel('Discharge Z-Score') + + for i, ax in enumerate(fig.axes[:n_clusters]): + ax.set_title(f'Cluster {i + 1}') + ax.set_xlim(0, size) + ax.set_xticks(x_values, x_ticks) + ax.set_ylim(-2, 4) + for j in x[kmeans.labels_ == i]: + ax.plot(j.ravel(), "k-", alpha=.15) + ax.plot(kmeans.cluster_centers_[i].flatten(), "r-") + # turn off plotting axes which are blank - made for the square grid but > n_clusters + for ax in fig.axes[n_clusters:]: + ax.axis('off') + + fig.savefig(os.path.join(workdir, 'kmeans_outputs', f'kmeans-{n_clusters}.png')) + plt.close(fig) + return + + +def summarize(workdir: str, assign_table: pd.DataFrame, n_clusters: int = None) -> pd.DataFrame: """ Creates a csv listing the streams assigned to each cluster in workdir/kmeans_models and also adds that information to assign_table.csv Args: workdir: path to the project directory + assign_table: the assignment table DataFrame + n_clusters: number of clusters to use when applying the labels to the assign_table Returns: None """ - # read the cluster results csv - with open(os.path.join(workdir, 'kmeans_outputs', cluster_count_file), 'r') as f: - clusters = json.loads(f.read()) - + if n_clusters is None: + # read the cluster results csv + with open(os.path.join(workdir, 'kmeans_outputs', cluster_count_file), 'r') as f: + n_clusters = int(json.loads(f.read())['historical']) + + pd.DataFrame({ + 'cluster': Cluster.from_pickle(os.path.join(workdir, 'kmeans_outputs', f'kmeans-{n_clusters}.pickle')).labels_.flatten(), + 'model_id': pd.read_parquet(os.path.join(workdir, 'tables', 'model_ids.parquet')).values.flatten() + }) for dataset, cluster_count in clusters.items(): # read the list of simulated id's, pair them with their cluster label, save to df merge_col = mid_col if "sim" in dataset else gid_col @@ -72,7 +133,7 @@ def summarize(workdir: str) -> pd.DataFrame: # open the optimal model pickle file optimal_model = os.path.join(workdir, 'kmeans_models', f'{dataset}-{cluster_count}-clusters-model.pickle') - 
sim_labels = TimeSeriesKMeans.from_pickle(optimal_model).labels_.tolist() + sim_labels = Cluster.from_pickle(optimal_model).labels_.tolist() # create a dataframe of the ids and their labels (assigned groups) df = pd.DataFrame(np.transpose([sim_labels, ids]), columns=[f'{dataset}-cluster', merge_col]) @@ -83,30 +144,3 @@ def summarize(workdir: str) -> pd.DataFrame: assign_table = assign_table.merge(df, how='outer', on=merge_col) return assign_table - - -def plot(workdir: str): - # read the fdc's - x = pd.read_parquet(os.path.join(workdir, 'kmeans_outputs')) - # generate a plot of the clusters - size = time_series.shape[1] - # up to 3 cols, rows determined by number of columns - n_cols = min(n_clusters, 3) - n_rows = math.ceil(n_clusters / n_cols) - img_size = 2.5 - fig = plt.figure(figsize=(img_size * n_cols, img_size * n_rows), dpi=750) - - fig.suptitle("TimeSeriesKMeans Clustering") - assigned_clusters = ks.labels_ - for i in range(n_clusters): - plt.subplot(n_rows, n_cols, i + 1) - for j in time_series[assigned_clusters == i]: - plt.plot(j.ravel(), "k-", alpha=.2) - plt.plot(ks.cluster_centers_[i].ravel(), "r-") - plt.xlim(0, size) - plt.ylim(-2, 4) - plt.text(0.55, 0.85, f'Cluster {i + 1}', transform=plt.gca().transAxes) - plt.tight_layout() - fig.savefig(os.path.join(workdir, 'kmeans_outputs', f'{dataset}-kshape-{n_clusters}_clusters.png')) - plt.close(fig) - return \ No newline at end of file diff --git a/saber/prep.py b/saber/prep.py index 04882c5..0890bd3 100755 --- a/saber/prep.py +++ b/saber/prep.py @@ -29,12 +29,13 @@ def gis_tables(workdir: str, gauge_gis: str = None, drain_gis: str = None) -> No if gauge_gis is not None: gdf = gpd.read_file(gauge_gis).drop('geometry', axis=1) pd.DataFrame(gdf.drop('geometry', axis=1)).to_parquet( - os.path.join(workdir, 'tables', 'gauge_table.parquet.gzip'), compression='gzip') + os.path.join(workdir, 'tables', 'gauge_table.parquet')) if drain_gis is not None: - gdf = gpd.read_file(drain_gis).drop('geometry', axis=1) - gdf[['centroid_x', 'centroid_y']] = gdf.geometry.apply(lambda x: [x.x, x.y]) - pd.DataFrame(gdf.drop('geometry', axis=1)).to_parquet( - os.path.join(workdir, 'tables', 'drain_table.parquet.gzip'), compression='gzip') + gdf = gpd.read_file(drain_gis) + gdf['centroid_x'] = gdf.geometry.centroid.x + gdf['centroid_y'] = gdf.geometry.centroid.y + gdf = gdf.drop('geometry', axis=1) + pd.DataFrame(gdf).to_parquet(os.path.join(workdir, 'tables', 'drain_table.parquet')) return @@ -64,8 +65,7 @@ def hindcast(workdir: str, hind_nc_path: str = None, ) -> None: ids = ids[ids_selector].astype(str).values.flatten() # save the model ids to table for reference - pd.DataFrame(ids, columns=['model_id', ]).to_parquet( - os.path.join(workdir, 'tables', 'model_ids.parquet.gzip'), compression='gzip') + pd.DataFrame(ids, columns=['model_id', ]).to_parquet(os.path.join(workdir, 'tables', 'model_ids.parquet')) # save the hindcast series to parquet df = pd.DataFrame( @@ -75,20 +75,20 @@ def hindcast(workdir: str, hind_nc_path: str = None, ) -> None: ) df = df[df.index.year >= 1980] df.index.name = 'datetime' - df.to_parquet(get_table_path(workdir, 'hindcast_series'), compression='gzip') + df.to_parquet(get_table_path(workdir, 'hindcast_series')) # calculate the FDC and save to parquet exceed_prob = np.linspace(0, 100, 201)[::-1] df = df.apply(lambda x: np.transpose(np.nanpercentile(x, exceed_prob))) df.index = exceed_prob df.index.name = 'exceed_prob' - df.to_parquet(get_table_path(workdir, 'hindcast_fdc'), compression='gzip') + 
df.to_parquet(get_table_path(workdir, 'hindcast_fdc')) # transform and prepare for clustering - df = pd.DataFrame(TimeSeriesScalerMeanVariance().fit_transform(np.transpose(df.values))) + df = pd.DataFrame(np.squeeze(TimeSeriesScalerMeanVariance().fit_transform(np.squeeze(np.transpose(df.values))))) df.index = ids - df.to_parquet(os.path.join(workdir, 'tables', 'hindcast_fdc_transformed.parquet.gzip'), compression='gzip') - + df.columns = df.columns.astype(str) + df.to_parquet(os.path.join(workdir, 'tables', 'hindcast_fdc_transformed.parquet')) return From 8b60508a2534f94c5a78bda9c753befe0ab6e7af Mon Sep 17 00:00:00 2001 From: rileyhales Date: Tue, 23 Aug 2022 20:39:23 -0600 Subject: [PATCH 09/24] remove gauge clustering --- saber/__init__.py | 4 +--- saber/_workflow.py | 43 ------------------------------------------- saber/assign.py | 28 ++++------------------------ saber/cluster.py | 30 +++++++----------------------- saber/gis.py | 40 ++++++++++++++-------------------------- 5 files changed, 26 insertions(+), 119 deletions(-) delete mode 100644 saber/_workflow.py diff --git a/saber/__init__.py b/saber/__init__.py index 0303a83..d6a5a09 100644 --- a/saber/__init__.py +++ b/saber/__init__.py @@ -1,4 +1,3 @@ -from saber._workflow import analyze_region from saber._calibrate import calibrate, calibrate_region import saber.table @@ -13,8 +12,7 @@ import saber._vocab -__all__ = ['analyze_region', - 'calibrate', 'calibrate_region', +__all__ = ['calibrate', 'calibrate_region', 'table', 'prep', 'assign', 'gis', 'cluster', 'utils', 'validate', 'analysis'] __author__ = 'Riley Hales' __version__ = '0.4.0' diff --git a/saber/_workflow.py b/saber/_workflow.py deleted file mode 100644 index 494b693..0000000 --- a/saber/_workflow.py +++ /dev/null @@ -1,43 +0,0 @@ -import pandas as pd - -from .prep import hindcast - -from .table import gen, cache -from .cluster import generate, summarize -from .assign import gauged, propagation, clusters_by_dist -from .gis import clip_by_assignment, clip_by_cluster, clip_by_unassigned -from ._calibrate import calibrate_region - - -def analyze_region(workdir: str, drain_shape: str, gauge_table: pd.DataFrame = None, obs_data_dir: str = None) -> None: - # Generate the assignments table - print("gen assign_table") - assign_table = gen(workdir) - cache(workdir, assign_table) - - # Generate the clusters using the historical simulation data - print("working on clustering") - generate(workdir) - assign_table = summarize(workdir, assign_table) - cache(workdir, assign_table) - - # Assign basins which are gauged and propagate those gauges - print("make assignments") - assign_table = gauged(assign_table) - assign_table = propagation(assign_table) - assign_table = clusters_by_dist(assign_table) - - # Cache the assignments table with the updates - print("cache table results") - cache(workdir, assign_table) - - # Generate GIS files so you can go explore your progress graphically - print("generate gis files") - clip_by_assignment(workdir, assign_table, drain_shape) - clip_by_cluster(workdir, assign_table, drain_shape) - clip_by_unassigned(workdir, assign_table, drain_shape) - - # Compute the corrected simulation data - print("calibrating region netcdf") - calibrate_region(workdir, assign_table, gauge_table, obs_data_dir) - return diff --git a/saber/assign.py b/saber/assign.py index 54428b7..2f29353 100644 --- a/saber/assign.py +++ b/saber/assign.py @@ -24,9 +24,10 @@ def gauged(df: pd.DataFrame) -> pd.DataFrame: Copy of df with assignments made """ _df = df.copy() - 
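# Minimal illustration of the boolean-mask pattern gauged() relies on: rows that already have a
# gauge id get their ids copied into the "assigned" columns in one vectorized step. The frame and
# values below are stand-ins, not project data.
import numpy as np
import pandas as pd

df = pd.DataFrame({'model_id': ['r1', 'r2', 'r3'], 'gauge_id': ['g1', np.nan, 'g3']})
selector = ~df['gauge_id'].isna()
df.loc[selector, 'assigned_model_id'] = df['model_id']
df.loc[selector, 'assigned_gauge_id'] = df['gauge_id']
df.loc[selector, 'reason'] = 'gauged'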
_df.loc[~_df[gid_col].isna(), asgn_mid_col] = _df[mid_col] - _df.loc[~_df[gid_col].isna(), asgn_gid_col] = _df[gid_col] - _df.loc[~_df[gid_col].isna(), reason_col] = 'gauged' + selector = ~_df[gid_col].isna() + _df.loc[selector, asgn_mid_col] = _df[mid_col] + _df.loc[selector, asgn_gid_col] = _df[gid_col] + _df.loc[selector, reason_col] = 'gauged' return _df @@ -54,26 +55,6 @@ def propagation(df: pd.DataFrame, max_prop: int = 5) -> pd.DataFrame: return _df -def clusters_by_monavg(df: pd.DataFrame) -> pd.DataFrame: - """ - Assigns all possible ungauged basins a gauge that is - (1) of the same stream order - (2) in the same simulated fdc cluster - (3) in the same simulated monavg cluster (monthly averages) - (4) closest in total drainage area - - This requires matching 2 clusters. Basins in both of the same groups have high likelihood of behaving similarly. - - Args: - df: the assignments table dataframe - - Returns: - Copy of df with assignments made - """ - _df = df.copy() - return _df - - def clusters_by_dist(df: pd.DataFrame) -> pd.DataFrame: """ Assigns all possible ungauged basins a gauge that is @@ -87,7 +68,6 @@ def clusters_by_dist(df: pd.DataFrame) -> pd.DataFrame: Returns: Copy of df with assignments made """ - # todo assign based on closest upstream drainage area?? _df = df.copy() # first filter by cluster number for c_num in sorted(set(_df['sim-fdc-cluster'].values)): diff --git a/saber/cluster.py b/saber/cluster.py index a3e1582..a57ecc8 100755 --- a/saber/cluster.py +++ b/saber/cluster.py @@ -12,7 +12,6 @@ from ._vocab import cluster_count_file from ._vocab import mid_col -from ._vocab import gid_col def generate(workdir: str) -> None: @@ -121,26 +120,11 @@ def summarize(workdir: str, assign_table: pd.DataFrame, n_clusters: int = None) with open(os.path.join(workdir, 'kmeans_outputs', cluster_count_file), 'r') as f: n_clusters = int(json.loads(f.read())['historical']) - pd.DataFrame({ + # create a dataframe with the optimal model's labels and the model_id's + df = pd.DataFrame({ 'cluster': Cluster.from_pickle(os.path.join(workdir, 'kmeans_outputs', f'kmeans-{n_clusters}.pickle')).labels_.flatten(), - 'model_id': pd.read_parquet(os.path.join(workdir, 'tables', 'model_ids.parquet')).values.flatten() - }) - for dataset, cluster_count in clusters.items(): - # read the list of simulated id's, pair them with their cluster label, save to df - merge_col = mid_col if "sim" in dataset else gid_col - csv_path = os.path.join(workdir, f'data_processed', f'{dataset}.csv') - ids = pd.read_csv(csv_path, index_col=0).columns - - # open the optimal model pickle file - optimal_model = os.path.join(workdir, 'kmeans_models', f'{dataset}-{cluster_count}-clusters-model.pickle') - sim_labels = Cluster.from_pickle(optimal_model).labels_.tolist() - - # create a dataframe of the ids and their labels (assigned groups) - df = pd.DataFrame(np.transpose([sim_labels, ids]), columns=[f'{dataset}-cluster', merge_col]) - df.to_csv(os.path.join(workdir, 'kmeans_models', f'optimal-assigns-{dataset}.csv'), index=False) - - # merge the dataframes - df[merge_col] = df[merge_col].astype(int) - assign_table = assign_table.merge(df, how='outer', on=merge_col) - - return assign_table + mid_col: pd.read_parquet(os.path.join(workdir, 'tables', 'model_ids.parquet')).values.flatten() + }, dtype=str) + + # merge the dataframes + return assign_table.merge(df, how='outer', on=mid_col) diff --git a/saber/gis.py b/saber/gis.py index 78512f1..2328171 100644 --- a/saber/gis.py +++ b/saber/gis.py @@ -18,8 +18,7 @@ 
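# A rough, self-contained sketch of the clip-and-export pattern the gis helpers below follow:
# filter the drainage lines to a list of model ids and write the subset as a GeoPackage.
# The paths and id list are placeholders.
import geopandas as gpd

drain_lines = gpd.read_file('/path/to/drainage_lines.gpkg')
ids_to_keep = ['101', '102', '103']
subset = drain_lines[drain_lines['model_id'].isin(ids_to_keep)]
if not subset.empty:
    subset.to_file('/path/to/gis_outputs/assignments_gauged.gpkg', driver='GPKG')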
'validation_maps'] -def generate_all(workdir: str, assign_table: pd.DataFrame, drain_shape: str, prefix: str = '', - id_column: str = mid_col) -> None: +def generate_all(workdir: str, assign_table: pd.DataFrame, drain_shape: str, prefix: str = '') -> None: """ Runs all the clip functions which create subsets of the drainage lines GIS dataset based on how they were assigned for bias correction. @@ -29,19 +28,17 @@ def generate_all(workdir: str, assign_table: pd.DataFrame, drain_shape: str, pre assign_table: the assign_table dataframe drain_shape: path to a drainage line shapefile which can be clipped prefix: a prefix for names of the outputs to distinguish between data generated in separate instances - id_column: name of the id column in the attributes of the shape table Returns: None """ - clip_by_assignment(workdir, assign_table, drain_shape, prefix, id_column) - clip_by_cluster(workdir, assign_table, drain_shape, prefix, id_column) - clip_by_unassigned(workdir, assign_table, drain_shape, prefix, id_column) + clip_by_assignment(workdir, assign_table, drain_shape, prefix) + clip_by_cluster(workdir, assign_table, drain_shape, prefix) + clip_by_unassigned(workdir, assign_table, drain_shape, prefix) return -def clip_by_assignment(workdir: str, assign_table: pd.DataFrame, drain_shape: str, prefix: str = '', - id_column: str = mid_col) -> None: +def clip_by_assignment(workdir: str, assign_table: pd.DataFrame, drain_shape: str, prefix: str = '') -> None: """ Creates Geopackage files in workdir/gis_outputs for each unique value in the assignment column @@ -50,7 +47,6 @@ def clip_by_assignment(workdir: str, assign_table: pd.DataFrame, drain_shape: st assign_table: the assign_table dataframe drain_shape: path to a drainage line shapefile which can be clipped prefix: a prefix for names of the outputs to distinguish between data generated at separate instances - id_column: name of the id column in the attributes of the shape table Returns: None @@ -62,7 +58,7 @@ def clip_by_assignment(workdir: str, assign_table: pd.DataFrame, drain_shape: st # get the unique list of assignment reasons for reason in set(assign_table[reason_col].dropna().values): ids = assign_table[assign_table[reason_col] == reason][mid_col].values - subset = dl[dl[id_column].isin(ids)] + subset = dl[dl[mid_col].isin(ids)] name = f'{prefix}{"_" if prefix else ""}assignments_{reason}.gpkg' if subset.empty: continue @@ -71,8 +67,7 @@ def clip_by_assignment(workdir: str, assign_table: pd.DataFrame, drain_shape: st return -def clip_by_cluster(workdir: str, assign_table: pd.DataFrame, drain_shape: str, prefix: str = '', - id_column: str = mid_col) -> None: +def clip_by_cluster(workdir: str, assign_table: pd.DataFrame, drain_shape: str, prefix: str = '') -> None: """ Creates Geopackage files in workdir/gis_outputs of the drainage lines based on the fdc cluster they were assigned to @@ -81,26 +76,20 @@ def clip_by_cluster(workdir: str, assign_table: pd.DataFrame, drain_shape: str, assign_table: the assign_table dataframe drain_shape: path to a drainage line shapefile which can be clipped prefix: optional, a prefix to prepend to each created file's name - id_column: name of the id column in the attributes of the shape table Returns: None """ dl_gdf = gpd.read_file(drain_shape) - cluster_types = [a for a in assign_table if 'cluster' in a] - for ctype in cluster_types: - for gnum in sorted(set(assign_table[ctype].dropna().values)): - savepath = os.path.join(workdir, 'gis_outputs', f'{prefix}{"_" if prefix else ""}{ctype}-{int(gnum)}.gpkg') - 
ids = assign_table[assign_table[ctype] == gnum][mid_col].values - if dl_gdf[dl_gdf[id_column].isin(ids)].empty: - continue - else: - dl_gdf[dl_gdf[id_column].isin(ids)].to_file(savepath) + for num in sorted(set(assign_table[mid_col].dropna().values)): + gdf = dl_gdf[dl_gdf[mid_col].isin(assign_table[assign_table[mid_col] == num][mid_col])] + if gdf.empty: + continue + gdf.to_file(os.path.join(workdir, 'gis_outputs', f'{prefix}{"_" if prefix else ""}-{int(num)}.gpkg')) return -def clip_by_unassigned(workdir: str, assign_table: pd.DataFrame, drain_shape: str, prefix: str = '', - id_column: str = mid_col) -> None: +def clip_by_unassigned(workdir: str, assign_table: pd.DataFrame, drain_shape: str, prefix: str = '') -> None: """ Creates Geopackage files in workdir/gis_outputs of the drainage lines which haven't been assigned a gauge yet @@ -109,14 +98,13 @@ def clip_by_unassigned(workdir: str, assign_table: pd.DataFrame, drain_shape: st assign_table: the assign_table dataframe drain_shape: path to a drainage line shapefile which can be clipped prefix: optional, a prefix to prepend to each created file's name - id_column: name of the id column in the attributes of the shape table Returns: None """ dl_gdf = gpd.read_file(drain_shape) ids = assign_table[assign_table[reason_col].isna()][mid_col].values - subset = dl_gdf[dl_gdf[id_column].isin(ids)] + subset = dl_gdf[dl_gdf[mid_col].isin(ids)] if subset.empty: warnings.warn('Empty filter: No streams are unassigned') return From a63de91acab4a2560cbd0903407f04a7ab6e0674 Mon Sep 17 00:00:00 2001 From: rileyhales Date: Tue, 23 Aug 2022 20:39:49 -0600 Subject: [PATCH 10/24] delete example scripts --- .../colombia-magdalena/magdalena_example.py | 73 ------------------- examples/example_inputs.py | 16 ---- examples/example_script.py | 70 ------------------ 3 files changed, 159 deletions(-) delete mode 100644 examples/colombia-magdalena/magdalena_example.py delete mode 100644 examples/example_inputs.py delete mode 100644 examples/example_script.py diff --git a/examples/colombia-magdalena/magdalena_example.py b/examples/colombia-magdalena/magdalena_example.py deleted file mode 100644 index 2b0bf39..0000000 --- a/examples/colombia-magdalena/magdalena_example.py +++ /dev/null @@ -1,73 +0,0 @@ -import os - -import numpy as np - -import saber - - -np.seterr(all="ignore") - -workdir = '/Users/rchales/data/regional-bias-correction/colombia-magdalena' -drain_shape = os.path.join(workdir, 'gis_inputs', 'magdalena_dl_attrname_xy.json') -gauge_shape = os.path.join(workdir, 'gis_inputs', 'ideam_stations.json') -obs_data_dir = os.path.join(workdir, 'data_inputs', 'obs_csvs') - -# Only need to do this step 1x ever -# saber.prep.scaffold_working_directory(workdir) - -# Create the gauge_table and drain_table.csv -# Scripts not provided, check readme for instructions - -# Generate the assignments table -# assign_table = saber.table.gen(workdir) -# saber.table.cache(workdir, assign_table) -# Or read the existing table -# assign_table = saber.table.read(workdir) - -# Prepare the observation and simulation data -# Only need to do this step 1x ever -# saber.prep.historical_simulation(os.path.join(workdir, 'data_simulated', 'south_america_era5_qout.nc'), workdir) -# saber.prep.observation_data(workdir) - -# Generate the clusters using the historical simulation data -# saber.cluster.generate(workdir) -# assign_table = saber.cluster.summarize(workdir, assign_table) -# saber.table.cache(workdir, assign_table) - -# Assign basins which are gauged and propagate those gauges -# 
assign_table = saber.assign.gauged(assign_table) -# assign_table = saber.assign.propagation(assign_table) -# assign_table = saber.assign.clusters_by_dist(assign_table) -# todo assign_table = saber.assign.clusters_by_monavg(assign_table) - -# Cache the assignments table with the updates -# saber.table.cache(workdir, assign_table) - -# Generate GIS files so you can go explore your progress graphically -# saber.gis.clip_by_assignment(workdir, assign_table, drain_shape) -# saber.gis.clip_by_cluster(workdir, assign_table, drain_shape) -# saber.gis.clip_by_unassigned(workdir, assign_table, drain_shape) - -# Compute the corrected simulation data -# assign_table = saber.table.read(workdir) -# saber.calibrate_region(workdir, assign_table) -# vtab = saber.validate.gen_val_table(workdir) -saber.gis.validation_maps(workdir, gauge_shape) -saber.analysis.plot(workdir, obs_data_dir, 9007721) - - -# import pandas as pd -# path_to_your_pickle_file = '/Users/rchales/data/regional-bias-correction/colombia-magdalena/validation_runs/90/data_processed/subset_time_series.pickle' -# a = pd.read_pickle(path_to_your_pickle_file) -# a.index = a.index.tz_localize('UTC') -# a.to_pickle(path_to_your_pickle_file) - - -# import netCDF4 as nc -# import numpy as np -# path = '/Users/rchales/data/regional-bias-correction/colombia-magdalena/validation_runs/90/calibrated_simulated_flow.nc' -# a = nc.Dataset(path, 'r+') -# a.createVariable('corrected', 'i4', ('corrected',), zlib=True, shuffle=True, fill_value=-1) -# a['corrected'][:] = np.array((0, 1)) -# a.sync() -# a.close() \ No newline at end of file diff --git a/examples/example_inputs.py b/examples/example_inputs.py deleted file mode 100644 index 4163d24..0000000 --- a/examples/example_inputs.py +++ /dev/null @@ -1,16 +0,0 @@ -import os - - -# COLOMBIA -workdir = '/Users/rchales/data/saber/colombia-magdalena' -drain_shape = os.path.join(workdir, 'gis_inputs', 'magdalena_dl_attrname_xy.json') -gauge_shape = os.path.join(workdir, 'gis_inputs', 'ideam_stations.json') -obs_data_dir = os.path.join(workdir, 'data_inputs', 'obs_csvs') -hist_sim_nc = os.path.join(workdir, 'data_inputs', 'south_america_era5_qout.nc') - -# TEXAS -workdir = '/Users/rchales/data/saber/texas' -drain_shape = os.path.join(workdir, 'shapefiles', 'texas-dl.json') -gauge_shape = os.path.join(workdir, 'shapefiles', 'texas-gauges.shp') -obs_data_dir = os.path.join(workdir, 'data_inputs', 'obs_csvs') -hist_sim_nc = os.path.join(workdir, 'data_inputs', 'Qout_era5_t640_24hr_19790101to20210630.nc.nc') diff --git a/examples/example_script.py b/examples/example_script.py deleted file mode 100644 index b114050..0000000 --- a/examples/example_script.py +++ /dev/null @@ -1,70 +0,0 @@ -import os - -import numpy as np - -import saber - - -np.seterr(all="ignore") - -# workdir = '' -# drain_shape = os.path.join(workdir, 'gis_inputs', '') -# gauge_shape = '' -# obs_data_dir = '' -# hist_sim_nc = '' - -# COLOMBIA -# workdir = '/Users/rchales/data/saber/colombia-magdalena' -# drain_shape = os.path.join(workdir, 'gis_inputs', 'magdalena_dl_attrname_xy.json') -# gauge_shape = os.path.join(workdir, 'gis_inputs', 'ideam_stations.json') -# obs_data_dir = os.path.join(workdir, 'data_inputs', 'obs_csvs') -# hist_sim_nc = os.path.join(workdir, 'data_inputs', 'south_america_era5_qout.nc') - - -# Prepare the working directory - only need to do this step 1x ever -# saber.prep.scaffold_working_directory(workdir) -# Scripts not provided. 
Consult README.md for instructions -# Create the gauge_table.csv and drain_table.csv -# Put the historical simulation netCDF in the right folder -# Put the observed data csv files in the data_inputs/obs_csvs folder - -# Prepare the observation and simulation data - Only need to do this step 1x ever -print('Preparing data') -saber.prep.hindcast(workdir) - -# Generate the assignments table -print('Generate Assignment Table') -assign_table = saber.table.gen(workdir) -saber.table.cache(workdir, assign_table) - -# Generate the clusters using the historical simulation data -print('Generate Clusters') -saber.cluster.generate(workdir) -assign_table = saber.cluster.summarize(workdir, assign_table) -saber.table.cache(workdir, assign_table) - -# Assign basins which are gauged and propagate those gauges -print('Making Assignments') -assign_table = saber.assign.gauged(assign_table) -assign_table = saber.assign.propagation(assign_table) -assign_table = saber.assign.clusters_by_dist(assign_table) - -# Cache the assignments table with the updates -saber.table.cache(workdir, assign_table) - -# Generate GIS files so you can go explore your progress graphically -print('Generate GIS files') -saber.gis.clip_by_assignment(workdir, assign_table, drain_shape) -saber.gis.clip_by_cluster(workdir, assign_table, drain_shape) -saber.gis.clip_by_unassigned(workdir, assign_table, drain_shape) - -# Compute the corrected simulation data -print('Starting Calibration') -saber.calibrate_region(workdir, assign_table) - -# run the validation study -print('Performing Validation') -saber.validate.sample_gauges(workdir, overwrite=True) -saber.validate.run_series(workdir, drain_shape, obs_data_dir) -vtab = saber.validate.gen_val_table(workdir) -saber.gis.validation_maps(workdir, gauge_shape, vtab) From 13d0f5d97894ec952a6dcd76e5c3885267bfe312 Mon Sep 17 00:00:00 2001 From: rileyhales Date: Tue, 23 Aug 2022 20:50:55 -0600 Subject: [PATCH 11/24] new example script --- README.md | 8 ++------ examples/saber_example.py | 43 +++++++++++++++++++++++++++++++++++++++ 2 files changed, 45 insertions(+), 6 deletions(-) create mode 100644 examples/saber_example.py diff --git a/README.md b/README.md index 9951d5f..7d639d0 100755 --- a/README.md +++ b/README.md @@ -104,7 +104,7 @@ Your table should look like this: | unique_stream_# | unique_stream_# | area in km^2 | stream_order | ## | ## | | ... | ... | ... | ... | ... | ... | -2. Prepare a csv of the attribute table of the gauge locations shapefile. +1. Prepare a csv of the attribute table of the gauge locations shapefile. 
- You need the columns: - model_id - gauge_id @@ -157,7 +157,7 @@ looks like this: ```python import saber as saber workdir = '/path/to/project/directory/' -saber.prep.gen_assignments_table(workdir) +saber.table.gen(workdir) ``` Your project's working directory now looks like @@ -230,10 +230,6 @@ saber.prep.hindcast( workdir, '/path/to/historical/simulation/netcdf.nc' # optional - if nc not stored in data_inputs folder ) -saber.prep.hist_sim_table( - workdir, - '/path/to/historical/simulation/netcdf.nc' # optional - if nc not stored in data_inputs folder -) saber.prep.observed_data( workdir, '/path/to/obs/csv/directory' # optional - if csvs not stored in workdir/data_inputs/obs_csvs diff --git a/examples/saber_example.py b/examples/saber_example.py new file mode 100644 index 0000000..44c57a3 --- /dev/null +++ b/examples/saber_example.py @@ -0,0 +1,43 @@ +import os + +import numpy as np +import saber + + +np.seterr(all="ignore") + +workdir = '' +hist_sim_nc = os.path.join(workdir, '') +obs_data_dir = os.path.join(workdir, '') +drain_gis = os.path.join(workdir, '') +gauge_gis = os.path.join(workdir, '') + +print('Prepare inputs') +saber.prep.gis_tables(workdir, drain_gis=drain_gis, gauge_gis=gauge_gis) +saber.prep.hindcast(workdir) + +# Generate the assignments table +print('Generate Assignment Table') +assign_table = saber.table.gen(workdir) +saber.table.cache(workdir, assign_table) + +# Generate clusters using the historical simulation data +print('Generate Clusters') +saber.cluster.generate(workdir) +saber.cluster.plot(workdir) +saber.cluster.summarize(workdir, assign_table) + +# Assign basins which are gauged and propagate those gauges +print('Making Assignments') +assign_table = saber.assign.gauged(assign_table) +assign_table = saber.assign.propagation(assign_table) +assign_table = saber.assign.clusters_by_dist(assign_table) + +# Cache the assignments table with the updates +saber.table.cache(workdir, assign_table) + +# Generate GIS files to explore your progress graphically +print('Generate GIS files') +saber.gis.clip_by_assignment(workdir, assign_table, drain_gis) +saber.gis.clip_by_cluster(workdir, assign_table, drain_gis) +saber.gis.clip_by_unassigned(workdir, assign_table, drain_gis) From b22d6dee4db2e963c4a1a6640b01d13fdda36a81 Mon Sep 17 00:00:00 2001 From: rileyhales Date: Thu, 25 Aug 2022 00:47:52 -0600 Subject: [PATCH 12/24] drop tslearn, fdc calculated in 1 percent intervals --- README.md | 9 +----- conda-environment.yml | 1 - requirements.txt | 1 - saber/cluster.py | 30 ++++++++++--------- saber/prep.py | 28 +++++++++--------- saber/table.py | 6 ++-- saber/tests/calibrate.py | 63 ---------------------------------------- saber/validate.py | 25 ++-------------- 8 files changed, 36 insertions(+), 127 deletions(-) delete mode 100644 saber/tests/calibrate.py diff --git a/README.md b/README.md index 7d639d0..19a7c1e 100755 --- a/README.md +++ b/README.md @@ -38,18 +38,11 @@ Things to check when preparing your data ## Process ### 1 Create a Working Directory -```python -import saber - -workdir = '/my/file/path' -saber.prep.scaffold_workdir(workdir) -``` - Your working directory should exactly like this. 
``` working_directory - data_inputs/ tables/ + data_inputs/ kmeans_outputs/ gis_outputs/ ``` diff --git a/conda-environment.yml b/conda-environment.yml index 2a9783c..be858a9 100644 --- a/conda-environment.yml +++ b/conda-environment.yml @@ -10,7 +10,6 @@ dependencies: - numba - contextily - netCDF4 - - tslearn - pyarrow - kneed - tslearn diff --git a/requirements.txt b/requirements.txt index 99ea19d..6cf092b 100644 --- a/requirements.txt +++ b/requirements.txt @@ -8,7 +8,6 @@ netCDF4 tslearn pyarrow kneed -tslearn matplotlib requests scipy diff --git a/saber/cluster.py b/saber/cluster.py index a57ecc8..f83aa9e 100755 --- a/saber/cluster.py +++ b/saber/cluster.py @@ -7,7 +7,8 @@ import matplotlib.pyplot as plt import numpy as np import pandas as pd -from tslearn.clustering import TimeSeriesKMeans as Cluster +from sklearn.cluster import KMeans as Clusterer +import joblib from natsort import natsorted from ._vocab import cluster_count_file @@ -24,27 +25,29 @@ def generate(workdir: str) -> None: Returns: None """ - inertia = {'number': [], 'inertia': [], 'n_iter': []} + summary = {'number': [], 'inertia': [], 'n_iter': [], 'pseudo_aic': []} # read the prepared data (array x) x = pd.read_parquet(os.path.join(workdir, 'tables', 'hindcast_fdc_transformed.parquet')) x = x.values + n_feat = x.shape[0] # build the kmeans model for a range of cluster numbers - for n_clusters in range(1, 17): + for n_clusters in range(1, 16): print(n_clusters) - ks = Cluster(n_clusters=n_clusters, max_iter=150) - ks.fit_predict(x) - ks.to_pickle(os.path.join(workdir, 'kmeans_outputs', f'kmeans-{n_clusters}.pickle')) - inertia['number'].append(n_clusters) - inertia['inertia'].append(ks.inertia_) - inertia['n_iter'].append(ks.n_iter_) + kmeans = Clusterer(n_clusters=n_clusters) + kmeans.fit_predict(x) + joblib.dump(kmeans, os.path.join(workdir, 'kmeans_outputs', f'kmeans-{n_clusters}.pickle')) + summary['number'].append(n_clusters) + summary['inertia'].append(kmeans.inertia_) + summary['pseudo_aic'].append(np.log(kmeans.inertia_ / n_feat) + ((n_feat + 2 * n_clusters) / (n_feat))) + summary['n_iter'].append(kmeans.n_iter_) # save the inertia results as a csv - pd.DataFrame.from_dict(inertia).to_csv(os.path.join(workdir, 'kmeans_outputs', f'cluster-inertia.csv')) + pd.DataFrame.from_dict(summary).to_csv(os.path.join(workdir, 'kmeans_outputs', f'clustering-summary-stats.csv')) # find the knee/elbow - knee = kneed.KneeLocator(inertia['number'], inertia['inertia'], curve='convex', direction='decreasing').knee + knee = kneed.KneeLocator(summary['number'], summary['inertia'], curve='convex', direction='decreasing').knee # save the best fitting cluster counts to a csv with open(os.path.join(workdir, 'kmeans_outputs', cluster_count_file), 'w') as f: @@ -74,7 +77,7 @@ def plot(workdir: str) -> None: x_ticks = np.linspace(0, 100, 5).astype(int) for model_pickle in natsorted(glob.glob(os.path.join(workdir, 'kmeans_outputs', 'kmeans-*.pickle'))): - kmeans = Cluster.from_pickle(model_pickle) + kmeans = joblib.load(model_pickle) n_clusters = int(kmeans.n_clusters) n_cols = min(n_clusters, max_cols) n_rows = math.ceil(n_clusters / n_cols) @@ -122,7 +125,8 @@ def summarize(workdir: str, assign_table: pd.DataFrame, n_clusters: int = None) # create a dataframe with the optimal model's labels and the model_id's df = pd.DataFrame({ - 'cluster': Cluster.from_pickle(os.path.join(workdir, 'kmeans_outputs', f'kmeans-{n_clusters}.pickle')).labels_.flatten(), + 'cluster': Clusterer.from_pickle( + os.path.join(workdir, 'kmeans_outputs', 
f'kmeans-{n_clusters}.pickle')).labels_.flatten(), mid_col: pd.read_parquet(os.path.join(workdir, 'tables', 'model_ids.parquet')).values.flatten() }, dtype=str) diff --git a/saber/prep.py b/saber/prep.py index 0890bd3..a40f332 100755 --- a/saber/prep.py +++ b/saber/prep.py @@ -4,14 +4,14 @@ import geopandas as gpd import netCDF4 as nc import numpy as np -from tslearn.preprocessing import TimeSeriesScalerMeanVariance +from sklearn.preprocessing import StandardScaler as Scalar from ._vocab import mid_col from ._vocab import read_drain_table from ._vocab import guess_hindcast_path from ._vocab import get_table_path -__all__ = ['gis_tables', 'hindcast', 'scaffold_workdir'] +__all__ = ['gis_tables', 'hindcast', 'workdir'] def gis_tables(workdir: str, gauge_gis: str = None, drain_gis: str = None) -> None: @@ -27,8 +27,7 @@ def gis_tables(workdir: str, gauge_gis: str = None, drain_gis: str = None) -> No None """ if gauge_gis is not None: - gdf = gpd.read_file(gauge_gis).drop('geometry', axis=1) - pd.DataFrame(gdf.drop('geometry', axis=1)).to_parquet( + pd.DataFrame(gpd.read_file(gauge_gis).drop('geometry', axis=1)).to_parquet( os.path.join(workdir, 'tables', 'gauge_table.parquet')) if drain_gis is not None: gdf = gpd.read_file(drain_gis) @@ -78,37 +77,36 @@ def hindcast(workdir: str, hind_nc_path: str = None, ) -> None: df.to_parquet(get_table_path(workdir, 'hindcast_series')) # calculate the FDC and save to parquet - exceed_prob = np.linspace(0, 100, 201)[::-1] + exceed_prob = np.linspace(0, 100, 101)[::-1] df = df.apply(lambda x: np.transpose(np.nanpercentile(x, exceed_prob))) df.index = exceed_prob df.index.name = 'exceed_prob' df.to_parquet(get_table_path(workdir, 'hindcast_fdc')) # transform and prepare for clustering - df = pd.DataFrame(np.squeeze(TimeSeriesScalerMeanVariance().fit_transform(np.squeeze(np.transpose(df.values))))) + df = pd.DataFrame(np.transpose(Scalar().fit_transform(np.squeeze(df.values)))) df.index = ids df.columns = df.columns.astype(str) df.to_parquet(os.path.join(workdir, 'tables', 'hindcast_fdc_transformed.parquet')) return -def scaffold_workdir(path: str, include_validation: bool = True) -> None: +def workdir(path: str, include_validation: bool = True) -> None: """ Creates the correct directories for a Saber project within the specified directory Args: - path: the path to a directory where you want to create directories + path: the path to a directory where you want to create workdir subdirectories include_validation: boolean, indicates whether to create the validation folder Returns: None """ - if not os.path.isdir(path): - os.mkdir(path) - os.mkdir(os.path.join(path, 'tables')) - os.mkdir(os.path.join(path, 'inputs')) - os.mkdir(os.path.join(path, 'gis_outputs')) - os.mkdir(os.path.join(path, 'kmeans_outputs')) + dir_list = ['tables', 'data_inputs', 'gis_outputs', 'kmeans_outputs'] if include_validation: - os.mkdir(os.path.join(path, 'validation_runs')) + dir_list.append('validation_runs') + for d in dir_list: + p = os.path.join(path, d) + if not os.path.exists(p): + os.mkdir(p) return diff --git a/saber/table.py b/saber/table.py index 01d9c9b..f1aafe4 100644 --- a/saber/table.py +++ b/saber/table.py @@ -35,8 +35,8 @@ def gen(workdir) -> pd.DataFrame: None """ # read and merge the tables - drain_df = pd.read_parquet(os.path.join(workdir, 'tables', 'drain_table.parquet.gzip')) - gauge_df = pd.read_parquet(os.path.join(workdir, 'tables', 'gauge_table.parquet.gzip')) + drain_df = pd.read_parquet(os.path.join(workdir, 'tables', 'drain_table.parquet')) + gauge_df = 
pd.read_parquet(os.path.join(workdir, 'tables', 'gauge_table.parquet')) assign_table = pd.merge(drain_df, gauge_df, on=mid_col, how='outer') # create the new columns @@ -75,6 +75,6 @@ def cache(workdir: str, assign_table: pd.DataFrame) -> None: Returns: None """ - assign_table.to_parquet(get_path(workdir), compression='gzip') + assign_table.to_parquet(get_path(workdir)) return diff --git a/saber/tests/calibrate.py b/saber/tests/calibrate.py deleted file mode 100644 index c58fa40..0000000 --- a/saber/tests/calibrate.py +++ /dev/null @@ -1,63 +0,0 @@ -from io import StringIO - -import geoglows -import pandas as pd -import requests - - -def collect_data(start_id, start_ideam_id, downstream_id, downstream_ideam_id): - # Upstream simulated flow - start = geoglows.streamflow.historic_simulation(start_id) - # Upstream observed flow - start_ideam = get_ideam_flow(start_ideam_id) - start_ideam.dropna(inplace=True) - - # Downstream simulated flow - downstream = geoglows.streamflow.historic_simulation(downstream_id) - # Downstream observed flow - downstream_ideam = get_ideam_flow(downstream_ideam_id) - downstream_ideam.dropna(inplace=True) - # Downstream bias corrected flow (for comparison to the data_4_assign_propagation method - downstream_bc = geoglows.bias.correct_historical(downstream, downstream_ideam) - - # Export all as csv - start.to_csv('start_flow.csv') - start_ideam.to_csv('start_ideam_flow.csv') - downstream.to_csv('downstream_flow.csv') - downstream_ideam.to_csv('downstream_ideam_flow.csv') - downstream_bc.to_csv('downstream_bc_flow.csv') - return - - -def get_ideam_flow(id): - # get the gauged data - url = f'https://www.hydroshare.org/resource/d222676fbd984a81911761ca1ba936bf/' \ - f'data/contents/Discharge_Data/{id}.csv' - df = pd.read_csv(StringIO(requests.get(url).text), index_col=0) - df.index = pd.to_datetime(df.index).tz_localize('UTC') - return df - -# collect_data(9012999, 22057070, 9012650, 22057010) -# collect_data(9017261, 32037030, 9015333, 32097010) # really long range down stream -# collect_data(9009660, 21237020, 9007292, 23097040) # large river -# collect_data(9007292, 23097040, 9009660, 21237020) # large river backwards (going upstream) - -# Read all as csv -start_flow = pd.read_csv('data_4_assign_propagation/start_flow.csv', index_col=0) -start_ideam_flow = pd.read_csv('data_4_assign_propagation/start_ideam_flow.csv', index_col=0) -downstream_flow = pd.read_csv('data_4_assign_propagation/downstream_flow.csv', index_col=0) -downstream_ideam_flow = pd.read_csv('data_4_assign_propagation/downstream_ideam_flow.csv', index_col=0) -downstream_bc_flow = pd.read_csv('data_4_assign_propagation/downstream_bc_flow.csv', index_col=0) -start_flow.index = pd.to_datetime(start_flow.index) -start_ideam_flow.index = pd.to_datetime(start_ideam_flow.index) -downstream_flow.index = pd.to_datetime(downstream_flow.index) -downstream_ideam_flow.index = pd.to_datetime(downstream_ideam_flow.index) -downstream_bc_flow.index = pd.to_datetime(downstream_bc_flow.index) - -downstream_prop_correct = rbc_calibrate(start_flow, start_ideam_flow, downstream_flow, - fit_gumbel=True, gumbel_range=(25, 75)) -plot_results(downstream_flow, downstream_ideam_flow, downstream_bc_flow, downstream_prop_correct, - f'Correct Monthly - Force Gumbel Distribution') -del downstream_prop_correct['Scalars'], downstream_prop_correct['Percentile'] -statistics_tables(downstream_prop_correct, downstream_flow, downstream_ideam_flow).to_csv( - 'data_4_assign_propagation/stats_test.csv') diff --git a/saber/validate.py 
b/saber/validate.py index f4969a2..c27198f 100644 --- a/saber/validate.py +++ b/saber/validate.py @@ -6,8 +6,7 @@ import netCDF4 as nc import numpy as np -from .prep import scaffold_workdir -from ._workflow import analyze_region +from .prep import workdir from ._vocab import mid_col from ._vocab import gid_col from ._vocab import metric_nc_name_list @@ -45,7 +44,7 @@ def sample_gauges(workdir: str, overwrite: bool = False) -> None: # create the new project working directory os.mkdir(subpath) - scaffold_workdir(subpath, include_validation=False) + workdir(subpath, include_validation=False) # overwrite the processed data directory so we don't need to redo this each time shutil.copytree( @@ -70,26 +69,6 @@ def sample_gauges(workdir: str, overwrite: bool = False) -> None: return -def run_series(workdir: str, drain_shape: str, obs_data_dir: str = None) -> None: - """ - Runs saber.analyze_region on each project in the validation_runs directory - - Args: - workdir: the project working directory - drain_shape: path to the drainage line gis file - obs_data_dir: path to the directory containing the observed data - - Returns: - None - """ - gauge_table = pd.read_csv(os.path.join(workdir, 'gis_inputs', 'gauge_table.csv')) - val_workdirs = [i for i in glob.glob(os.path.join(workdir, 'validation_runs', '*')) if os.path.isdir(i)] - for val_workdir in val_workdirs: - print(f'\n\n\t\t\tworking on {val_workdir}\n\n') - analyze_region(val_workdir, drain_shape, gauge_table, obs_data_dir) - return - - def gen_val_table(workdir: str) -> pd.DataFrame: """ Prepares the validation summary table that contains the list of gauged rivers plus their statistics computed in From b7f3402c7b49758f4d399b18b193fd44e3c1d1f6 Mon Sep 17 00:00:00 2001 From: rileyhales Date: Thu, 25 Aug 2022 00:49:31 -0600 Subject: [PATCH 13/24] drop geoglows dependency --- conda-environment.yml | 3 +-- requirements.txt | 4 ++-- 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/conda-environment.yml b/conda-environment.yml index be858a9..5111979 100644 --- a/conda-environment.yml +++ b/conda-environment.yml @@ -12,9 +12,8 @@ dependencies: - netCDF4 - pyarrow - kneed - - tslearn - matplotlib - - geoglows - requests - scipy - natsort + - scikit-learn diff --git a/requirements.txt b/requirements.txt index 6cf092b..7d43255 100644 --- a/requirements.txt +++ b/requirements.txt @@ -5,10 +5,10 @@ numpy numba contextily netCDF4 -tslearn pyarrow kneed matplotlib requests scipy -natsort \ No newline at end of file +natsort +scikit-learn \ No newline at end of file From 14c24a0fc71a6c20fdfb25c47b33fd6f4a02345c Mon Sep 17 00:00:00 2001 From: rileyhales Date: Thu, 25 Aug 2022 16:15:28 -0600 Subject: [PATCH 14/24] split cluster functions, code formatting, mark todo's --- README.md | 34 +++++----- saber/_vocab.py | 5 +- saber/assign.py | 11 ++-- saber/cluster.py | 162 ++++++++++++++++++++++++++++++++++++++-------- saber/gis.py | 55 ++++++++-------- saber/prep.py | 18 ++++-- saber/table.py | 4 +- saber/validate.py | 7 +- 8 files changed, 203 insertions(+), 93 deletions(-) diff --git a/README.md b/README.md index 19a7c1e..e4016a9 100755 --- a/README.md +++ b/README.md @@ -16,10 +16,10 @@ You need the following data to follow this procedure. Geopackage format GIS data equivalent formats that can be read by `geopandas`. 1. Geopackage drainage Lines (usually delineated center lines) and catchments/subbasins (polygons) in the watershed. 
The attribute tables for both should contain (at least) the following entries for each feature: - - An identifier column (alphanumeric) labeled `model_id`. - - The ID of the next downstream reach/subbasin - - The stream order of each reach/subbasin - - Cumulative upstream drainage area + - An identifier column (alphanumeric) labeled `model_id` + - The ID of the next downstream reach/subbasin labeled `downstream_model_id` + - The stream order of each reach/subbasin labeled `order` + - Cumulative upstream drainage area labeled - The x coordinate of the centroid of each feature (precalculated for faster results later) - The y coordinate of the centroid of each feature (precalculated for faster results later) 2. Geopackage points representing the location of each of the river gauging stations available. The attribute table @@ -48,7 +48,7 @@ working_directory ``` ### 2 Prepare Spatial Data (scripts not provided) -This step instructs you to collect 3 gis files and use them to generate 2 csv tables. All 5 files (3 gis files and 2 +This step instructs you to collect 3 gis files and use them to generate 2 tables. All 5 files (3 gis files and 2 tables) should go in the `gis_inputs` directory 1. Clip model drainage lines and catchments shapefile to extents of the region of interest. @@ -90,12 +90,12 @@ gdf.to_file('/file/path/to/save', driver='GeoJSON') Your table should look like this: -| downstream_model_id | model_id | drainage_area_mod | stream_order | x | y | -|---------------------|-----------------|-------------------|--------------|-----|-----| -| unique_stream_# | unique_stream_# | area in km^2 | stream_order | ## | ## | -| unique_stream_# | unique_stream_# | area in km^2 | stream_order | ## | ## | -| unique_stream_# | unique_stream_# | area in km^2 | stream_order | ## | ## | -| ... | ... | ... | ... | ... | ... | +| downstream_model_id | model_id | model_drain_area | stream_order | x | y | +|---------------------|-----------------|------------------|--------------|-----|-----| +| unique_stream_# | unique_stream_# | area in km^2 | stream_order | ## | ## | +| unique_stream_# | unique_stream_# | area in km^2 | stream_order | ## | ## | +| unique_stream_# | unique_stream_# | area in km^2 | stream_order | ## | ## | +| ... | ... | ... | ... | ... | ... | 1. Prepare a csv of the attribute table of the gauge locations shapefile. - You need the columns: @@ -105,12 +105,12 @@ Your table should look like this: Your table should look like this (column order is irrelevant): -| model_id | drainage_area_obs | gauge_id | -|-------------------|-------------------|------------------| -| unique_stream_num | area in km^2 | unique_gauge_num | -| unique_stream_num | area in km^2 | unique_gauge_num | -| unique_stream_num | area in km^2 | unique_gauge_num | -| ... | ... | ... | +| model_id | gauge_drain_area | gauge_id | +|-------------------|------------------|------------------| +| unique_stream_num | area in km^2 | unique_gauge_num | +| unique_stream_num | area in km^2 | unique_gauge_num | +| unique_stream_num | area in km^2 | unique_gauge_num | +| ... | ... | ... | Your project's working directory now looks like ``` diff --git a/saber/_vocab.py b/saber/_vocab.py index 923c3f1..43d130b 100644 --- a/saber/_vocab.py +++ b/saber/_vocab.py @@ -2,9 +2,10 @@ import glob import pandas as pd +# todo enforce using this module for retrieving names?? 
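As a rough illustration of the attribute tables described above, the drainage-line table could be assembled with `geopandas` along these lines; the input path, the drainage-area column name, and the output location are placeholders rather than part of the package:

```python
import geopandas as gpd
import pandas as pd

# Hypothetical input: the clipped drainage-line geopackage for the region
drain_gdf = gpd.read_file('/path/to/drainageline_subset.gpkg')

# Precompute centroid coordinates so later steps can work without the geometry
drain_gdf['x'] = drain_gdf.geometry.centroid.x
drain_gdf['y'] = drain_gdf.geometry.centroid.y

# Keep only the attribute columns the workflow expects and drop the geometry;
# 'model_drain_area' stands in for whatever the drainage-area field is called
cols = ['model_id', 'downstream_model_id', 'order', 'model_drain_area', 'x', 'y']
pd.DataFrame(drain_gdf[cols]).to_parquet('/path/to/working/directory/tables/drain_table.parquet')
```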
+ # assign table and gis_input file required column names -# mid_col = 'model_id' -mid_col = 'COMID' +mid_col = 'model_id' gid_col = 'gauge_id' asgn_mid_col = 'assigned_model_id' asgn_gid_col = 'assigned_gauge_id' diff --git a/saber/assign.py b/saber/assign.py index 2f29353..9205793 100644 --- a/saber/assign.py +++ b/saber/assign.py @@ -1,16 +1,15 @@ import numpy as np import pandas as pd +from ._propagation import propagate_in_table from ._propagation import walk_downstream from ._propagation import walk_upstream -from ._propagation import propagate_in_table - -from ._vocab import mid_col -from ._vocab import gid_col -from ._vocab import asgn_mid_col from ._vocab import asgn_gid_col -from ._vocab import reason_col +from ._vocab import asgn_mid_col +from ._vocab import gid_col +from ._vocab import mid_col from ._vocab import order_col +from ._vocab import reason_col def gauged(df: pd.DataFrame) -> pd.DataFrame: diff --git a/saber/cluster.py b/saber/cluster.py index f83aa9e..b3c4f96 100755 --- a/saber/cluster.py +++ b/saber/cluster.py @@ -1,50 +1,97 @@ +import glob +import json import math import os -import json -import glob +from collections.abc import Iterable +import joblib import kneed import matplotlib.pyplot as plt import numpy as np import pandas as pd -from sklearn.cluster import KMeans as Clusterer -import joblib from natsort import natsorted +from sklearn.cluster import KMeans as Clusterer +from sklearn.metrics import silhouette_samples from ._vocab import cluster_count_file from ._vocab import mid_col +__all__ = ['generate', 'summarize', 'plot_clusters', 'plot_silhouette', 'merge_assign_table'] -def generate(workdir: str) -> None: + +def generate(workdir: str, max_clusters: int = 12) -> None: """ Trains kmeans clustering models, saves the model as pickle, generates images and supplementary files Args: workdir: path to the project directory + max_clusters: maximum number of clusters to train Returns: None """ - summary = {'number': [], 'inertia': [], 'n_iter': [], 'pseudo_aic': []} - # read the prepared data (array x) x = pd.read_parquet(os.path.join(workdir, 'tables', 'hindcast_fdc_transformed.parquet')) x = x.values - n_feat = x.shape[0] # build the kmeans model for a range of cluster numbers - for n_clusters in range(1, 16): + for n_clusters in range(2, max_clusters + 1): print(n_clusters) kmeans = Clusterer(n_clusters=n_clusters) kmeans.fit_predict(x) joblib.dump(kmeans, os.path.join(workdir, 'kmeans_outputs', f'kmeans-{n_clusters}.pickle')) + return + + +def summarize(workdir: str) -> None: + """ + Generate a summary of the clustering results, calculate the silhouette score, save the centers and labels to parquet + + Args: + workdir: path to the project directory + + Returns: + + """ + summary = {'number': [], 'inertia': [], 'n_iter': [], 'silhouette': []} + silhouette_scores = [] + labels = [] + + # read the prepared data (array x) + x = pd.read_parquet(os.path.join(workdir, 'tables', 'hindcast_fdc_transformed.parquet')) + x = x.values + + for model_file in natsorted(glob.glob(os.path.join(workdir, 'kmeans_outputs', 'kmeans-*.pickle'))): + kmeans = joblib.load(model_file) + n_clusters = int(kmeans.n_clusters) + + # save the cluster centroids to table - columns are the cluster number, rows are the centroid FDC values + pd.DataFrame( + np.transpose(kmeans.cluster_centers_), + columns=np.array(range(n_clusters)).astype(str) + ).to_parquet(os.path.join(workdir, 'kmeans_outputs', f'kmeans-{n_clusters}-centers.parquet')) + + # save the silhouette score for each cluster + 
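The clustering workflow above trains one `KMeans` model per candidate cluster count and compares them with per-sample silhouette scores; a self-contained sketch of that comparison, using random data in place of the transformed flow duration curves, looks like this:

```python
import numpy as np
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_samples

# Random stand-in for the transformed FDC array (rows = river reaches)
rng = np.random.default_rng(0)
x = rng.normal(size=(500, 40))

mean_silhouette = {}
for n_clusters in range(2, 9):
    labels = KMeans(n_clusters=n_clusters).fit_predict(x)
    # Higher mean silhouette -> tighter, better separated clusters
    mean_silhouette[n_clusters] = silhouette_samples(x, labels).mean()

best_k = max(mean_silhouette, key=mean_silhouette.get)
print(f'highest mean silhouette score at k = {best_k}')
```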
silhouette_scores.append(silhouette_samples(x, kmeans.labels_).flatten()) + labels.append(kmeans.labels_.flatten()) + + # save the summary stats from this model summary['number'].append(n_clusters) summary['inertia'].append(kmeans.inertia_) - summary['pseudo_aic'].append(np.log(kmeans.inertia_ / n_feat) + ((n_feat + 2 * n_clusters) / (n_feat))) summary['n_iter'].append(kmeans.n_iter_) - - # save the inertia results as a csv - pd.DataFrame.from_dict(summary).to_csv(os.path.join(workdir, 'kmeans_outputs', f'clustering-summary-stats.csv')) + summary['silhouette'].append(np.mean(silhouette_scores[-1])) + + # save the summary results as a csv + pd.DataFrame.from_dict(summary) \ + .to_csv(os.path.join(workdir, 'kmeans_outputs', f'clustering-summary-stats.csv')) + # save the silhouette scores as a parquet + silhouette_scores = np.transpose(np.array(silhouette_scores)) + pd.DataFrame(silhouette_scores, columns=np.array(range(2, silhouette_scores.shape[1] + 2)).astype(str)) \ + .to_parquet(os.path.join(workdir, 'kmeans_outputs', 'kmeans-silhouette_scores.parquet')) + # save the labels as a parquet + labels = np.transpose(np.array(labels)) + pd.DataFrame(labels, columns=np.array(range(2, labels.shape[1] + 2)).astype(str)) \ + .to_parquet(os.path.join(workdir, 'kmeans_outputs', 'kmeans-labels.parquet')) # find the knee/elbow knee = kneed.KneeLocator(summary['number'], summary['inertia'], curve='convex', direction='decreasing').knee @@ -55,35 +102,52 @@ def generate(workdir: str) -> None: return -def plot(workdir: str) -> None: +def plot_clusters(workdir: str, clusters: int or Iterable = 'all', + max_cols: int = 3, plt_width: int = 3, plt_height: int = 3) -> None: """ Generate figures of the clustered FDC's Args: workdir: path to the project directory + clusters: number of clusters to create figures for + max_cols: maximum number of columns (subplots) in the figure + plt_width: width of each subplot in inches + plt_height: height of each subplot in inches Returns: None """ - # image generating params - img_width = 3 - img_height = 3 - max_cols = 3 - - # read the fdc's x = pd.read_parquet(os.path.join(workdir, 'tables', 'hindcast_fdc_transformed.parquet')).values size = x.shape[1] x_values = np.linspace(0, size, 5) x_ticks = np.linspace(0, 100, 5).astype(int) - for model_pickle in natsorted(glob.glob(os.path.join(workdir, 'kmeans_outputs', 'kmeans-*.pickle'))): - kmeans = joblib.load(model_pickle) + kmeans_dir = os.path.join(workdir, 'kmeans_outputs') + if clusters == 'all': + model_files = natsorted(glob.glob(os.path.join(kmeans_dir, 'kmeans-*.pickle'))) + elif isinstance(clusters, int): + model_files = glob.glob(os.path.join(kmeans_dir, f'kmeans-{clusters}.pickle')) + elif isinstance(clusters, Iterable): + model_files = natsorted([os.path.join(kmeans_dir, f'kmeans-{i}.pickle') for i in clusters]) + else: + raise TypeError('n_clusters should be of type int or an iterable') + + for model_file in model_files: + # todo read the cluster centers from the parquet file instead of the model pickle + kmeans = joblib.load(model_file) n_clusters = int(kmeans.n_clusters) n_cols = min(n_clusters, max_cols) n_rows = math.ceil(n_clusters / n_cols) - fig, axs = plt.subplots(n_rows, n_cols, figsize=(img_width * n_cols + 1, img_height * n_rows + 1), dpi=800, - squeeze=False, tight_layout=True, sharey=True) + fig, axs = plt.subplots( + n_rows, + n_cols, + figsize=(plt_width * n_cols + 1, plt_height * n_rows + 1), + dpi=500, + squeeze=False, + tight_layout=True, + sharey=True + ) fig.suptitle("KMeans FDC Clustering") 
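The best-fit cluster count above comes from `kneed.KneeLocator` applied to the inertia curve; a minimal example of that call with made-up inertia values:

```python
import kneed

# Hypothetical inertia values for k = 1..8: inertia always falls as k grows,
# and the knee is where the marginal improvement levels off
cluster_counts = list(range(1, 9))
inertias = [1000, 420, 230, 150, 120, 105, 95, 90]

knee = kneed.KneeLocator(cluster_counts, inertias, curve='convex', direction='decreasing').knee
print(f'suggested number of clusters: {knee}')
```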
fig.supxlabel('Exceedance Probability (%)') fig.supylabel('Discharge Z-Score') @@ -94,7 +158,7 @@ def plot(workdir: str) -> None: ax.set_xticks(x_values, x_ticks) ax.set_ylim(-2, 4) for j in x[kmeans.labels_ == i]: - ax.plot(j.ravel(), "k-", alpha=.15) + ax.plot(j.ravel(), "k-") ax.plot(kmeans.cluster_centers_[i].flatten(), "r-") # turn off plotting axes which are blank - made for the square grid but > n_clusters for ax in fig.axes[n_clusters:]: @@ -105,7 +169,50 @@ def plot(workdir: str) -> None: return -def summarize(workdir: str, assign_table: pd.DataFrame, n_clusters: int = None) -> pd.DataFrame: +def plot_silhouette(workdir: str, clusters: int or Iterable = 'all', + plt_width: int = 3, plt_height: int = 3) -> None: + """ + Plot the silhouette scores for each cluster. + Based on https://scikit-learn.org/stable/auto_examples/cluster/plot_kmeans_silhouette_analysis.html + + Args: + workdir: path to the project directory + clusters: number of clusters to create figures for + plt_width: width of each subplot in inches + plt_height: height of each subplot in inches + + Returns: + None + """ + kmeans_dir = os.path.join(workdir, 'kmeans_outputs') + if clusters == 'all': + sscore_files = natsorted(glob.glob(os.path.join(kmeans_dir, 'kmeans-*-silscores.parquet'))) + elif isinstance(clusters, int): + sscore_files = glob.glob(os.path.join(kmeans_dir, f'kmeans-{clusters}-silscores.parquet')) + elif isinstance(clusters, Iterable): + sscore_files = natsorted([os.path.join(kmeans_dir, f'kmeans-{i}-silscores.parquet') for i in clusters]) + else: + raise TypeError('n_clusters should be of type int or an iterable') + + for sscore_file in sscore_files: + # todo plot the silhouette scores + n_clusters = int(os.path.basename(sscore_file).split('-')[1]) + centers_df = pd.read_parquet(os.path.join(kmeans_dir, f'kmeans-{n_clusters}-centers.parquet')) + silscores_df = pd.read_parquet(sscore_file) + + for cluster_num in centers_df.columns: + # Create a subplot with 1 row and 2 columns + fig, (ax1, ax2) = plt.subplots( + 1, + 2, + figsize=(plt_width * 2 + 1, plt_height + 1), + dpi=500, + squeeze=False, + tight_layout=True, + ) + + +def merge_assign_table(workdir: str, assign_table: pd.DataFrame, n_clusters: int = None) -> pd.DataFrame: """ Creates a csv listing the streams assigned to each cluster in workdir/kmeans_models and also adds that information to assign_table.csv @@ -118,6 +225,7 @@ def summarize(workdir: str, assign_table: pd.DataFrame, n_clusters: int = None) Returns: None """ + # todo move to assign module if n_clusters is None: # read the cluster results csv with open(os.path.join(workdir, 'kmeans_outputs', cluster_count_file), 'r') as f: @@ -125,7 +233,7 @@ def summarize(workdir: str, assign_table: pd.DataFrame, n_clusters: int = None) # create a dataframe with the optimal model's labels and the model_id's df = pd.DataFrame({ - 'cluster': Clusterer.from_pickle( + 'cluster': joblib.load( os.path.join(workdir, 'kmeans_outputs', f'kmeans-{n_clusters}.pickle')).labels_.flatten(), mid_col: pd.read_parquet(os.path.join(workdir, 'tables', 'model_ids.parquet')).values.flatten() }, dtype=str) diff --git a/saber/gis.py b/saber/gis.py index 2328171..14d8cc3 100644 --- a/saber/gis.py +++ b/saber/gis.py @@ -1,24 +1,23 @@ import os import warnings +import contextily as cx import geopandas as gpd +import matplotlib as mpl +import matplotlib.pyplot as plt import numpy as np import pandas as pd -from ._vocab import mid_col from ._vocab import gid_col -from ._vocab import reason_col from ._vocab import 
metric_nc_name_list - -import matplotlib as mpl -import matplotlib.pyplot as plt -import contextily as cx +from ._vocab import mid_col +from ._vocab import reason_col __all__ = ['generate_all', 'clip_by_assignment', 'clip_by_cluster', 'clip_by_unassigned', 'clip_by_ids', 'validation_maps'] -def generate_all(workdir: str, assign_table: pd.DataFrame, drain_shape: str, prefix: str = '') -> None: +def generate_all(workdir: str, assign_table: pd.DataFrame, drain_gis: str, prefix: str = '') -> None: """ Runs all the clip functions which create subsets of the drainage lines GIS dataset based on how they were assigned for bias correction. @@ -26,33 +25,33 @@ def generate_all(workdir: str, assign_table: pd.DataFrame, drain_shape: str, pre Args: workdir: the path to the working directory for the project assign_table: the assign_table dataframe - drain_shape: path to a drainage line shapefile which can be clipped + drain_gis: path to a drainage line shapefile which can be clipped prefix: a prefix for names of the outputs to distinguish between data generated in separate instances Returns: None """ - clip_by_assignment(workdir, assign_table, drain_shape, prefix) - clip_by_cluster(workdir, assign_table, drain_shape, prefix) - clip_by_unassigned(workdir, assign_table, drain_shape, prefix) + clip_by_assignment(workdir, assign_table, drain_gis, prefix) + clip_by_cluster(workdir, assign_table, drain_gis, prefix) + clip_by_unassigned(workdir, assign_table, drain_gis, prefix) return -def clip_by_assignment(workdir: str, assign_table: pd.DataFrame, drain_shape: str, prefix: str = '') -> None: +def clip_by_assignment(workdir: str, assign_table: pd.DataFrame, drain_gis: str, prefix: str = '') -> None: """ Creates Geopackage files in workdir/gis_outputs for each unique value in the assignment column Args: workdir: the path to the working directory for the project assign_table: the assign_table dataframe - drain_shape: path to a drainage line shapefile which can be clipped + drain_gis: path to a drainage line shapefile which can be clipped prefix: a prefix for names of the outputs to distinguish between data generated at separate instances Returns: None """ # read the drainage line shapefile - dl = gpd.read_file(drain_shape) + dl = gpd.read_file(drain_gis) save_dir = os.path.join(workdir, 'gis_outputs') # get the unique list of assignment reasons @@ -67,20 +66,20 @@ def clip_by_assignment(workdir: str, assign_table: pd.DataFrame, drain_shape: st return -def clip_by_cluster(workdir: str, assign_table: pd.DataFrame, drain_shape: str, prefix: str = '') -> None: +def clip_by_cluster(workdir: str, assign_table: pd.DataFrame, drain_gis: str, prefix: str = '') -> None: """ Creates Geopackage files in workdir/gis_outputs of the drainage lines based on the fdc cluster they were assigned to Args: workdir: the path to the working directory for the project assign_table: the assign_table dataframe - drain_shape: path to a drainage line shapefile which can be clipped + drain_gis: path to a drainage line shapefile which can be clipped prefix: optional, a prefix to prepend to each created file's name Returns: None """ - dl_gdf = gpd.read_file(drain_shape) + dl_gdf = gpd.read_file(drain_gis) for num in sorted(set(assign_table[mid_col].dropna().values)): gdf = dl_gdf[dl_gdf[mid_col].isin(assign_table[assign_table[mid_col] == num][mid_col])] if gdf.empty: @@ -89,20 +88,20 @@ def clip_by_cluster(workdir: str, assign_table: pd.DataFrame, drain_shape: str, return -def clip_by_unassigned(workdir: str, assign_table: pd.DataFrame, 
drain_shape: str, prefix: str = '') -> None: +def clip_by_unassigned(workdir: str, assign_table: pd.DataFrame, drain_gis: str, prefix: str = '') -> None: """ Creates Geopackage files in workdir/gis_outputs of the drainage lines which haven't been assigned a gauge yet Args: workdir: the path to the working directory for the project assign_table: the assign_table dataframe - drain_shape: path to a drainage line shapefile which can be clipped + drain_gis: path to a drainage line shapefile which can be clipped prefix: optional, a prefix to prepend to each created file's name Returns: None """ - dl_gdf = gpd.read_file(drain_shape) + dl_gdf = gpd.read_file(drain_gis) ids = assign_table[assign_table[reason_col].isna()][mid_col].values subset = dl_gdf[dl_gdf[mid_col].isin(ids)] if subset.empty: @@ -113,29 +112,28 @@ def clip_by_unassigned(workdir: str, assign_table: pd.DataFrame, drain_shape: st return -def clip_by_ids(workdir: str, ids: list, drain_shape: str, prefix: str = '', - id_column: str = mid_col) -> None: +def clip_by_ids(workdir: str, ids: list, drain_gis: str, prefix: str = '', id_column: str = mid_col) -> None: """ Creates Geopackage files in workdir/gis_outputs of the subset of 'drain_shape' with an ID in the specified list Args: workdir: path to the project directory ids: any iterable containing a series of model_ids - drain_shape: path to the drainage shapefile to be clipped + drain_gis: path to the drainage shapefile to be clipped prefix: optional, a prefix to prepend to each created file's name id_column: name of the id column in the attributes of the shape table Returns: None """ - dl = gpd.read_file(drain_shape) + dl = gpd.read_file(drain_gis) save_dir = os.path.join(workdir, 'gis_outputs') name = f'{prefix}{"_" if prefix else ""}id_subset.gpkg' dl[dl[id_column].isin(ids)].to_file(os.path.join(save_dir, name)) return -def validation_maps(workdir: str, gauge_shape: str, val_table: pd.DataFrame = None, prefix: str = '') -> None: +def validation_maps(workdir: str, gauge_gis: str, val_table: pd.DataFrame = None, prefix: str = '') -> None: """ Creates Geopackage files in workdir/gis_outputs of subsets of the gauge_shape. 1 is the fill gauge shape with added attribute columns for all the computed stats. 
There are 2 for each of the 5 @@ -145,7 +143,7 @@ def validation_maps(workdir: str, gauge_shape: str, val_table: pd.DataFrame = No Args: workdir: path to the project directory val_table: the validation table produced by saber.validate - gauge_shape: path to the gauge locations shapefile + gauge_gis: path to the gauge locations shapefile prefix: optional, a prefix to prepend to each created file's name Returns: @@ -156,7 +154,7 @@ def validation_maps(workdir: str, gauge_shape: str, val_table: pd.DataFrame = No save_dir = os.path.join(workdir, 'gis_outputs') # merge gauge table with the validation table - gdf = gpd.read_file(gauge_shape) + gdf = gpd.read_file(gauge_gis) gdf = gdf.merge(val_table, on=gid_col, how='inner') gdf.to_file(os.path.join(save_dir, 'gauges_with_validation_stats.gpkg')) @@ -230,8 +228,7 @@ def histomaps(gdf: gpd.GeoDataFrame, metric: str, prct: str, workdir: str) -> No axm.set_xlabel('Longitude') axm.set_xticks([]) axm.set_yticks([]) - gdf[core_columns + [metric, ]].to_crs(epsg=3857).plot( - metric, ax=axm, cmap=cmap, norm=norm, legend=True, markersize=10) + gdf[core_columns + [metric, ]].to_crs(epsg=3857).plot(metric) cx.add_basemap(ax=axm, zoom=9, source=cx.providers.Esri.WorldTopoMap, attribution='') fig.savefig(os.path.join(workdir, 'gis_outputs', f'{metric}_{prct}.png')) diff --git a/saber/prep.py b/saber/prep.py index a40f332..c0af844 100755 --- a/saber/prep.py +++ b/saber/prep.py @@ -1,15 +1,15 @@ import os -import pandas as pd import geopandas as gpd import netCDF4 as nc import numpy as np +import pandas as pd from sklearn.preprocessing import StandardScaler as Scalar +from ._vocab import get_table_path +from ._vocab import guess_hindcast_path from ._vocab import mid_col from ._vocab import read_drain_table -from ._vocab import guess_hindcast_path -from ._vocab import get_table_path __all__ = ['gis_tables', 'hindcast', 'workdir'] @@ -27,10 +27,16 @@ def gis_tables(workdir: str, gauge_gis: str = None, drain_gis: str = None) -> No None """ if gauge_gis is not None: - pd.DataFrame(gpd.read_file(gauge_gis).drop('geometry', axis=1)).to_parquet( - os.path.join(workdir, 'tables', 'gauge_table.parquet')) + if drain_gis.endswith('.parquet'): + gdf = gpd.read_parquet(gauge_gis) + else: + gdf = gpd.read_file(gauge_gis) + pd.DataFrame(gdf.drop('geometry', axis=1)).to_parquet(os.path.join(workdir, 'tables', 'gauge_table.parquet')) if drain_gis is not None: - gdf = gpd.read_file(drain_gis) + if drain_gis.endswith('.parquet'): + gdf = gpd.read_parquet(drain_gis) + else: + gdf = gpd.read_file(drain_gis) gdf['centroid_x'] = gdf.geometry.centroid.x gdf['centroid_y'] = gdf.geometry.centroid.y gdf = gdf.drop('geometry', axis=1) diff --git a/saber/table.py b/saber/table.py index f1aafe4..a1eaae5 100644 --- a/saber/table.py +++ b/saber/table.py @@ -3,9 +3,9 @@ import numpy as np import pandas as pd -from ._vocab import mid_col -from ._vocab import asgn_mid_col from ._vocab import asgn_gid_col +from ._vocab import asgn_mid_col +from ._vocab import mid_col from ._vocab import reason_col __all__ = ['get_path', 'read', 'cache', 'gen'] diff --git a/saber/validate.py b/saber/validate.py index c27198f..1de3c9b 100644 --- a/saber/validate.py +++ b/saber/validate.py @@ -2,15 +2,14 @@ import os import shutil -import pandas as pd import netCDF4 as nc import numpy as np +import pandas as pd -from .prep import workdir -from ._vocab import mid_col +from ._vocab import cal_nc_name from ._vocab import gid_col from ._vocab import metric_nc_name_list -from ._vocab import cal_nc_name +from ._vocab 
import mid_col def sample_gauges(workdir: str, overwrite: bool = False) -> None: From b03b98a0c92cb72aa7175a0d79665cb044857136 Mon Sep 17 00:00:00 2001 From: rileyhales Date: Thu, 25 Aug 2022 21:24:33 -0600 Subject: [PATCH 15/24] add silhouette plots --- saber/cluster.py | 94 +++++++++++++++++++++++++++++++----------------- 1 file changed, 62 insertions(+), 32 deletions(-) diff --git a/saber/cluster.py b/saber/cluster.py index b3c4f96..400f9ae 100755 --- a/saber/cluster.py +++ b/saber/cluster.py @@ -7,10 +7,11 @@ import joblib import kneed import matplotlib.pyplot as plt +import matplotlib.cm as cm import numpy as np import pandas as pd from natsort import natsorted -from sklearn.cluster import KMeans as Clusterer +from sklearn.cluster import KMeans from sklearn.metrics import silhouette_samples from ._vocab import cluster_count_file @@ -37,7 +38,7 @@ def generate(workdir: str, max_clusters: int = 12) -> None: # build the kmeans model for a range of cluster numbers for n_clusters in range(2, max_clusters + 1): print(n_clusters) - kmeans = Clusterer(n_clusters=n_clusters) + kmeans = KMeans(n_clusters=n_clusters) kmeans.fit_predict(x) joblib.dump(kmeans, os.path.join(workdir, 'kmeans_outputs', f'kmeans-{n_clusters}.pickle')) return @@ -69,7 +70,7 @@ def summarize(workdir: str) -> None: pd.DataFrame( np.transpose(kmeans.cluster_centers_), columns=np.array(range(n_clusters)).astype(str) - ).to_parquet(os.path.join(workdir, 'kmeans_outputs', f'kmeans-{n_clusters}-centers.parquet')) + ).to_parquet(os.path.join(workdir, 'kmeans_outputs', f'kmeans-centers-{n_clusters}.parquet')) # save the silhouette score for each cluster silhouette_scores.append(silhouette_samples(x, kmeans.labels_).flatten()) @@ -153,7 +154,7 @@ def plot_clusters(workdir: str, clusters: int or Iterable = 'all', fig.supylabel('Discharge Z-Score') for i, ax in enumerate(fig.axes[:n_clusters]): - ax.set_title(f'Cluster {i + 1}') + ax.set_title(f'Cluster {i + 1} (n = {np.sum(kmeans.labels_ == i)})') ax.set_xlim(0, size) ax.set_xticks(x_values, x_ticks) ax.set_ylim(-2, 4) @@ -164,52 +165,81 @@ def plot_clusters(workdir: str, clusters: int or Iterable = 'all', for ax in fig.axes[n_clusters:]: ax.axis('off') - fig.savefig(os.path.join(workdir, 'kmeans_outputs', f'kmeans-{n_clusters}.png')) + fig.savefig(os.path.join(workdir, 'kmeans_outputs', f'kmeans-clusters-{n_clusters}.png')) plt.close(fig) return -def plot_silhouette(workdir: str, clusters: int or Iterable = 'all', - plt_width: int = 3, plt_height: int = 3) -> None: +def plot_silhouette(workdir: str, plt_width: int = 3, plt_height: int = 3) -> None: """ Plot the silhouette scores for each cluster. 
Based on https://scikit-learn.org/stable/auto_examples/cluster/plot_kmeans_silhouette_analysis.html Args: workdir: path to the project directory - clusters: number of clusters to create figures for plt_width: width of each subplot in inches plt_height: height of each subplot in inches Returns: None """ - kmeans_dir = os.path.join(workdir, 'kmeans_outputs') - if clusters == 'all': - sscore_files = natsorted(glob.glob(os.path.join(kmeans_dir, 'kmeans-*-silscores.parquet'))) - elif isinstance(clusters, int): - sscore_files = glob.glob(os.path.join(kmeans_dir, f'kmeans-{clusters}-silscores.parquet')) - elif isinstance(clusters, Iterable): - sscore_files = natsorted([os.path.join(kmeans_dir, f'kmeans-{i}-silscores.parquet') for i in clusters]) - else: - raise TypeError('n_clusters should be of type int or an iterable') + labels_df = pd.read_parquet(os.path.join(workdir, 'kmeans_outputs', 'kmeans-labels.parquet')) + silhouette_df = pd.read_parquet(os.path.join(workdir, 'kmeans_outputs', 'kmeans-silhouette_scores.parquet')) + + for tot_clusters in silhouette_df.columns: + centers_df = pd.read_parquet(os.path.join(workdir, 'kmeans_outputs', f'kmeans-{tot_clusters}-centers.parquet')) + # Create a subplot with 1 row and 2 columns + fig, (ax1, ax2) = plt.subplots( + nrows=1, + ncols=2, + figsize=(plt_width * 2 + 1, plt_height + 1), + dpi=500, + tight_layout=True, + ) - for sscore_file in sscore_files: - # todo plot the silhouette scores - n_clusters = int(os.path.basename(sscore_file).split('-')[1]) - centers_df = pd.read_parquet(os.path.join(kmeans_dir, f'kmeans-{n_clusters}-centers.parquet')) - silscores_df = pd.read_parquet(sscore_file) - - for cluster_num in centers_df.columns: - # Create a subplot with 1 row and 2 columns - fig, (ax1, ax2) = plt.subplots( - 1, - 2, - figsize=(plt_width * 2 + 1, plt_height + 1), - dpi=500, - squeeze=False, - tight_layout=True, + ax1.set_title("Silhouette Plot") + ax1.set_xlabel("Silhouette Score") + ax1.set_ylabel("Cluster Label") + ax1.set_yticks([]) # Clear the yaxis labels / ticks + # ax1.set_xticks([-0.1, 0, 0.2, 0.4, 0.6, 0.8, 1]) + + ax2.set_title("Cluster Centers") + ax2.set_xlabel("Exceedance Probability (%)") + ax2.set_ylabel("Discharge Z-Score") + + # The vertical line for average silhouette score of all the values + ax1.axvline(x=silhouette_df[tot_clusters].mean(), color="red", linestyle="--") + + y_lower = 10 + for sub_cluster in range(int(tot_clusters)): + # select the rows applicable to the current sub cluster + cluster_silhouettes = silhouette_df[labels_df[tot_clusters] == sub_cluster][tot_clusters].values.flatten() + cluster_silhouettes.sort() + + n = cluster_silhouettes.shape[0] + y_upper = y_lower + n + + color = cm.nipy_spectral(sub_cluster / int(tot_clusters)) + ax1.fill_betweenx( + np.arange(y_lower, y_upper), + 0, + cluster_silhouettes, + facecolor=color, + edgecolor=color, + alpha=0.7, ) + # Label the silhouette plots with their cluster numbers at the middle + ax1.text(-0.05, y_lower + 0.5 * n, str(sub_cluster + 1)) + + # plot the cluster center + ax2.plot(centers_df[f'{sub_cluster}'].values, alpha=0.7, c=color, label=f'Cluster {sub_cluster + 1}') + + # add some buffer before the next cluster + y_lower = y_upper + 10 + + fig.savefig(os.path.join(workdir, 'kmeans_outputs', f'kmeans-silhouettes-{tot_clusters}.png')) + plt.close(fig) + return def merge_assign_table(workdir: str, assign_table: pd.DataFrame, n_clusters: int = None) -> pd.DataFrame: From 0fffc2c8ed994a9a203ea459bc4e6d011ac5c2ee Mon Sep 17 00:00:00 2001 From: rileyhales Date: Fri, 
26 Aug 2022 00:05:52 -0600 Subject: [PATCH 16/24] merge assign and table modules --- examples/saber_example.py | 8 +-- saber/__init__.py | 15 ++--- saber/assign.py | 99 -------------------------------- saber/cluster.py | 40 ++----------- saber/table.py | 118 +++++++++++++++++++++++++++++++++++++- 5 files changed, 132 insertions(+), 148 deletions(-) delete mode 100644 saber/assign.py diff --git a/examples/saber_example.py b/examples/saber_example.py index 44c57a3..843f349 100644 --- a/examples/saber_example.py +++ b/examples/saber_example.py @@ -25,13 +25,13 @@ print('Generate Clusters') saber.cluster.generate(workdir) saber.cluster.plot(workdir) -saber.cluster.summarize(workdir, assign_table) # Assign basins which are gauged and propagate those gauges print('Making Assignments') -assign_table = saber.assign.gauged(assign_table) -assign_table = saber.assign.propagation(assign_table) -assign_table = saber.assign.clusters_by_dist(assign_table) +assign_table = saber.table.merge_clusters(workdir, assign_table, n_clusters=5) +assign_table = saber.table.assign_gauged(assign_table) +assign_table = saber.table.assign_propagation(assign_table) +assign_table = saber.table.assign_by_distance(assign_table) # Cache the assignments table with the updates saber.table.cache(workdir, assign_table) diff --git a/saber/__init__.py b/saber/__init__.py index d6a5a09..d05d5dc 100644 --- a/saber/__init__.py +++ b/saber/__init__.py @@ -1,19 +1,14 @@ -from saber._calibrate import calibrate, calibrate_region - -import saber.table -import saber.prep +import saber.analysis import saber.cluster -import saber.assign import saber.gis +import saber.prep +import saber.table import saber.utils import saber.validate -import saber.analysis - -import saber._vocab - +from saber._calibrate import calibrate, calibrate_region __all__ = ['calibrate', 'calibrate_region', - 'table', 'prep', 'assign', 'gis', 'cluster', 'utils', 'validate', 'analysis'] + 'table', 'prep', 'gis', 'cluster', 'utils', 'validate', 'analysis'] __author__ = 'Riley Hales' __version__ = '0.4.0' __license__ = 'BSD 3 Clause Clear' diff --git a/saber/assign.py b/saber/assign.py deleted file mode 100644 index 9205793..0000000 --- a/saber/assign.py +++ /dev/null @@ -1,99 +0,0 @@ -import numpy as np -import pandas as pd - -from ._propagation import propagate_in_table -from ._propagation import walk_downstream -from ._propagation import walk_upstream -from ._vocab import asgn_gid_col -from ._vocab import asgn_mid_col -from ._vocab import gid_col -from ._vocab import mid_col -from ._vocab import order_col -from ._vocab import reason_col - - -def gauged(df: pd.DataFrame) -> pd.DataFrame: - """ - Assigns basins a gauge for correction which contain a gauge - - Args: - df: the assignments table dataframe - - Returns: - Copy of df with assignments made - """ - _df = df.copy() - selector = ~_df[gid_col].isna() - _df.loc[selector, asgn_mid_col] = _df[mid_col] - _df.loc[selector, asgn_gid_col] = _df[gid_col] - _df.loc[selector, reason_col] = 'gauged' - return _df - - -def propagation(df: pd.DataFrame, max_prop: int = 5) -> pd.DataFrame: - """ - - Args: - df: the assignments table dataframe - max_prop: the max number of stream segments to propagate downstream - - Returns: - Copy of df with assignments made - """ - _df = df.copy() - for gauged_stream in _df.loc[~_df[gid_col].isna(), mid_col]: - subset = _df.loc[_df[mid_col] == gauged_stream, gid_col] - if subset.empty: - continue - start_gid = subset.values[0] - connected_segments = walk_upstream(df, gauged_stream, 
same_order=True) - _df = propagate_in_table(_df, gauged_stream, start_gid, connected_segments, max_prop, 'upstream') - connected_segments = walk_downstream(df, gauged_stream, same_order=True) - _df = propagate_in_table(_df, gauged_stream, start_gid, connected_segments, max_prop, 'downstream') - - return _df - - -def clusters_by_dist(df: pd.DataFrame) -> pd.DataFrame: - """ - Assigns all possible ungauged basins a gauge that is - (1) is closer than any other gauge - (2) is of same stream order as ungauged basin - (3) in the same simulated fdc cluster as ungauged basin - - Args: - df: the assignments table dataframe - - Returns: - Copy of df with assignments made - """ - _df = df.copy() - # first filter by cluster number - for c_num in sorted(set(_df['sim-fdc-cluster'].values)): - c_sub = _df[_df['sim-fdc-cluster'] == c_num] - # next filter by stream order - for so_num in sorted(set(c_sub[order_col])): - c_so_sub = c_sub[c_sub[order_col] == so_num] - - # determine which ids **need** to be assigned - ids_to_assign = c_so_sub[c_so_sub[asgn_mid_col].isna()][mid_col].values - avail_assigns = c_so_sub[c_so_sub[asgn_mid_col].notna()] - if ids_to_assign.size == 0 or avail_assigns.empty: - print(f'unable to assign cluster {c_num} to stream order {so_num}') - continue - # now you find the closest gauge to each unassigned - for id in ids_to_assign: - subset = c_so_sub.loc[c_so_sub[mid_col] == id, ['x', 'y']] - - dx = avail_assigns.x.values - subset.x.values - dy = avail_assigns.y.values - subset.y.values - avail_assigns['dist'] = np.sqrt(dx * dx + dy * dy) - row_idx_to_assign = avail_assigns['dist'].idxmin() - - mid_to_assign = avail_assigns.loc[row_idx_to_assign].assigned_model_id - gid_to_assign = avail_assigns.loc[row_idx_to_assign].assigned_gauge_id - - _df.loc[_df[mid_col] == id, [asgn_mid_col, asgn_gid_col, reason_col]] = \ - [mid_to_assign, gid_to_assign, f'cluster-{c_num}-dist'] - - return _df diff --git a/saber/cluster.py b/saber/cluster.py index 400f9ae..fb002af 100755 --- a/saber/cluster.py +++ b/saber/cluster.py @@ -65,6 +65,7 @@ def summarize(workdir: str) -> None: for model_file in natsorted(glob.glob(os.path.join(workdir, 'kmeans_outputs', 'kmeans-*.pickle'))): kmeans = joblib.load(model_file) n_clusters = int(kmeans.n_clusters) + print(n_clusters) # save the cluster centroids to table - columns are the cluster number, rows are the centroid FDC values pd.DataFrame( @@ -161,7 +162,7 @@ def plot_clusters(workdir: str, clusters: int or Iterable = 'all', for j in x[kmeans.labels_ == i]: ax.plot(j.ravel(), "k-") ax.plot(kmeans.cluster_centers_[i].flatten(), "r-") - # turn off plotting axes which are blank - made for the square grid but > n_clusters + # turn off plotting axes which are blank (when ax number > n_clusters) for ax in fig.axes[n_clusters:]: ax.axis('off') @@ -187,8 +188,7 @@ def plot_silhouette(workdir: str, plt_width: int = 3, plt_height: int = 3) -> No silhouette_df = pd.read_parquet(os.path.join(workdir, 'kmeans_outputs', 'kmeans-silhouette_scores.parquet')) for tot_clusters in silhouette_df.columns: - centers_df = pd.read_parquet(os.path.join(workdir, 'kmeans_outputs', f'kmeans-{tot_clusters}-centers.parquet')) - # Create a subplot with 1 row and 2 columns + centers_df = pd.read_parquet(os.path.join(workdir, 'kmeans_outputs', f'kmeans-centers-{tot_clusters}.parquet')) fig, (ax1, ax2) = plt.subplots( nrows=1, ncols=2, @@ -197,12 +197,14 @@ def plot_silhouette(workdir: str, plt_width: int = 3, plt_height: int = 3) -> No tight_layout=True, ) + # Plot 1 titles and labels 
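The distance-based assignment above chooses, for each ungauged reach, the nearest gauged reach of the same stream order and cluster using the precomputed centroid coordinates; the core lookup reduces to a Euclidean nearest-neighbour search, sketched here with made-up values and the assignment-table column names:

```python
import numpy as np
import pandas as pd

# Hypothetical gauged reaches already filtered to one cluster and stream order
gauged = pd.DataFrame({
    'assigned_model_id': ['101', '102'],
    'assigned_gauge_id': ['g-101', 'g-102'],
    'x': [0.0, 10.0],
    'y': [0.0, 0.0],
})

# Centroid of the ungauged reach to be assigned
ungauged_x, ungauged_y = 7.0, 1.0

# Euclidean distance in the (projected) centroid coordinates
dx = gauged['x'].values - ungauged_x
dy = gauged['y'].values - ungauged_y
nearest = gauged.iloc[int(np.argmin(np.sqrt(dx * dx + dy * dy)))]
print(nearest['assigned_model_id'], nearest['assigned_gauge_id'])  # closest gauge wins: 102 g-102
```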
ax1.set_title("Silhouette Plot") ax1.set_xlabel("Silhouette Score") ax1.set_ylabel("Cluster Label") ax1.set_yticks([]) # Clear the yaxis labels / ticks - # ax1.set_xticks([-0.1, 0, 0.2, 0.4, 0.6, 0.8, 1]) + ax1.set_xticks([-0.2, 0, 0.2, 0.4, 0.6, 0.8, 1]) + # Plot 2 titles and labels ax2.set_title("Cluster Centers") ax2.set_xlabel("Exceedance Probability (%)") ax2.set_ylabel("Discharge Z-Score") @@ -240,33 +242,3 @@ def plot_silhouette(workdir: str, plt_width: int = 3, plt_height: int = 3) -> No fig.savefig(os.path.join(workdir, 'kmeans_outputs', f'kmeans-silhouettes-{tot_clusters}.png')) plt.close(fig) return - - -def merge_assign_table(workdir: str, assign_table: pd.DataFrame, n_clusters: int = None) -> pd.DataFrame: - """ - Creates a csv listing the streams assigned to each cluster in workdir/kmeans_models and also adds that information - to assign_table.csv - - Args: - workdir: path to the project directory - assign_table: the assignment table DataFrame - n_clusters: number of clusters to use when applying the labels to the assign_table - - Returns: - None - """ - # todo move to assign module - if n_clusters is None: - # read the cluster results csv - with open(os.path.join(workdir, 'kmeans_outputs', cluster_count_file), 'r') as f: - n_clusters = int(json.loads(f.read())['historical']) - - # create a dataframe with the optimal model's labels and the model_id's - df = pd.DataFrame({ - 'cluster': joblib.load( - os.path.join(workdir, 'kmeans_outputs', f'kmeans-{n_clusters}.pickle')).labels_.flatten(), - mid_col: pd.read_parquet(os.path.join(workdir, 'tables', 'model_ids.parquet')).values.flatten() - }, dtype=str) - - # merge the dataframes - return assign_table.merge(df, how='outer', on=mid_col) diff --git a/saber/table.py b/saber/table.py index a1eaae5..9edaa33 100644 --- a/saber/table.py +++ b/saber/table.py @@ -1,14 +1,21 @@ import os +import joblib import numpy as np import pandas as pd +from ._propagation import propagate_in_table +from ._propagation import walk_downstream +from ._propagation import walk_upstream from ._vocab import asgn_gid_col from ._vocab import asgn_mid_col +from ._vocab import gid_col from ._vocab import mid_col +from ._vocab import order_col from ._vocab import reason_col -__all__ = ['get_path', 'read', 'cache', 'gen'] +__all__ = ['get_path', 'gen', 'read', 'cache', 'merge_clusters', 'assign_gauged', 'assign_propagation', + 'assign_by_distance', ] def get_path(workdir: str) -> str: @@ -78,3 +85,112 @@ def cache(workdir: str, assign_table: pd.DataFrame) -> None: assign_table.to_parquet(get_path(workdir)) return + +def merge_clusters(workdir: str, assign_table: pd.DataFrame, n_clusters: int = None) -> pd.DataFrame: + """ + Creates a csv listing the streams assigned to each cluster in workdir/kmeans_models and also adds that information + to assign_table.csv + + Args: + workdir: path to the project directory + assign_table: the assignment table DataFrame + n_clusters: number of clusters to use when applying the labels to the assign_table + + Returns: + None + """ + # create a dataframe with the optimal model's labels and the model_id's + df = pd.DataFrame({ + 'cluster': joblib.load(os.path.join(workdir, 'kmeans_outputs', f'kmeans-{n_clusters}.pickle')).labels_, + mid_col: pd.read_parquet(os.path.join(workdir, 'tables', 'model_ids.parquet')).values.flatten() + }, dtype=str) + + # merge the dataframes + return assign_table.merge(df, how='outer', on=mid_col) + + +def assign_gauged(df: pd.DataFrame) -> pd.DataFrame: + """ + Assigns basins a gauge for correction which 
contain a gauge + + Args: + df: the assignments table dataframe + + Returns: + Copy of df with assignments made + """ + _df = df.copy() + selector = ~_df[gid_col].isna() + _df.loc[selector, asgn_mid_col] = _df[mid_col] + _df.loc[selector, asgn_gid_col] = _df[gid_col] + _df.loc[selector, reason_col] = 'gauged' + return _df + + +def assign_propagation(df: pd.DataFrame, max_prop: int = 5) -> pd.DataFrame: + """ + + Args: + df: the assignments table dataframe + max_prop: the max number of stream segments to propagate downstream + + Returns: + Copy of df with assignments made + """ + _df = df.copy() + for gauged_stream in _df.loc[~_df[gid_col].isna(), mid_col]: + subset = _df.loc[_df[mid_col] == gauged_stream, gid_col] + if subset.empty: + continue + start_gid = subset.values[0] + connected_segments = walk_upstream(df, gauged_stream, same_order=True) + _df = propagate_in_table(_df, gauged_stream, start_gid, connected_segments, max_prop, 'upstream') + connected_segments = walk_downstream(df, gauged_stream, same_order=True) + _df = propagate_in_table(_df, gauged_stream, start_gid, connected_segments, max_prop, 'downstream') + + return _df + + +def assign_by_distance(df: pd.DataFrame) -> pd.DataFrame: + """ + Assigns all possible ungauged basins a gauge that is + (1) is closer than any other gauge + (2) is of same stream order as ungauged basin + (3) in the same simulated fdc cluster as ungauged basin + + Args: + df: the assignments table dataframe + + Returns: + Copy of df with assignments made + """ + _df = df.copy() + # first filter by cluster number + for c_num in sorted(set(_df['sim-fdc-cluster'].values)): + c_sub = _df[_df['sim-fdc-cluster'] == c_num] + # next filter by stream order + for so_num in sorted(set(c_sub[order_col])): + c_so_sub = c_sub[c_sub[order_col] == so_num] + + # determine which ids **need** to be assigned + ids_to_assign = c_so_sub[c_so_sub[asgn_mid_col].isna()][mid_col].values + avail_assigns = c_so_sub[c_so_sub[asgn_mid_col].notna()] + if ids_to_assign.size == 0 or avail_assigns.empty: + print(f'unable to assign cluster {c_num} to stream order {so_num}') + continue + # now you find the closest gauge to each unassigned + for id in ids_to_assign: + subset = c_so_sub.loc[c_so_sub[mid_col] == id, ['x', 'y']] + + dx = avail_assigns.x.values - subset.x.values + dy = avail_assigns.y.values - subset.y.values + avail_assigns['dist'] = np.sqrt(dx * dx + dy * dy) + row_idx_to_assign = avail_assigns['dist'].idxmin() + + mid_to_assign = avail_assigns.loc[row_idx_to_assign].assigned_model_id + gid_to_assign = avail_assigns.loc[row_idx_to_assign].assigned_gauge_id + + _df.loc[_df[mid_col] == id, [asgn_mid_col, asgn_gid_col, reason_col]] = \ + [mid_to_assign, gid_to_assign, f'cluster-{c_num}-dist'] + + return _df From c57a5fdb6d2ef314ccad24ec5b26efb169a3ddf2 Mon Sep 17 00:00:00 2001 From: rileyhales Date: Fri, 26 Aug 2022 00:09:40 -0600 Subject: [PATCH 17/24] add joblib to dependency lists --- conda-environment.yml | 1 + requirements.txt | 3 ++- saber/cluster.py | 5 ++--- 3 files changed, 5 insertions(+), 4 deletions(-) diff --git a/conda-environment.yml b/conda-environment.yml index 5111979..725c053 100644 --- a/conda-environment.yml +++ b/conda-environment.yml @@ -17,3 +17,4 @@ dependencies: - scipy - natsort - scikit-learn + - joblib diff --git a/requirements.txt b/requirements.txt index 7d43255..67c3de9 100644 --- a/requirements.txt +++ b/requirements.txt @@ -11,4 +11,5 @@ matplotlib requests scipy natsort -scikit-learn \ No newline at end of file +scikit-learn +joblib \ No newline 
at end of file diff --git a/saber/cluster.py b/saber/cluster.py index fb002af..e93a1c6 100755 --- a/saber/cluster.py +++ b/saber/cluster.py @@ -6,8 +6,8 @@ import joblib import kneed -import matplotlib.pyplot as plt import matplotlib.cm as cm +import matplotlib.pyplot as plt import numpy as np import pandas as pd from natsort import natsorted @@ -15,9 +15,8 @@ from sklearn.metrics import silhouette_samples from ._vocab import cluster_count_file -from ._vocab import mid_col -__all__ = ['generate', 'summarize', 'plot_clusters', 'plot_silhouette', 'merge_assign_table'] +__all__ = ['generate', 'summarize', 'plot_clusters', 'plot_silhouette'] def generate(workdir: str, max_clusters: int = 12) -> None: From 5198270e1acb4a73e014bea728b9c834456456c2 Mon Sep 17 00:00:00 2001 From: rileyhales Date: Fri, 26 Aug 2022 00:11:03 -0600 Subject: [PATCH 18/24] drop analysis module with outdated plot function --- saber/__init__.py | 3 +- saber/analysis.py | 72 ----------------------------------------------- 2 files changed, 1 insertion(+), 74 deletions(-) delete mode 100644 saber/analysis.py diff --git a/saber/__init__.py b/saber/__init__.py index d05d5dc..1b0ea72 100644 --- a/saber/__init__.py +++ b/saber/__init__.py @@ -1,4 +1,3 @@ -import saber.analysis import saber.cluster import saber.gis import saber.prep @@ -8,7 +7,7 @@ from saber._calibrate import calibrate, calibrate_region __all__ = ['calibrate', 'calibrate_region', - 'table', 'prep', 'gis', 'cluster', 'utils', 'validate', 'analysis'] + 'table', 'prep', 'gis', 'cluster', 'utils', 'validate'] __author__ = 'Riley Hales' __version__ = '0.4.0' __license__ = 'BSD 3 Clause Clear' diff --git a/saber/analysis.py b/saber/analysis.py deleted file mode 100644 index 582fa3d..0000000 --- a/saber/analysis.py +++ /dev/null @@ -1,72 +0,0 @@ -import os - -import pandas as pd -import plotly.graph_objects as go - - -def plot(workdir, obs_data_dir, model_id) -> go.Figure: - files = [os.path.join(workdir, 'calibrated_simulated_flow.nc'), ] - variables = ('flow_sim', 'flow_bc') - dim_order = ('time', 'model_id') - a = grids.TimeSeries(files, variables, dim_order) - ts = a.point(None, model_id) - obs = pd.read_csv(os.path.join(obs_data_dir, )) - a = go.Figure( - [ - go.Scatter( - name='Original Simulation', - x=ts.index, - y=ts['flow_sim'] - ), - go.Scatter( - name='Adjusted Simulation', - x=ts.index, - y=ts['flow_bc'] - ), - go.Scatter( - name='Adjusted Simulation', - x=ts.index, - y=ts['flow_bc'] - ) - ] - ) - a.show() - return - - -def plot_results(sim, obs, bc, bcp, title): - sim_dates = sim.index.tolist() - scatters = [ - go.Scatter( - name='Propagated Corrected (Experimental)', - x=bcp.index.tolist(), - y=bcp['Propagated Corrected Streamflow'].values.flatten(), - ), - go.Scatter( - name='Bias Corrected (Jorges Method)', - x=sim_dates, - y=bc.values.flatten(), - ), - go.Scatter( - name='Simulated (ERA 5)', - x=sim_dates, - y=sim.values.flatten(), - ), - go.Scatter( - name='Observed', - x=obs.index.tolist(), - y=obs.values.flatten(), - ), - go.Scatter( - name='Percentile', - x=sim_dates, - y=bcp['Percentile'].values.flatten(), - ), - go.Scatter( - name='Scalar', - x=sim_dates, - y=bcp['Scalars'].values.flatten(), - ), - ] - a = go.Figure(scatters, layout={'title': title}) - return a From 1578cc726e1fd54c9ed819cdbc962583af483831 Mon Sep 17 00:00:00 2001 From: rileyhales Date: Sat, 27 Aug 2022 20:41:56 -0600 Subject: [PATCH 19/24] use the vocab module for table names, paths, formats --- saber/_calibrate.py | 5 ++-- saber/_vocab.py | 70 
+++++++++++++++++++++++++++++---------------- saber/cluster.py | 35 +++++++++++------------ saber/gis.py | 12 ++++---- saber/prep.py | 35 +++++++++++------------ saber/table.py | 6 ++-- saber/validate.py | 18 ++++++------ 7 files changed, 100 insertions(+), 81 deletions(-) diff --git a/saber/_calibrate.py b/saber/_calibrate.py index 4b848dd..037a642 100644 --- a/saber/_calibrate.py +++ b/saber/_calibrate.py @@ -15,7 +15,7 @@ from ._vocab import asgn_gid_col from ._vocab import metric_list from ._vocab import metric_nc_name_list -from ._vocab import hindcast_table +from ._vocab import table_hindcast from ._vocab import cal_nc_name @@ -171,13 +171,14 @@ def calibrate_region(workdir: str, assign_table: pd.DataFrame, Returns: None """ + # todo create a parquet instead of a netcdf if gauge_table is None: gauge_table = pd.read_csv(os.path.join(workdir, 'gis_inputs', 'gauge_table.csv')) if obs_data_dir is None: obs_data_dir = os.path.join(workdir, 'data_inputs', 'obs_csvs') bcs_nc_path = os.path.join(workdir, cal_nc_name) - ts = pd.read_pickle(os.path.join(workdir, 'data_processed', hindcast_table)) + ts = pd.read_pickle(os.path.join(workdir, 'data_processed', table_hindcast)) # create the new netcdf bcs_nc = nc.Dataset(bcs_nc_path, 'w') diff --git a/saber/_vocab.py b/saber/_vocab.py index 43d130b..4c293f4 100644 --- a/saber/_vocab.py +++ b/saber/_vocab.py @@ -1,8 +1,7 @@ import os -import glob + import pandas as pd -# todo enforce using this module for retrieving names?? # assign table and gis_input file required column names mid_col = 'model_id' @@ -16,41 +15,64 @@ # scaffolded folders tables = 'tables' -data_in = 'data_inputs' -gis_out = 'gis_outputs' -kmeans_out = 'kmeans_outputs' +gis_out = 'gis' +kmeans_out = 'clusters' # name of some files produced by the algorithm cluster_count_file = 'best-fit-cluster-count.json' cal_nc_name = 'calibrated_simulated_flow.nc' -hindcast_table = 'hindcast_series_table.parquet' -hindcast_fdc_table = 'hindcast_fdc_table.parquet' -drain_table = 'drain_table.parquet' -gauge_table = 'gauge_table.parquet' + +# name of the required input tables and the outputs +table_hindcast = 'hindcast.parquet' +table_hindcast_fdc = 'hindcast_fdc.parquet' +table_hindcast_fdc_trans = 'hindcast_fdc_transformed.parquet' +table_mids = 'mid_table.parquet' +table_drain = 'drain_table.parquet' +table_gauge = 'gauge_table.parquet' # metrics computed on validation sets metric_list = ['ME', 'MAE', 'RMSE', 'MAPE', 'NSE', 'KGE (2012)'] metric_nc_name_list = ['ME', 'MAE', 'RMSE', 'MAPE', 'NSE', 'KGE2012'] -def read_drain_table(workdir: str) -> pd.DataFrame: - return pd.read_parquet(os.path.join(workdir, tables, drain_table)) - - -def read_gauge_table(workdir: str) -> pd.DataFrame: - return pd.read_parquet(os.path.join(workdir, tables, gauge_table)) - - def get_table_path(workdir: str, table_name: str) -> str: - if table_name == 'hindcast_series': - return os.path.join(workdir, tables, hindcast_table) + if table_name == 'hindcast': + return os.path.join(workdir, tables, table_hindcast) elif table_name == 'hindcast_fdc': - return os.path.join(workdir, tables, hindcast_fdc_table) + return os.path.join(workdir, tables, table_hindcast_fdc) + elif table_name == 'hindcast_fdc_trans': + return os.path.join(workdir, tables, table_hindcast_fdc) + elif table_name == 'mid_list': + return os.path.join(workdir, tables, table_mids) elif table_name == 'drain_table': - return os.path.join(workdir, tables, drain_table) + return os.path.join(workdir, tables, table_drain) elif table_name == 'gauge_table': - 
return os.path.join(workdir, tables, gauge_table) + return os.path.join(workdir, tables, table_gauge) + else: + raise ValueError(f'Unknown table name: {table_name}') + + +def read_table(workdir: str, table_name: str) -> pd.DataFrame: + table_path = get_table_path(workdir, table_name) + table_format = os.path.splitext(table_path)[-1] + if table_format == 'parquet': + return pd.read_parquet(table_path) + elif table_format == 'feather': + return pd.read_feather(table_path) + elif table_format == 'csv': + return pd.read_csv(table_path) + else: + raise ValueError(f'Unknown table format: {table_format}') -def guess_hindcast_path(workdir: str) -> str: - return glob.glob(os.path.join(workdir, data_in, '*.nc*'))[0] +def write_table(table: pd.DataFrame, workdir: str, table_name: str) -> None: + table_path = get_table_path(workdir, table_name) + table_format = os.path.splitext(table_path)[-1] + if table_format == 'parquet': + table.to_parquet(table_path) + elif table_format == 'feather': + table.to_feather(table_path) + elif table_format == 'csv': + table.to_csv(table_path) + else: + raise ValueError(f'Unknown table format: {table_format}') diff --git a/saber/cluster.py b/saber/cluster.py index e93a1c6..b26318e 100755 --- a/saber/cluster.py +++ b/saber/cluster.py @@ -15,6 +15,7 @@ from sklearn.metrics import silhouette_samples from ._vocab import cluster_count_file +from ._vocab import read_table __all__ = ['generate', 'summarize', 'plot_clusters', 'plot_silhouette'] @@ -31,15 +32,14 @@ def generate(workdir: str, max_clusters: int = 12) -> None: None """ # read the prepared data (array x) - x = pd.read_parquet(os.path.join(workdir, 'tables', 'hindcast_fdc_transformed.parquet')) + x = read_table(workdir, 'hindcast_fdc_trans') x = x.values # build the kmeans model for a range of cluster numbers for n_clusters in range(2, max_clusters + 1): - print(n_clusters) kmeans = KMeans(n_clusters=n_clusters) kmeans.fit_predict(x) - joblib.dump(kmeans, os.path.join(workdir, 'kmeans_outputs', f'kmeans-{n_clusters}.pickle')) + joblib.dump(kmeans, os.path.join(workdir, 'clusters', f'kmeans-{n_clusters}.pickle')) return @@ -58,10 +58,10 @@ def summarize(workdir: str) -> None: labels = [] # read the prepared data (array x) - x = pd.read_parquet(os.path.join(workdir, 'tables', 'hindcast_fdc_transformed.parquet')) + x = read_table(workdir, 'hindcast_fdc_trans') x = x.values - for model_file in natsorted(glob.glob(os.path.join(workdir, 'kmeans_outputs', 'kmeans-*.pickle'))): + for model_file in natsorted(glob.glob(os.path.join(workdir, 'clusters', 'kmeans-*.pickle'))): kmeans = joblib.load(model_file) n_clusters = int(kmeans.n_clusters) print(n_clusters) @@ -70,7 +70,7 @@ def summarize(workdir: str) -> None: pd.DataFrame( np.transpose(kmeans.cluster_centers_), columns=np.array(range(n_clusters)).astype(str) - ).to_parquet(os.path.join(workdir, 'kmeans_outputs', f'kmeans-centers-{n_clusters}.parquet')) + ).to_parquet(os.path.join(workdir, 'clusters', f'kmeans-centers-{n_clusters}.parquet')) # save the silhouette score for each cluster silhouette_scores.append(silhouette_samples(x, kmeans.labels_).flatten()) @@ -84,21 +84,21 @@ def summarize(workdir: str) -> None: # save the summary results as a csv pd.DataFrame.from_dict(summary) \ - .to_csv(os.path.join(workdir, 'kmeans_outputs', f'clustering-summary-stats.csv')) + .to_csv(os.path.join(workdir, 'clusters', f'clustering-summary-stats.csv')) # save the silhouette scores as a parquet silhouette_scores = np.transpose(np.array(silhouette_scores)) 
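The table helpers above pick a reader and writer from the file extension; since `os.path.splitext` returns the extension with its leading dot, a sketch of that dispatch needs to strip the dot (or compare against `'.parquet'` and friends) before matching:

```python
import os
import pandas as pd

def read_table(table_path: str) -> pd.DataFrame:
    # splitext('/x/drain_table.parquet') -> ('/x/drain_table', '.parquet')
    table_format = os.path.splitext(table_path)[-1].lstrip('.')
    if table_format == 'parquet':
        return pd.read_parquet(table_path)
    elif table_format == 'feather':
        return pd.read_feather(table_path)
    elif table_format == 'csv':
        return pd.read_csv(table_path)
    raise ValueError(f'Unknown table format: {table_format}')
```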
pd.DataFrame(silhouette_scores, columns=np.array(range(2, silhouette_scores.shape[1] + 2)).astype(str)) \ - .to_parquet(os.path.join(workdir, 'kmeans_outputs', 'kmeans-silhouette_scores.parquet')) + .to_parquet(os.path.join(workdir, 'clusters', 'kmeans-silhouette_scores.parquet')) # save the labels as a parquet labels = np.transpose(np.array(labels)) pd.DataFrame(labels, columns=np.array(range(2, labels.shape[1] + 2)).astype(str)) \ - .to_parquet(os.path.join(workdir, 'kmeans_outputs', 'kmeans-labels.parquet')) + .to_parquet(os.path.join(workdir, 'clusters', 'kmeans-labels.parquet')) # find the knee/elbow knee = kneed.KneeLocator(summary['number'], summary['inertia'], curve='convex', direction='decreasing').knee # save the best fitting cluster counts to a csv - with open(os.path.join(workdir, 'kmeans_outputs', cluster_count_file), 'w') as f: + with open(os.path.join(workdir, 'clusters', cluster_count_file), 'w') as f: f.write(json.dumps({'historical': int(knee)})) return @@ -118,12 +118,12 @@ def plot_clusters(workdir: str, clusters: int or Iterable = 'all', Returns: None """ - x = pd.read_parquet(os.path.join(workdir, 'tables', 'hindcast_fdc_transformed.parquet')).values + x = read_table(workdir, 'hindcast_fdc_trans').values size = x.shape[1] x_values = np.linspace(0, size, 5) x_ticks = np.linspace(0, 100, 5).astype(int) - kmeans_dir = os.path.join(workdir, 'kmeans_outputs') + kmeans_dir = os.path.join(workdir, 'clusters') if clusters == 'all': model_files = natsorted(glob.glob(os.path.join(kmeans_dir, 'kmeans-*.pickle'))) elif isinstance(clusters, int): @@ -134,7 +134,6 @@ def plot_clusters(workdir: str, clusters: int or Iterable = 'all', raise TypeError('n_clusters should be of type int or an iterable') for model_file in model_files: - # todo read the cluster centers from the parquet file instead of the model pickle kmeans = joblib.load(model_file) n_clusters = int(kmeans.n_clusters) n_cols = min(n_clusters, max_cols) @@ -165,7 +164,7 @@ def plot_clusters(workdir: str, clusters: int or Iterable = 'all', for ax in fig.axes[n_clusters:]: ax.axis('off') - fig.savefig(os.path.join(workdir, 'kmeans_outputs', f'kmeans-clusters-{n_clusters}.png')) + fig.savefig(os.path.join(workdir, 'clusters', f'kmeans-clusters-{n_clusters}.png')) plt.close(fig) return @@ -183,11 +182,11 @@ def plot_silhouette(workdir: str, plt_width: int = 3, plt_height: int = 3) -> No Returns: None """ - labels_df = pd.read_parquet(os.path.join(workdir, 'kmeans_outputs', 'kmeans-labels.parquet')) - silhouette_df = pd.read_parquet(os.path.join(workdir, 'kmeans_outputs', 'kmeans-silhouette_scores.parquet')) + labels_df = pd.read_parquet(os.path.join(workdir, 'clusters', 'kmeans-labels.parquet')) + silhouette_df = pd.read_parquet(os.path.join(workdir, 'clusters', 'kmeans-silhouette_scores.parquet')) for tot_clusters in silhouette_df.columns: - centers_df = pd.read_parquet(os.path.join(workdir, 'kmeans_outputs', f'kmeans-centers-{tot_clusters}.parquet')) + centers_df = pd.read_parquet(os.path.join(workdir, 'clusters', f'kmeans-centers-{tot_clusters}.parquet')) fig, (ax1, ax2) = plt.subplots( nrows=1, ncols=2, @@ -238,6 +237,6 @@ def plot_silhouette(workdir: str, plt_width: int = 3, plt_height: int = 3) -> No # add some buffer before the next cluster y_lower = y_upper + 10 - fig.savefig(os.path.join(workdir, 'kmeans_outputs', f'kmeans-silhouettes-{tot_clusters}.png')) + fig.savefig(os.path.join(workdir, 'clusters', f'kmeans-silhouettes-{tot_clusters}.png')) plt.close(fig) return diff --git a/saber/gis.py b/saber/gis.py index 
14d8cc3..e90fa85 100644 --- a/saber/gis.py +++ b/saber/gis.py @@ -52,7 +52,7 @@ def clip_by_assignment(workdir: str, assign_table: pd.DataFrame, drain_gis: str, """ # read the drainage line shapefile dl = gpd.read_file(drain_gis) - save_dir = os.path.join(workdir, 'gis_outputs') + save_dir = os.path.join(workdir, 'gis') # get the unique list of assignment reasons for reason in set(assign_table[reason_col].dropna().values): @@ -84,7 +84,7 @@ def clip_by_cluster(workdir: str, assign_table: pd.DataFrame, drain_gis: str, pr gdf = dl_gdf[dl_gdf[mid_col].isin(assign_table[assign_table[mid_col] == num][mid_col])] if gdf.empty: continue - gdf.to_file(os.path.join(workdir, 'gis_outputs', f'{prefix}{"_" if prefix else ""}-{int(num)}.gpkg')) + gdf.to_file(os.path.join(workdir, 'gis', f'{prefix}{"_" if prefix else ""}-{int(num)}.gpkg')) return @@ -107,7 +107,7 @@ def clip_by_unassigned(workdir: str, assign_table: pd.DataFrame, drain_gis: str, if subset.empty: warnings.warn('Empty filter: No streams are unassigned') return - savepath = os.path.join(workdir, 'gis_outputs', f'{prefix}{"_" if prefix else ""}assignments_unassigned.gpkg') + savepath = os.path.join(workdir, 'gis', f'{prefix}{"_" if prefix else ""}assignments_unassigned.gpkg') subset.to_file(savepath) return @@ -127,7 +127,7 @@ def clip_by_ids(workdir: str, ids: list, drain_gis: str, prefix: str = '', id_co None """ dl = gpd.read_file(drain_gis) - save_dir = os.path.join(workdir, 'gis_outputs') + save_dir = os.path.join(workdir, 'gis') name = f'{prefix}{"_" if prefix else ""}id_subset.gpkg' dl[dl[id_column].isin(ids)].to_file(os.path.join(save_dir, name)) return @@ -151,7 +151,7 @@ def validation_maps(workdir: str, gauge_gis: str, val_table: pd.DataFrame = None """ if val_table is None: val_table = pd.read_csv(os.path.join(workdir, 'validation_runs', 'val_table.csv')) - save_dir = os.path.join(workdir, 'gis_outputs') + save_dir = os.path.join(workdir, 'gis') # merge gauge table with the validation table gdf = gpd.read_file(gauge_gis) @@ -231,5 +231,5 @@ def histomaps(gdf: gpd.GeoDataFrame, metric: str, prct: str, workdir: str) -> No gdf[core_columns + [metric, ]].to_crs(epsg=3857).plot(metric) cx.add_basemap(ax=axm, zoom=9, source=cx.providers.Esri.WorldTopoMap, attribution='') - fig.savefig(os.path.join(workdir, 'gis_outputs', f'{metric}_{prct}.png')) + fig.savefig(os.path.join(workdir, 'gis', f'{metric}_{prct}.png')) return diff --git a/saber/prep.py b/saber/prep.py index c0af844..01e36f2 100755 --- a/saber/prep.py +++ b/saber/prep.py @@ -6,12 +6,11 @@ import pandas as pd from sklearn.preprocessing import StandardScaler as Scalar -from ._vocab import get_table_path -from ._vocab import guess_hindcast_path from ._vocab import mid_col -from ._vocab import read_drain_table +from ._vocab import read_table +from ._vocab import write_table -__all__ = ['gis_tables', 'hindcast', 'workdir'] +__all__ = ['gis_tables', 'hindcast', 'scaffold'] def gis_tables(workdir: str, gauge_gis: str = None, drain_gis: str = None) -> None: @@ -27,11 +26,12 @@ def gis_tables(workdir: str, gauge_gis: str = None, drain_gis: str = None) -> No None """ if gauge_gis is not None: - if drain_gis.endswith('.parquet'): + if gauge_gis.endswith('.parquet'): gdf = gpd.read_parquet(gauge_gis) else: gdf = gpd.read_file(gauge_gis) - pd.DataFrame(gdf.drop('geometry', axis=1)).to_parquet(os.path.join(workdir, 'tables', 'gauge_table.parquet')) + write_table(pd.DataFrame(gdf.drop('geometry', axis=1)), workdir, 'gauge_table') + if drain_gis is not None: if drain_gis.endswith('.parquet'): 
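            # gpd.read_parquet only handles GeoParquet files, while gpd.read_file goes through
            # the OGR drivers (shapefile, GeoPackage, GeoJSON, ...), hence the extension check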
gdf = gpd.read_parquet(drain_gis) @@ -40,11 +40,11 @@ def gis_tables(workdir: str, gauge_gis: str = None, drain_gis: str = None) -> No gdf['centroid_x'] = gdf.geometry.centroid.x gdf['centroid_y'] = gdf.geometry.centroid.y gdf = gdf.drop('geometry', axis=1) - pd.DataFrame(gdf).to_parquet(os.path.join(workdir, 'tables', 'drain_table.parquet')) + write_table(pd.DataFrame(gdf), workdir, 'drain_table') return -def hindcast(workdir: str, hind_nc_path: str = None, ) -> None: +def hindcast(workdir: str, hind_nc_path: str) -> None: """ Creates hindcast_series_table.parquet.gzip and hindcast_fdc_table.parquet.gzip in the workdir/tables directory for the GEOGloWS hindcast data @@ -56,11 +56,8 @@ def hindcast(workdir: str, hind_nc_path: str = None, ) -> None: Returns: None """ - if hind_nc_path is None: - hind_nc_path = guess_hindcast_path(workdir) - # read the assignments table - drain_table = read_drain_table(workdir) + drain_table = read_table(workdir, 'drain_table') model_ids = list(set(sorted(drain_table[mid_col].tolist()))) # read the hindcast netcdf, convert to dataframe, store as parquet @@ -70,7 +67,7 @@ def hindcast(workdir: str, hind_nc_path: str = None, ) -> None: ids = ids[ids_selector].astype(str).values.flatten() # save the model ids to table for reference - pd.DataFrame(ids, columns=['model_id', ]).to_parquet(os.path.join(workdir, 'tables', 'model_ids.parquet')) + write_table(pd.DataFrame(ids, columns=['model_id', ]), workdir, 'model_ids') # save the hindcast series to parquet df = pd.DataFrame( @@ -80,24 +77,24 @@ def hindcast(workdir: str, hind_nc_path: str = None, ) -> None: ) df = df[df.index.year >= 1980] df.index.name = 'datetime' - df.to_parquet(get_table_path(workdir, 'hindcast_series')) + write_table(df, workdir, 'hindcast_series') # calculate the FDC and save to parquet exceed_prob = np.linspace(0, 100, 101)[::-1] df = df.apply(lambda x: np.transpose(np.nanpercentile(x, exceed_prob))) df.index = exceed_prob df.index.name = 'exceed_prob' - df.to_parquet(get_table_path(workdir, 'hindcast_fdc')) + write_table(df, workdir, 'hindcast_fdc') # transform and prepare for clustering df = pd.DataFrame(np.transpose(Scalar().fit_transform(np.squeeze(df.values)))) df.index = ids df.columns = df.columns.astype(str) - df.to_parquet(os.path.join(workdir, 'tables', 'hindcast_fdc_transformed.parquet')) + write_table(df, workdir, 'hindcast_fdc_trans') return -def workdir(path: str, include_validation: bool = True) -> None: +def scaffold(path: str, include_validation: bool = True) -> None: """ Creates the correct directories for a Saber project within the specified directory @@ -108,9 +105,9 @@ def workdir(path: str, include_validation: bool = True) -> None: Returns: None """ - dir_list = ['tables', 'data_inputs', 'gis_outputs', 'kmeans_outputs'] + dir_list = ['tables', 'gis', 'clusters'] if include_validation: - dir_list.append('validation_runs') + dir_list.append('validation') for d in dir_list: p = os.path.join(path, d) if not os.path.exists(p): diff --git a/saber/table.py b/saber/table.py index 9edaa33..c8f97cb 100644 --- a/saber/table.py +++ b/saber/table.py @@ -72,8 +72,8 @@ def read(workdir: str) -> pd.DataFrame: def cache(workdir: str, assign_table: pd.DataFrame) -> None: """ - Saves the pandas dataframe to a parquet in the proper place in the project directory - A shortcut for pd.DataFrame.to_parquet so you don't have to code the paths every where + Saves the assignment table dataframe to parquet in the proper place in the project directory so that + you don't have to code the path 
every time Args: workdir: the project directory path @@ -101,7 +101,7 @@ def merge_clusters(workdir: str, assign_table: pd.DataFrame, n_clusters: int = N """ # create a dataframe with the optimal model's labels and the model_id's df = pd.DataFrame({ - 'cluster': joblib.load(os.path.join(workdir, 'kmeans_outputs', f'kmeans-{n_clusters}.pickle')).labels_, + 'cluster': joblib.load(os.path.join(workdir, 'clusters', f'kmeans-{n_clusters}.pickle')).labels_, mid_col: pd.read_parquet(os.path.join(workdir, 'tables', 'model_ids.parquet')).values.flatten() }, dtype=str) diff --git a/saber/validate.py b/saber/validate.py index 1de3c9b..24095b6 100644 --- a/saber/validate.py +++ b/saber/validate.py @@ -25,13 +25,13 @@ def sample_gauges(workdir: str, overwrite: bool = False) -> None: Returns: None """ - vr_path = os.path.join(workdir, 'validation_runs') + vr_path = os.path.join(workdir, 'validation') if overwrite: if os.path.exists(vr_path): shutil.rmtree(vr_path) os.mkdir(vr_path) - gt = pd.read_csv(os.path.join(workdir, 'gis_inputs', 'gauge_table.csv')) + gt = pd.read_csv(os.path.join(workdir, 'gis', 'gauge_table.csv')) initial_row_count = gt.shape[0] rows_to_drop = round(gt.shape[0] / 10) @@ -54,9 +54,9 @@ def sample_gauges(workdir: str, overwrite: bool = False) -> None: # sample the gauge table gt = gt.sample(n=n) - gt.to_csv(os.path.join(subpath, 'gis_inputs', 'gauge_table.csv'), index=False) - shutil.copyfile(os.path.join(workdir, 'gis_inputs', 'drain_table.csv'), - os.path.join(subpath, 'gis_inputs', 'drain_table.csv')) + gt.to_csv(os.path.join(subpath, 'gis', 'gauge_table.csv'), index=False) + shutil.copyfile(os.path.join(workdir, 'gis', 'drain_table.csv'), + os.path.join(subpath, 'gis', 'drain_table.csv')) # filter the copied processed data to only contain the gauges included in this filtered step processed_sim_data = glob.glob(os.path.join(subpath, 'data_processed', 'obs-*.csv')) @@ -79,7 +79,7 @@ def gen_val_table(workdir: str) -> pd.DataFrame: Returns: pandas.DataFrame """ - df = pd.read_csv(os.path.join(workdir, 'gis_inputs', 'gauge_table.csv')) + df = pd.read_csv(os.path.join(workdir, 'gis', 'gauge_table.csv')) df['100'] = 1 stats_df = {} @@ -92,11 +92,11 @@ def gen_val_table(workdir: str) -> pd.DataFrame: a.close() for d in sorted( - [a for a in glob.glob(os.path.join(workdir, 'validation_runs', '*')) if os.path.isdir(a)], + [a for a in glob.glob(os.path.join(workdir, 'validation', '*')) if os.path.isdir(a)], reverse=True ): val_percent = os.path.basename(d) - valset_gids = pd.read_csv(os.path.join(d, 'gis_inputs', 'gauge_table.csv'))[gid_col].values.tolist() + valset_gids = pd.read_csv(os.path.join(d, 'gis', 'gauge_table.csv'))[gid_col].values.tolist() # mark a column indicating the gauges included in the validation set df[val_percent] = 0 @@ -110,6 +110,6 @@ def gen_val_table(workdir: str) -> pd.DataFrame: # merge gauge_table with the stats, save and return df = df.merge(pd.DataFrame(stats_df), on=mid_col, how='inner') - df.to_csv(os.path.join(workdir, 'validation_runs', 'val_table.csv'), index=False) + df.to_csv(os.path.join(workdir, 'validation', 'val_table.csv'), index=False) return df From 0670441c55a563993c12bba8e2ed9d1d098831ad Mon Sep 17 00:00:00 2001 From: rileyhales Date: Sun, 28 Aug 2022 15:53:39 -0600 Subject: [PATCH 20/24] turn _vocab into io and merge scaffold functions --- saber/__init__.py | 13 ++++-- saber/_calibrate.py | 24 +++++------ saber/_propagation.py | 15 +++---- saber/{utils.py => _utils.py} | 0 saber/{table.py => assign.py} | 77 
+++++++++-------------------------- saber/cluster.py | 5 +-- saber/gis.py | 8 ++-- saber/{_vocab.py => io.py} | 39 ++++++++++++++---- saber/prep.py | 35 +++------------- saber/validate.py | 8 ++-- 10 files changed, 96 insertions(+), 128 deletions(-) rename saber/{utils.py => _utils.py} (100%) rename saber/{table.py => assign.py} (72%) rename saber/{_vocab.py => io.py} (70%) diff --git a/saber/__init__.py b/saber/__init__.py index 1b0ea72..d9eeb2c 100644 --- a/saber/__init__.py +++ b/saber/__init__.py @@ -1,13 +1,18 @@ +import saber.assign import saber.cluster import saber.gis +import saber.io import saber.prep -import saber.table -import saber.utils import saber.validate from saber._calibrate import calibrate, calibrate_region -__all__ = ['calibrate', 'calibrate_region', - 'table', 'prep', 'gis', 'cluster', 'utils', 'validate'] +__all__ = [ + # individual functions + 'calibrate', 'calibrate_region', + # modules + 'io', 'prep', 'cluster', 'assign', 'gis', 'validate', +] + __author__ = 'Riley Hales' __version__ = '0.4.0' __license__ = 'BSD 3 Clause Clear' diff --git a/saber/_calibrate.py b/saber/_calibrate.py index 037a642..11a788a 100644 --- a/saber/_calibrate.py +++ b/saber/_calibrate.py @@ -9,14 +9,14 @@ import pandas as pd from scipy import interpolate, stats -from .utils import solve_gumbel1, compute_fdc, compute_scalar_fdc -from ._vocab import mid_col -from ._vocab import asgn_mid_col -from ._vocab import asgn_gid_col -from ._vocab import metric_list -from ._vocab import metric_nc_name_list -from ._vocab import table_hindcast -from ._vocab import cal_nc_name +from ._utils import solve_gumbel1, compute_fdc, compute_scalar_fdc +from .io import asgn_gid_col +from .io import asgn_mid_col +from .io import cal_nc_name +from .io import metric_list +from .io import metric_nc_name_list +from .io import mid_col +from .io import table_hindcast def calibrate(sim_flow_a: pd.DataFrame, obs_flow_a: pd.DataFrame, sim_flow_b: pd.DataFrame = None, @@ -61,9 +61,9 @@ def calibrate(sim_flow_a: pd.DataFrame, obs_flow_a: pd.DataFrame, sim_flow_b: pd if fix_seasonally: # list of the unique months in the historical simulation. should always be 1->12 but just in case... monthly_results = [] - for month in sorted(set(sim_flow_a.index.strftime('%m'))): + for month in sorted(set(sim_flow_a.index.dt.strftime('%m'))): # filter data to current iteration's month - mon_obs_data = obs_flow_a[obs_flow_a.index.month == int(month)].dropna() + mon_obs_data = obs_flow_a[obs_flow_a.index.dt.month == int(month)].dropna() if mon_obs_data.empty: if empty_months == 'skip': @@ -71,8 +71,8 @@ def calibrate(sim_flow_a: pd.DataFrame, obs_flow_a: pd.DataFrame, sim_flow_b: pd else: raise ValueError(f'Invalid value for argument "empty_months". 
Given: {empty_months}.') - mon_sim_data = sim_flow_a[sim_flow_a.index.month == int(month)].dropna() - mon_cor_data = sim_flow_b[sim_flow_b.index.month == int(month)].dropna() + mon_sim_data = sim_flow_a[sim_flow_a.index.dt.month == int(month)].dropna() + mon_cor_data = sim_flow_b[sim_flow_b.index.dt.month == int(month)].dropna() monthly_results.append(calibrate( mon_sim_data, mon_obs_data, mon_cor_data, fix_seasonally=False, empty_months=empty_months, diff --git a/saber/_propagation.py b/saber/_propagation.py index 5c558f5..b738a95 100644 --- a/saber/_propagation.py +++ b/saber/_propagation.py @@ -1,11 +1,11 @@ import pandas as pd -from ._vocab import mid_col -from ._vocab import down_mid_col -from ._vocab import order_col -from ._vocab import asgn_mid_col -from ._vocab import asgn_gid_col -from ._vocab import reason_col +from .io import asgn_gid_col +from .io import asgn_mid_col +from .io import down_mid_col +from .io import mid_col +from .io import order_col +from .io import reason_col def walk_downstream(df: pd.DataFrame, start_id: int, same_order: bool = True, outlet_next_id: str or int = -1) -> tuple: @@ -71,7 +71,8 @@ def walk_upstream(df: pd.DataFrame, start_id: int, same_order: bool = True) -> t return tuple(set(upstream_ids)) -def propagate_in_table(df: pd.DataFrame, start_mid: int, start_gid: int, connected: tuple, max_prop: int, direction: str): +def propagate_in_table(df: pd.DataFrame, start_mid: int, start_gid: int, connected: tuple, max_prop: int, + direction: str): """ Args: diff --git a/saber/utils.py b/saber/_utils.py similarity index 100% rename from saber/utils.py rename to saber/_utils.py diff --git a/saber/table.py b/saber/assign.py similarity index 72% rename from saber/table.py rename to saber/assign.py index c8f97cb..5017d26 100644 --- a/saber/table.py +++ b/saber/assign.py @@ -7,85 +7,48 @@ from ._propagation import propagate_in_table from ._propagation import walk_downstream from ._propagation import walk_upstream -from ._vocab import asgn_gid_col -from ._vocab import asgn_mid_col -from ._vocab import gid_col -from ._vocab import mid_col -from ._vocab import order_col -from ._vocab import reason_col +from .io import asgn_gid_col +from .io import asgn_mid_col +from .io import gid_col +from .io import mid_col +from .io import order_col +from .io import read_table +from .io import reason_col +from .io import write_table -__all__ = ['get_path', 'gen', 'read', 'cache', 'merge_clusters', 'assign_gauged', 'assign_propagation', - 'assign_by_distance', ] +__all__ = ['gen', 'merge_clusters', 'assign_gauged', 'assign_propagation', 'assign_by_distance', ] -def get_path(workdir: str) -> str: - """ - Creates the path to the assign table based on the working directory provided - - Args: - workdir: path to the working directory - - Returns: - absolute path of the assign_table - """ - return os.path.join(workdir, 'assign_table.parquet.gzip') - - -def gen(workdir) -> pd.DataFrame: +def gen(workdir: str, cache: bool = True) -> pd.DataFrame: """ Joins the drain_table.csv and gauge_table.csv to create the assign_table.csv Args: workdir: path to the working directory + cache: whether to cache the assign table immediately Returns: None """ # read and merge the tables - drain_df = pd.read_parquet(os.path.join(workdir, 'tables', 'drain_table.parquet')) - gauge_df = pd.read_parquet(os.path.join(workdir, 'tables', 'gauge_table.parquet')) - assign_table = pd.merge(drain_df, gauge_df, on=mid_col, how='outer') + assign_table = pd.merge( + read_table(workdir, 'drain_table'), + 
read_table(workdir, 'gauge_table'), + on=mid_col, + how='outer' + ) # create the new columns assign_table[asgn_mid_col] = np.nan assign_table[asgn_gid_col] = np.nan assign_table[reason_col] = np.nan - # cache the table - cache(workdir, assign_table) + if cache: + write_table(assign_table, workdir, 'assign_table') return assign_table -def read(workdir: str) -> pd.DataFrame: - """ - Reads the assign_table located in the provided directory - - Args: - workdir: path to the working directory - - Returns: - assign_table pandas.DataFrame - """ - return pd.read_parquet(get_path(workdir)) - - -def cache(workdir: str, assign_table: pd.DataFrame) -> None: - """ - Saves the assignment table dataframe to parquet in the proper place in the project directory so that - you don't have to code the path every time - - Args: - workdir: the project directory path - assign_table: the assign_table dataframe - - Returns: - None - """ - assign_table.to_parquet(get_path(workdir)) - return - - def merge_clusters(workdir: str, assign_table: pd.DataFrame, n_clusters: int = None) -> pd.DataFrame: """ Creates a csv listing the streams assigned to each cluster in workdir/kmeans_models and also adds that information @@ -102,7 +65,7 @@ def merge_clusters(workdir: str, assign_table: pd.DataFrame, n_clusters: int = N # create a dataframe with the optimal model's labels and the model_id's df = pd.DataFrame({ 'cluster': joblib.load(os.path.join(workdir, 'clusters', f'kmeans-{n_clusters}.pickle')).labels_, - mid_col: pd.read_parquet(os.path.join(workdir, 'tables', 'model_ids.parquet')).values.flatten() + mid_col: read_table(workdir, 'model_ids').values.flatten() }, dtype=str) # merge the dataframes diff --git a/saber/cluster.py b/saber/cluster.py index b26318e..81f8109 100755 --- a/saber/cluster.py +++ b/saber/cluster.py @@ -14,8 +14,8 @@ from sklearn.cluster import KMeans from sklearn.metrics import silhouette_samples -from ._vocab import cluster_count_file -from ._vocab import read_table +from .io import cluster_count_file +from .io import read_table __all__ = ['generate', 'summarize', 'plot_clusters', 'plot_silhouette'] @@ -64,7 +64,6 @@ def summarize(workdir: str) -> None: for model_file in natsorted(glob.glob(os.path.join(workdir, 'clusters', 'kmeans-*.pickle'))): kmeans = joblib.load(model_file) n_clusters = int(kmeans.n_clusters) - print(n_clusters) # save the cluster centroids to table - columns are the cluster number, rows are the centroid FDC values pd.DataFrame( diff --git a/saber/gis.py b/saber/gis.py index e90fa85..f8f6de9 100644 --- a/saber/gis.py +++ b/saber/gis.py @@ -8,10 +8,10 @@ import numpy as np import pandas as pd -from ._vocab import gid_col -from ._vocab import metric_nc_name_list -from ._vocab import mid_col -from ._vocab import reason_col +from .io import gid_col +from .io import metric_nc_name_list +from .io import mid_col +from .io import reason_col __all__ = ['generate_all', 'clip_by_assignment', 'clip_by_cluster', 'clip_by_unassigned', 'clip_by_ids', 'validation_maps'] diff --git a/saber/_vocab.py b/saber/io.py similarity index 70% rename from saber/_vocab.py rename to saber/io.py index 4c293f4..cc4a32d 100644 --- a/saber/_vocab.py +++ b/saber/io.py @@ -2,7 +2,6 @@ import pandas as pd - # assign table and gis_input file required column names mid_col = 'model_id' gid_col = 'gauge_id' @@ -29,12 +28,34 @@ table_mids = 'mid_table.parquet' table_drain = 'drain_table.parquet' table_gauge = 'gauge_table.parquet' +table_assign = 'assign_table.csv' # metrics computed on validation sets metric_list = ['ME', 
'MAE', 'RMSE', 'MAPE', 'NSE', 'KGE (2012)'] metric_nc_name_list = ['ME', 'MAE', 'RMSE', 'MAPE', 'NSE', 'KGE2012'] +def scaffold_workdir(path: str, include_validation: bool = True) -> None: + """ + Creates the correct directories for a Saber project within the specified directory + + Args: + path: the path to a directory where you want to create workdir subdirectories + include_validation: boolean, indicates whether to create the validation folder + + Returns: + None + """ + dir_list = ['tables', 'gis', 'clusters'] + if include_validation: + dir_list.append('validation') + for d in dir_list: + p = os.path.join(path, d) + if not os.path.exists(p): + os.mkdir(p) + return + + def get_table_path(workdir: str, table_name: str) -> str: if table_name == 'hindcast': return os.path.join(workdir, tables, table_hindcast) @@ -42,12 +63,14 @@ def get_table_path(workdir: str, table_name: str) -> str: return os.path.join(workdir, tables, table_hindcast_fdc) elif table_name == 'hindcast_fdc_trans': return os.path.join(workdir, tables, table_hindcast_fdc) - elif table_name == 'mid_list': + elif table_name == 'model_ids': return os.path.join(workdir, tables, table_mids) elif table_name == 'drain_table': return os.path.join(workdir, tables, table_drain) elif table_name == 'gauge_table': return os.path.join(workdir, tables, table_gauge) + elif table_name == 'assign_table': + return os.path.join(workdir, tables, table_assign) else: raise ValueError(f'Unknown table name: {table_name}') @@ -55,11 +78,11 @@ def get_table_path(workdir: str, table_name: str) -> str: def read_table(workdir: str, table_name: str) -> pd.DataFrame: table_path = get_table_path(workdir, table_name) table_format = os.path.splitext(table_path)[-1] - if table_format == 'parquet': + if table_format == '.parquet': return pd.read_parquet(table_path) - elif table_format == 'feather': + elif table_format == '.feather': return pd.read_feather(table_path) - elif table_format == 'csv': + elif table_format == '.csv': return pd.read_csv(table_path) else: raise ValueError(f'Unknown table format: {table_format}') @@ -68,11 +91,11 @@ def read_table(workdir: str, table_name: str) -> pd.DataFrame: def write_table(table: pd.DataFrame, workdir: str, table_name: str) -> None: table_path = get_table_path(workdir, table_name) table_format = os.path.splitext(table_path)[-1] - if table_format == 'parquet': + if table_format == '.parquet': table.to_parquet(table_path) - elif table_format == 'feather': + elif table_format == '.feather': table.to_feather(table_path) - elif table_format == 'csv': + elif table_format == '.csv': table.to_csv(table_path) else: raise ValueError(f'Unknown table format: {table_format}') diff --git a/saber/prep.py b/saber/prep.py index 01e36f2..1cc1c2b 100755 --- a/saber/prep.py +++ b/saber/prep.py @@ -1,16 +1,14 @@ -import os - import geopandas as gpd import netCDF4 as nc import numpy as np import pandas as pd from sklearn.preprocessing import StandardScaler as Scalar -from ._vocab import mid_col -from ._vocab import read_table -from ._vocab import write_table +from .io import mid_col +from .io import read_table +from .io import write_table -__all__ = ['gis_tables', 'hindcast', 'scaffold'] +__all__ = ['gis_tables', 'hindcast'] def gis_tables(workdir: str, gauge_gis: str = None, drain_gis: str = None) -> None: @@ -75,9 +73,9 @@ def hindcast(workdir: str, hind_nc_path: str) -> None: columns=ids, index=pd.to_datetime(hnc.variables['time'][:], unit='s') ) - df = df[df.index.year >= 1980] + df = df[df.index.dt.year >= 1980] df.index.name = 
'datetime' - write_table(df, workdir, 'hindcast_series') + write_table(df, workdir, 'hindcast') # calculate the FDC and save to parquet exceed_prob = np.linspace(0, 100, 101)[::-1] @@ -92,24 +90,3 @@ def hindcast(workdir: str, hind_nc_path: str) -> None: df.columns = df.columns.astype(str) write_table(df, workdir, 'hindcast_fdc_trans') return - - -def scaffold(path: str, include_validation: bool = True) -> None: - """ - Creates the correct directories for a Saber project within the specified directory - - Args: - path: the path to a directory where you want to create workdir subdirectories - include_validation: boolean, indicates whether to create the validation folder - - Returns: - None - """ - dir_list = ['tables', 'gis', 'clusters'] - if include_validation: - dir_list.append('validation') - for d in dir_list: - p = os.path.join(path, d) - if not os.path.exists(p): - os.mkdir(p) - return diff --git a/saber/validate.py b/saber/validate.py index 24095b6..6690dd6 100644 --- a/saber/validate.py +++ b/saber/validate.py @@ -6,10 +6,10 @@ import numpy as np import pandas as pd -from ._vocab import cal_nc_name -from ._vocab import gid_col -from ._vocab import metric_nc_name_list -from ._vocab import mid_col +from .io import cal_nc_name +from .io import gid_col +from .io import metric_nc_name_list +from .io import mid_col def sample_gauges(workdir: str, overwrite: bool = False) -> None: From e20159d40042d973799fbbe9cd42193ac06ff1bf Mon Sep 17 00:00:00 2001 From: rileyhales Date: Mon, 29 Aug 2022 12:31:10 -0600 Subject: [PATCH 21/24] more compliance using the io module --- saber/_calibrate.py | 21 ++++++++++----------- saber/io.py | 39 +++++++++++++++++++++------------------ saber/prep.py | 18 +++++++++++------- saber/validate.py | 8 +++++--- 4 files changed, 47 insertions(+), 39 deletions(-) diff --git a/saber/_calibrate.py b/saber/_calibrate.py index 11a788a..597ce05 100644 --- a/saber/_calibrate.py +++ b/saber/_calibrate.py @@ -61,9 +61,9 @@ def calibrate(sim_flow_a: pd.DataFrame, obs_flow_a: pd.DataFrame, sim_flow_b: pd if fix_seasonally: # list of the unique months in the historical simulation. should always be 1->12 but just in case... monthly_results = [] - for month in sorted(set(sim_flow_a.index.dt.strftime('%m'))): + for month in sorted(set(sim_flow_a.index.strftime('%m'))): # filter data to current iteration's month - mon_obs_data = obs_flow_a[obs_flow_a.index.dt.month == int(month)].dropna() + mon_obs_data = obs_flow_a[obs_flow_a.index.month == int(month)].dropna() if mon_obs_data.empty: if empty_months == 'skip': @@ -71,8 +71,8 @@ def calibrate(sim_flow_a: pd.DataFrame, obs_flow_a: pd.DataFrame, sim_flow_b: pd else: raise ValueError(f'Invalid value for argument "empty_months". 
Given: {empty_months}.') - mon_sim_data = sim_flow_a[sim_flow_a.index.dt.month == int(month)].dropna() - mon_cor_data = sim_flow_b[sim_flow_b.index.dt.month == int(month)].dropna() + mon_sim_data = sim_flow_a[sim_flow_a.index.month == int(month)].dropna() + mon_cor_data = sim_flow_b[sim_flow_b.index.month == int(month)].dropna() monthly_results.append(calibrate( mon_sim_data, mon_obs_data, mon_cor_data, fix_seasonally=False, empty_months=empty_months, @@ -139,22 +139,21 @@ def _make_interpolator(x_input: np.array, y_output: np.array, extrap_method: str const: int or float = None) -> interpolate.interp1d: # make interpolator which converts the percentiles to scalars if extrap_method == 'nearest': - interpolator = interpolate.interp1d(x_input, y_output, fill_value='extrapolate', kind='nearest') + return interpolate.interp1d(x_input, y_output, fill_value='extrapolate', kind='nearest') elif extrap_method == 'const': if const is None: raise ValueError('Must provide the const kwarg when extrap_method="const"') - interpolator = interpolate.interp1d(x_input, y_output, fill_value=const, bounds_error=False) + return interpolate.interp1d(x_input, y_output, fill_value=const, bounds_error=False) elif extrap_method == 'linear': - interpolator = interpolate.interp1d(x_input, y_output, fill_value='extrapolate') + return interpolate.interp1d(x_input, y_output, fill_value='extrapolate') elif extrap_method == 'average': - interpolator = interpolate.interp1d(x_input, y_output, fill_value=np.mean(y_output), bounds_error=False) + return interpolate.interp1d(x_input, y_output, fill_value=np.mean(y_output), bounds_error=False) elif extrap_method == 'max' or extrap_method == 'maximum': - interpolator = interpolate.interp1d(x_input, y_output, fill_value=np.max(y_output), bounds_error=False) + return interpolate.interp1d(x_input, y_output, fill_value=np.max(y_output), bounds_error=False) elif extrap_method == 'min' or extrap_method == 'minimum': - interpolator = interpolate.interp1d(x_input, y_output, fill_value=np.min(y_output), bounds_error=False) + return interpolate.interp1d(x_input, y_output, fill_value=np.min(y_output), bounds_error=False) else: raise ValueError('Invalid extrapolation method provided') - return interpolator def calibrate_region(workdir: str, assign_table: pd.DataFrame, diff --git a/saber/io.py b/saber/io.py index cc4a32d..8436a2d 100644 --- a/saber/io.py +++ b/saber/io.py @@ -10,17 +10,18 @@ down_mid_col = 'downstream_model_id' reason_col = 'reason' area_col = 'drain_area' -order_col = 'stream_order' - -# scaffolded folders -tables = 'tables' -gis_out = 'gis' -kmeans_out = 'clusters' +order_col = 'strahler_order' # name of some files produced by the algorithm cluster_count_file = 'best-fit-cluster-count.json' cal_nc_name = 'calibrated_simulated_flow.nc' +# scaffolded folders +dir_tables = 'tables' +dir_gis = 'gis' +dir_clusters = 'clusters' +dir_valid = 'validation' + # name of the required input tables and the outputs table_hindcast = 'hindcast.parquet' table_hindcast_fdc = 'hindcast_fdc.parquet' @@ -46,9 +47,11 @@ def scaffold_workdir(path: str, include_validation: bool = True) -> None: Returns: None """ - dir_list = ['tables', 'gis', 'clusters'] + dir_list = [dir_tables, dir_gis, dir_clusters] + if not os.path.exists(path): + os.mkdir(path) if include_validation: - dir_list.append('validation') + dir_list.append(dir_valid) for d in dir_list: p = os.path.join(path, d) if not os.path.exists(p): @@ -58,19 +61,19 @@ def scaffold_workdir(path: str, include_validation: bool = True) -> None: def 
get_table_path(workdir: str, table_name: str) -> str: if table_name == 'hindcast': - return os.path.join(workdir, tables, table_hindcast) + return os.path.join(workdir, dir_tables, table_hindcast) elif table_name == 'hindcast_fdc': - return os.path.join(workdir, tables, table_hindcast_fdc) + return os.path.join(workdir, dir_tables, table_hindcast_fdc) elif table_name == 'hindcast_fdc_trans': - return os.path.join(workdir, tables, table_hindcast_fdc) + return os.path.join(workdir, dir_tables, table_hindcast_fdc) elif table_name == 'model_ids': - return os.path.join(workdir, tables, table_mids) + return os.path.join(workdir, dir_tables, table_mids) elif table_name == 'drain_table': - return os.path.join(workdir, tables, table_drain) + return os.path.join(workdir, dir_tables, table_drain) elif table_name == 'gauge_table': - return os.path.join(workdir, tables, table_gauge) + return os.path.join(workdir, dir_tables, table_gauge) elif table_name == 'assign_table': - return os.path.join(workdir, tables, table_assign) + return os.path.join(workdir, dir_tables, table_assign) else: raise ValueError(f'Unknown table name: {table_name}') @@ -92,10 +95,10 @@ def write_table(table: pd.DataFrame, workdir: str, table_name: str) -> None: table_path = get_table_path(workdir, table_name) table_format = os.path.splitext(table_path)[-1] if table_format == '.parquet': - table.to_parquet(table_path) + return table.to_parquet(table_path) elif table_format == '.feather': - table.to_feather(table_path) + return table.to_feather(table_path) elif table_format == '.csv': - table.to_csv(table_path) + return table.to_csv(table_path) else: raise ValueError(f'Unknown table format: {table_format}') diff --git a/saber/prep.py b/saber/prep.py index 1cc1c2b..753ba17 100755 --- a/saber/prep.py +++ b/saber/prep.py @@ -5,6 +5,7 @@ from sklearn.preprocessing import StandardScaler as Scalar from .io import mid_col +from .io import order_col from .io import read_table from .io import write_table @@ -42,21 +43,24 @@ def gis_tables(workdir: str, gauge_gis: str = None, drain_gis: str = None) -> No return -def hindcast(workdir: str, hind_nc_path: str) -> None: +def hindcast(workdir: str, hind_nc_path: str, drop_order_1: bool = False) -> None: """ - Creates hindcast_series_table.parquet.gzip and hindcast_fdc_table.parquet.gzip in the workdir/tables directory + Creates hindcast_series_table.parquet and hindcast_fdc_table.parquet in the workdir/tables directory for the GEOGloWS hindcast data Args: workdir: path to the working directory for the project - hind_nc_path: path to the hindcast or historical simulation netcdf if not located at workdir/data_simulated/*.nc + hind_nc_path: path to the hindcast or historical simulation netcdf + drop_order_1: whether to drop the order 1 streams from the hindcast Returns: None """ # read the assignments table - drain_table = read_table(workdir, 'drain_table') - model_ids = list(set(sorted(drain_table[mid_col].tolist()))) + model_ids = read_table(workdir, 'drain_table') + if drop_order_1: + model_ids = model_ids[model_ids[order_col] > 1] + model_ids = list(set(sorted(model_ids[mid_col].tolist()))) # read the hindcast netcdf, convert to dataframe, store as parquet hnc = nc.Dataset(hind_nc_path) @@ -65,7 +69,7 @@ def hindcast(workdir: str, hind_nc_path: str) -> None: ids = ids[ids_selector].astype(str).values.flatten() # save the model ids to table for reference - write_table(pd.DataFrame(ids, columns=['model_id', ]), workdir, 'model_ids') + write_table(pd.DataFrame(ids, columns=[mid_col, ]), workdir, 
'model_ids') # save the hindcast series to parquet df = pd.DataFrame( @@ -73,7 +77,7 @@ def hindcast(workdir: str, hind_nc_path: str) -> None: columns=ids, index=pd.to_datetime(hnc.variables['time'][:], unit='s') ) - df = df[df.index.dt.year >= 1980] + df = df[df.index.year >= 1980] df.index.name = 'datetime' write_table(df, workdir, 'hindcast') diff --git a/saber/validate.py b/saber/validate.py index 6690dd6..f40a140 100644 --- a/saber/validate.py +++ b/saber/validate.py @@ -10,6 +10,8 @@ from .io import gid_col from .io import metric_nc_name_list from .io import mid_col +from .io import read_table +from .io import scaffold_workdir def sample_gauges(workdir: str, overwrite: bool = False) -> None: @@ -31,8 +33,9 @@ def sample_gauges(workdir: str, overwrite: bool = False) -> None: shutil.rmtree(vr_path) os.mkdir(vr_path) - gt = pd.read_csv(os.path.join(workdir, 'gis', 'gauge_table.csv')) + gt = read_table(workdir, 'gauge_table') initial_row_count = gt.shape[0] + rows_to_drop = round(gt.shape[0] / 10) for i in range(5): @@ -42,8 +45,7 @@ def sample_gauges(workdir: str, overwrite: bool = False) -> None: subpath = os.path.join(vr_path, f'{pct_remain}') # create the new project working directory - os.mkdir(subpath) - workdir(subpath, include_validation=False) + scaffold_workdir(subpath, include_validation=False) # overwrite the processed data directory so we don't need to redo this each time shutil.copytree( From 610044d54dd53782164ff4cae0741555aa2ccc35 Mon Sep 17 00:00:00 2001 From: rileyhales Date: Mon, 29 Aug 2022 15:14:41 -0600 Subject: [PATCH 22/24] organize and debug calibration function --- conda-environment.yml | 18 ++-- requirements.txt | 18 ++-- saber/_calibrate.py | 246 +++++++++++++++++++++++++++++++----------- saber/_utils.py | 57 ---------- 4 files changed, 202 insertions(+), 137 deletions(-) delete mode 100644 saber/_utils.py diff --git a/conda-environment.yml b/conda-environment.yml index 725c053..f7879c0 100644 --- a/conda-environment.yml +++ b/conda-environment.yml @@ -4,17 +4,17 @@ channels: - defaults dependencies: - python=3 - - geopandas - - pandas - - numpy - - numba - contextily - - netCDF4 - - pyarrow + - geopandas + - hydrostats + - joblib - kneed - matplotlib + - natsort + - netCDF4 + - numba + - numpy + - pandas + - pyarrow - requests - scipy - - natsort - - scikit-learn - - joblib diff --git a/requirements.txt b/requirements.txt index 67c3de9..1418fb2 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,15 +1,15 @@ python=3 -geopandas -pandas -numpy -numba contextily -netCDF4 -pyarrow +geopandas +hydrostats +joblib kneed matplotlib +natsort +netCDF4 +numba +numpy +pandas +pyarrow requests scipy -natsort -scikit-learn -joblib \ No newline at end of file diff --git a/saber/_calibrate.py b/saber/_calibrate.py index 597ce05..9a1aa08 100644 --- a/saber/_calibrate.py +++ b/saber/_calibrate.py @@ -9,7 +9,6 @@ import pandas as pd from scipy import interpolate, stats -from ._utils import solve_gumbel1, compute_fdc, compute_scalar_fdc from .io import asgn_gid_col from .io import asgn_mid_col from .io import cal_nc_name @@ -23,8 +22,8 @@ def calibrate(sim_flow_a: pd.DataFrame, obs_flow_a: pd.DataFrame, sim_flow_b: pd fix_seasonally: bool = True, empty_months: str = 'skip', drop_outliers: bool = False, outlier_threshold: int or float = 2.5, filter_scalar_fdc: bool = False, filter_range: tuple = (0, 80), - extrapolate_method: str = 'nearest', fill_value: int or float = None, - fit_gumbel: bool = False, gumbel_range: tuple = (25, 75), ) -> pd.DataFrame: + extrapolate: str = 
'nearest', fill_value: int or float = None, + fit_gumbel: bool = False, fit_range: tuple = (10, 90), ) -> pd.DataFrame: """ Removes the bias from simulated discharge using the SABER method. @@ -39,17 +38,22 @@ def calibrate(sim_flow_a: pd.DataFrame, obs_flow_a: pd.DataFrame, sim_flow_b: pd sim_flow_b (pd.DataFrame): (optional) simulated hydrograph at point B to correct using scalar flow duration curve mapping and the bias relationship at point A. should contain a datetime index with daily values and a single column of discharge values. + fix_seasonally (bool): fix on a monthly (True) or annual (False) basis empty_months (str): how to handle months in the simulated data where no observed data are available. Options: "skip": ignore simulated data for months without + drop_outliers (bool): flag to exclude outliers outlier_threshold (int or float): number of std deviations from mean to exclude from flow duration curve - filter_scalar_fdc (bool): - filter_range (tuple): - extrapolate_method (str): - fill_value (int or float): - fit_gumbel (bool): - gumbel_range (tuple): + + filter_scalar_fdc (bool): flag to filter the scalar flow duration curve + filter_range (tuple): lower and upper bounds of the filter range + + extrapolate (str): method to use for extrapolation. Options: nearest, const, linear, average, max, min + fill_value (int or float): value to use for extrapolation when extrapolate_method='const' + + fit_gumbel (bool): flag to replace extremely low/high corrected flows with values from Gumbel type 1 + fit_range (tuple): lower and upper bounds of exceedance probabilities to replace with Gumbel values Returns: pd.DataFrame with a DateTime index and columns with corrected flow, uncorrected flow, the scalar adjustment @@ -78,84 +82,202 @@ def calibrate(sim_flow_a: pd.DataFrame, obs_flow_a: pd.DataFrame, sim_flow_b: pd fix_seasonally=False, empty_months=empty_months, drop_outliers=drop_outliers, outlier_threshold=outlier_threshold, filter_scalar_fdc=filter_scalar_fdc, filter_range=filter_range, - extrapolate_method=extrapolate_method, fill_value=fill_value, - fit_gumbel=fit_gumbel, gumbel_range=gumbel_range, ) + extrapolate=extrapolate, fill_value=fill_value, + fit_gumbel=fit_gumbel, fit_range=fit_range, ) ) # combine the results from each monthly into a single dataframe (sorted chronologically) and return it return pd.concat(monthly_results).sort_index() - # compute the fdc for paired sim/obs data and compute scalar fdc, either with or without outliers + # compute the flow duration curves if drop_outliers: - # drop outlier data before creating flow duration curves - # https://stackoverflow.com/questions/23199796/detect-and-exclude-outliers-in-pandas-data-frame - sim_fdc = compute_fdc( - sim_flow_a[(np.abs(stats.zscore(sim_flow_a)) < outlier_threshold).all(axis=1)]) - obs_fdc = compute_fdc( - obs_flow_a[(np.abs(stats.zscore(obs_flow_a)) < outlier_threshold).all(axis=1)]) + sim_fdc_a = calc_fdc(_drop_outliers_by_zscore(sim_flow_a, threshold=outlier_threshold), col_name='Q_sim') + sim_fdc_b = calc_fdc(_drop_outliers_by_zscore(sim_flow_b, threshold=outlier_threshold), col_name='Q_sim') + obs_fdc = calc_fdc(_drop_outliers_by_zscore(obs_flow_a, threshold=outlier_threshold), col_name='Q_obs') else: - sim_fdc = compute_fdc(sim_flow_a) - obs_fdc = compute_fdc(obs_flow_a) - - scalar_fdc = compute_scalar_fdc(sim_fdc['flow'].values.flatten(), obs_fdc['flow'].values.flatten()) + sim_fdc_a = calc_fdc(sim_flow_a, col_name='Q_sim') + sim_fdc_b = calc_fdc(sim_flow_b, col_name='Q_sim') + obs_fdc = 
calc_fdc(obs_flow_a, col_name='Q_obs') + # calculate the scalar flow duration curve (at point A with simulated and observed data) + scalar_fdc = calc_sfdc(sim_fdc_a['flow'].values.flatten(), obs_fdc['flow'].values.flatten()) if filter_scalar_fdc: scalar_fdc = scalar_fdc[scalar_fdc['p_exceed'].between(filter_range[0], filter_range[1])] - # determine the percentile of each uncorrected flow using the monthly fdc - values = sim_flow_b.values.flatten() - percentiles = [stats.percentileofscore(values, a) for a in values] - scalars = to_scalar(percentiles) - values = values / scalars + # make interpolators: Q_b -> p_exceed, p_exceed -> scalars_a + # flow at B converted to exceedance probabilities, then matched with the scalar computed at point A + flow_to_percent = _make_interpolator(sim_fdc_b.values, + sim_fdc_b.index, + extrap=extrapolate, + fill_value=fill_value) + + percent_to_scalar = _make_interpolator(scalar_fdc.index, + scalar_fdc.values, + extrap=extrapolate, + fill_value=fill_value) + + # apply interpolators to correct flows at B with data from A + qb_original = sim_flow_b.values.flatten() + p_exceed = flow_to_percent(qb_original) + scalars = percent_to_scalar(p_exceed) + qb_adjusted = qb_original / scalars if fit_gumbel: - tmp = pd.DataFrame(np.transpose([values, percentiles]), columns=('q', 'p')) + qb_adjusted = _fit_extreme_values_to_gumbel(qb_adjusted, p_exceed, fit_range) - # compute the average and standard deviation except for scaled data outside the percentile range specified - mid = tmp[tmp['p'] > gumbel_range[0]] - mid = mid[mid['p'] < gumbel_range[1]] - xbar = statistics.mean(mid['q'].tolist()) - std = statistics.stdev(mid['q'].tolist(), xbar) + return pd.DataFrame(data=np.transpose([qb_adjusted, qb_original, scalars, p_exceed]), + index=sim_flow_b.index.to_list(), + columns=('Q_adjusted', 'Q_original', 'scalars', 'p_exceed')) - q = [] - for p in tmp[tmp['p'] <= gumbel_range[0]]['p'].tolist(): - q.append(solve_gumbel1(std, xbar, 1 / (1 - (p / 100)))) - tmp.loc[tmp['p'] <= gumbel_range[0], 'q'] = q - q = [] - for p in tmp[tmp['p'] >= gumbel_range[1]]['p'].tolist(): - if p >= 100: - p = 99.999 - q.append(solve_gumbel1(std, xbar, 1 / (1 - (p / 100)))) - tmp.loc[tmp['p'] >= gumbel_range[1], 'q'] = q +def calc_fdc(flows: np.array, steps: int = 201, col_name: str = 'Q') -> pd.DataFrame: + """ + Compute flow duration curve (exceedance probabilities) from a list of flows - values = tmp['q'].values + Args: + flows: array of flows + steps: number of steps (exceedance probabilities) to use in the FDC + col_name: name of the column in the returned dataframe + + Returns: + pd.DataFrame with index 'p_exceed' and columns 'Q' (or col_name) + """ + # calculate the FDC and save to parquet + exceed_prob = np.linspace(100, 0, steps) + fdc_flows = np.nanpercentile(flows, exceed_prob) + df = pd.DataFrame(fdc_flows, columns=[col_name, ], index=exceed_prob) + df.index.name = 'p_exceed' + return df - return pd.DataFrame(data=np.transpose([values, sim_flow_b.values.flatten(), scalars, percentiles]), - index=sim_flow_b.index.to_list(), - columns=('flow', 'uncorrected', 'scalars', 'percentile')) +def calc_sfdc(sim_fdc: pd.DataFrame, obs_fdc: pd.DataFrame) -> pd.DataFrame: + """ + Compute the scalar flow duration curve (exceedance probabilities) from two flow duration curves + + Args: + sim_fdc: simulated flow duration curve + obs_fdc: observed flow duration curve + + Returns: + pd.DataFrame with index 'p_exceed' and columns 'Q' + """ + scalars_df = pd.DataFrame(np.divide(sim_fdc.values, obs_fdc.values), 
columns=['scalars'], index=sim_fdc.index) + scalars_df.replace(np.inf, np.nan, inplace=True) + scalars_df.dropna(inplace=True) + return scalars_df -def _make_interpolator(x_input: np.array, y_output: np.array, extrap_method: str = 'nearest', - const: int or float = None) -> interpolate.interp1d: + +def _drop_outliers_by_zscore(df: pd.DataFrame, threshold: float = 3) -> pd.DataFrame: + """ + Drop outliers from a dataframe by their z-score and a threshold + Based on https://stackoverflow.com/questions/23199796/detect-and-exclude-outliers-in-pandas-data-frame + Args: + df: dataframe to drop outliers from + threshold: z-score threshold + + Returns: + pd.DataFrame with outliers removed + """ + return df[(np.abs(stats.zscore(df)) < threshold).all(axis=1)] + + +def _filter_sfdc(sfdc: pd.DataFrame, filter_range: list) -> pd.DataFrame: + """ + Filter the scalar flow duration curve by the specified range + + Args: + sfdc: scalar flow duration curve DataFrame + filter_range: list of [lower_bound: int or float, upper_bound: int or float] + + Returns: + pd.DataFrame: filtered scalar flow duration curve + """ + return sfdc[np.logical_and(sfdc.index > filter_range[0], sfdc.index < filter_range[1])] + + +def _make_interpolator(x: np.array, y: np.array, extrap: str = 'nearest', + fill_value: int or float = None) -> interpolate.interp1d: + """ + Make an interpolator from two arrays + + Args: + x: x values + y: y values + extrap: method for extrapolation: nearest, const, linear, average, max, min + fill_value: value to use when extrap='const' + + Returns: + interpolate.interp1d + """ # make interpolator which converts the percentiles to scalars - if extrap_method == 'nearest': - return interpolate.interp1d(x_input, y_output, fill_value='extrapolate', kind='nearest') - elif extrap_method == 'const': - if const is None: + if extrap == 'nearest': + return interpolate.interp1d(x, y, fill_value='extrapolate', kind='nearest') + elif extrap == 'const': + if fill_value is None: raise ValueError('Must provide the const kwarg when extrap_method="const"') - return interpolate.interp1d(x_input, y_output, fill_value=const, bounds_error=False) - elif extrap_method == 'linear': - return interpolate.interp1d(x_input, y_output, fill_value='extrapolate') - elif extrap_method == 'average': - return interpolate.interp1d(x_input, y_output, fill_value=np.mean(y_output), bounds_error=False) - elif extrap_method == 'max' or extrap_method == 'maximum': - return interpolate.interp1d(x_input, y_output, fill_value=np.max(y_output), bounds_error=False) - elif extrap_method == 'min' or extrap_method == 'minimum': - return interpolate.interp1d(x_input, y_output, fill_value=np.min(y_output), bounds_error=False) + return interpolate.interp1d(x, y, fill_value=fill_value, bounds_error=False) + elif extrap == 'linear': + return interpolate.interp1d(x, y, fill_value='extrapolate') + elif extrap == 'average': + return interpolate.interp1d(x, y, fill_value=np.mean(y), bounds_error=False) + elif extrap == 'max' or extrap == 'maximum': + return interpolate.interp1d(x, y, fill_value=np.max(y), bounds_error=False) + elif extrap == 'min' or extrap == 'minimum': + return interpolate.interp1d(x, y, fill_value=np.min(y), bounds_error=False) else: raise ValueError('Invalid extrapolation method provided') +def _solve_gumbel1(std, xbar, rp): + """ + Solves the Gumbel Type I pdf = exp(-exp(-b)) + + Args: + std: standard deviation of the dataset + xbar: mean of the dataset + rp: return period to calculate in years + + Returns: + float: discharge value + """ + 
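    # derivation, assuming the intended distribution is Gumbel Type I (EV1):
    # the quantile for non-exceedance probability F = 1 - 1/rp is x = mu - beta * ln(-ln(F)),
    # with method-of-moments estimates beta = std * sqrt(6) / pi ~= 0.7797 * std and
    # mu = xbar - 0.5772 * beta ~= xbar - 0.45 * std, which is where the constants
    # 0.7797 and 0.45 in the return statement come from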
return -np.log(-np.log(1 - (1 / rp))) * std * .7797 + xbar - (.45 * std) + + +def _fit_extreme_values_to_gumbel(q_adjust: np.array, p_exceed: np.array, fit_range: tuple = None) -> np.array: + """ + Replace the extreme values from the corrected data with values based on the gumbel distribution + + Args: + q_adjust: adjusted flows to be refined + p_exceed: exceedance probabilities of the adjusted flows + fit_range: range of exceedance probabilities to fit to the Gumbel distribution + + Returns: + np.array of the flows with the extreme values replaced + """ + all_values = pd.DataFrame(np.transpose([q_adjust, p_exceed]), columns=('q', 'p')) + # compute the average and standard deviation for the values within the user specified fit_range + mid_vals = all_values.copy() + mid_vals = mid_vals[np.logical_and(mid_vals['p'] > fit_range[0], mid_vals['p'] < fit_range[1])] + xbar = statistics.mean(mid_vals['q'].values) + std = statistics.stdev(mid_vals['q'].values, xbar) + + # todo check that this is correct + q = [] + for p in mid_vals[mid_vals['p'] <= fit_range[0]]['p'].tolist(): + q.append(_solve_gumbel1(std, xbar, 1 / (1 - (p / 100)))) + mid_vals.loc[mid_vals['p'] <= fit_range[0], 'q'] = q + + q = [] + for p in mid_vals[mid_vals['p'] >= fit_range[1]]['p'].tolist(): + if p >= 100: + p = 99.999 + q.append(_solve_gumbel1(std, xbar, 1 / (1 - (p / 100)))) + mid_vals.loc[mid_vals['p'] >= fit_range[1], 'q'] = q + + qb_adjusted = mid_vals['q'].values + return qb_adjusted + + def calibrate_region(workdir: str, assign_table: pd.DataFrame, gauge_table: pd.DataFrame = None, obs_data_dir: str = None) -> None: """ diff --git a/saber/_utils.py b/saber/_utils.py deleted file mode 100644 index 3732dc1..0000000 --- a/saber/_utils.py +++ /dev/null @@ -1,57 +0,0 @@ -import math - -import hydrostats as hs -import hydrostats.data as hd -import numpy as np -import pandas as pd - - -def solve_gumbel1(std, xbar, rp): - """ - Solves the Gumbel Type I pdf = exp(-exp(-b)) - where b is the covariate - """ - # xbar = statistics.mean(year_max_flow_list) - # std = statistics.stdev(year_max_flow_list, xbar=xbar) - return -math.log(-math.log(1 - (1 / rp))) * std * .7797 + xbar - (.45 * std) - - -def statistics_tables(corrected: pd.DataFrame, simulated: pd.DataFrame, observed: pd.DataFrame) -> pd.DataFrame: - # merge the datasets together - merged_sim_obs = hd.merge_data(sim_df=simulated, obs_df=observed) - merged_cor_obs = hd.merge_data(sim_df=corrected, obs_df=observed) - - metrics = ['ME', 'RMSE', 'NRMSE (Mean)', 'MAPE', 'NSE', 'KGE (2009)', 'KGE (2012)'] - # Merge Data - table1 = hs.make_table(merged_dataframe=merged_sim_obs, metrics=metrics) - table2 = hs.make_table(merged_dataframe=merged_cor_obs, metrics=metrics) - - table2 = table2.rename(index={'Full Time Series': 'Corrected Full Time Series'}) - table1 = table1.rename(index={'Full Time Series': 'Original Full Time Series'}) - table1 = table1.transpose() - table2 = table2.transpose() - - return pd.merge(table1, table2, right_index=True, left_index=True) - - -def compute_fdc(flows: np.array, steps: int = 500, exceed: bool = True, col_name: str = 'flow'): - percentiles = [round((1 / steps) * i * 100, 5) for i in range(steps + 1)] - flows = np.nanpercentile(flows, percentiles) - if exceed: - percentiles.reverse() - return pd.DataFrame(flows, columns=[col_name, ], index=percentiles) - - -def compute_scalar_fdc(sim_fdc: pd.DataFrame, obs_fdc: pd.DataFrame): - sim_fdc = compute_fdc(sim_fdc) - obs_fdc = compute_fdc(obs_fdc) - ratios = np.divide(sim_fdc['flow'].values.flatten(), 
obs_fdc['flow'].values.flatten()) - columns = (sim_fdc.columns[0], 'p_exceed', 'scalars') - scalars_df = pd.DataFrame( - np.transpose([sim_fdc.values[:, 0], sim_fdc['p_exceed'].values.flatten(), ratios]), - columns=columns - ) - scalars_df.replace(np.inf, np.nan, inplace=True) - scalars_df.dropna(inplace=True) - - return scalars_df From d43f38d80f657a2615ef4129ac433b4daf389fd6 Mon Sep 17 00:00:00 2001 From: rileyhales Date: Mon, 29 Aug 2022 15:24:21 -0600 Subject: [PATCH 23/24] rename for pypi --- setup.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/setup.py b/setup.py index 6ef3807..0453132 100644 --- a/setup.py +++ b/setup.py @@ -6,12 +6,12 @@ with open('requirements.txt', 'r') as req: install_requires = req.read().splitlines() -description = 'tools for hydrological bias correction on large models' +description = 'tools for bias correction on large hydrological models' version = '0.4.0' setup( - name='saber', - packages=['saber'], + name='hydrosaber', + packages=['saber', ], version=version, description=description, long_description=long_description, From 79e9df99b2cd37ab06b02f492f37589139f3a40c Mon Sep 17 00:00:00 2001 From: rileyhales Date: Mon, 29 Aug 2022 15:26:51 -0600 Subject: [PATCH 24/24] remove python from requirements.txt --- requirements.txt | 1 - 1 file changed, 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index 1418fb2..16193a7 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,4 +1,3 @@ -python=3 contextily geopandas hydrostats