From 3fcf4ace44ea14a2426da02f1a4012105910ce37 Mon Sep 17 00:00:00 2001 From: Riley Hales Date: Sun, 14 Aug 2022 23:49:29 -0600 Subject: [PATCH 01/24] use parquet, streamline functions --- README.md | 8 +-- conda-environment.yml | 15 +++++ examples/example_script.py | 2 +- saber/__init__.py | 6 +- saber/_calibrate.py | 118 +++++++++++++++++++++---------------- saber/_vocab.py | 28 +++++++-- saber/_workflow.py | 5 +- saber/analysis.py | 2 +- saber/cluster.py | 11 ++-- saber/gis.py | 47 +++++++++------ saber/prep.py | 98 +++++++++++++++++------------- saber/utils.py | 15 +++-- 12 files changed, 219 insertions(+), 136 deletions(-) create mode 100644 conda-environment.yml diff --git a/README.md b/README.md index 42d10cc..dfde9bd 100755 --- a/README.md +++ b/README.md @@ -238,17 +238,17 @@ import saber as saber workdir = '/path/to/working/directory' -saber.prep.historical_simulation( +saber.prep.hindcast( workdir, - '/path/to/historical/simulation/netcdf.nc' # optional - if nc not stored in data_inputs folder + '/path/to/historical/simulation/netcdf.nc' # optional - if nc not stored in data_inputs folder ) saber.prep.hist_sim_table( workdir, - '/path/to/historical/simulation/netcdf.nc' # optional - if nc not stored in data_inputs folder + '/path/to/historical/simulation/netcdf.nc' # optional - if nc not stored in data_inputs folder ) saber.prep.observed_data( workdir, - '/path/to/obs/csv/directory' # optional - if csvs not stored in workdir/data_inputs/obs_csvs + '/path/to/obs/csv/directory' # optional - if csvs not stored in workdir/data_inputs/obs_csvs ) ``` diff --git a/conda-environment.yml b/conda-environment.yml new file mode 100644 index 0000000..ff491bd --- /dev/null +++ b/conda-environment.yml @@ -0,0 +1,15 @@ +name: saber +channels: + - conda-forge + - defaults +dependencies: + - python=3 + - geopandas + - pandas + - numpy + - contextily + - xarray + - netCDF4 + - tslearn + - pyarrow + - kneed diff --git a/examples/example_script.py b/examples/example_script.py index bedb6a4..b114050 100644 --- a/examples/example_script.py +++ b/examples/example_script.py @@ -30,7 +30,7 @@ # Prepare the observation and simulation data - Only need to do this step 1x ever print('Preparing data') -saber.prep.historical_simulation(workdir) +saber.prep.hindcast(workdir) # Generate the assignments table print('Generate Assignment Table') diff --git a/saber/__init__.py b/saber/__init__.py index 6b24826..8d13a4f 100644 --- a/saber/__init__.py +++ b/saber/__init__.py @@ -1,5 +1,5 @@ from saber._workflow import prep_region, analyze_region -from saber._calibrate import calibrate_stream, calibrate_region +from saber._calibrate import calibrate, calibrate_region import saber.table import saber.prep @@ -10,9 +10,11 @@ import saber.validate import saber.analysis +import saber._vocab + __all__ = ['prep_region', 'analyze_region', - 'calibrate_stream', 'calibrate_region', + 'calibrate', 'calibrate_region', 'table', 'prep', 'assign', 'gis', 'cluster', 'utils', 'validate', 'analysis'] __author__ = 'Riley Hales' __version__ = '0.3.0' diff --git a/saber/_calibrate.py b/saber/_calibrate.py index 2172509..7ead055 100644 --- a/saber/_calibrate.py +++ b/saber/_calibrate.py @@ -15,48 +15,67 @@ from ._vocab import asgn_gid_col from ._vocab import metric_list from ._vocab import metric_nc_name_list -from ._vocab import sim_ts_pickle +from ._vocab import hindcast_archive from ._vocab import cal_nc_name -def calibrate_stream(sim_flow_a: pd.DataFrame, obs_flow_a: pd.DataFrame, sim_flow_b: pd.DataFrame, - fix_seasonally: bool = True, 
- drop_outliers: bool = False, outlier_threshold: int or float = 2.5, - filter_scalar_fdc: bool = False, filter_range: tuple = (0, 80), - extrapolate_method: str = 'nearest', fill_value: int or float = None, - fit_gumbel: bool = False, gumbel_range: tuple = (25, 75), ) -> pd.DataFrame: +def calibrate(sim_flow_a: pd.DataFrame, obs_flow_a: pd.DataFrame, sim_flow_b: pd.DataFrame = None, + fix_seasonally: bool = True, empty_months: str = 'skip', + drop_outliers: bool = False, outlier_threshold: int or float = 2.5, + filter_scalar_fdc: bool = False, filter_range: tuple = (0, 80), + extrapolate_method: str = 'nearest', fill_value: int or float = None, + fit_gumbel: bool = False, gumbel_range: tuple = (25, 75), ) -> pd.DataFrame: """ - Given the simulated and observed stream flow at location a, attempts to the remove the bias from simulated - stream flow at point b. This + Removes the bias from simulated discharge using the SABER method. + + Given simulated and observed discharge at location A, removes bias from simulated data at point A. + Given simulated and observed discharge at location A, removes bias from simulated data at point B, if given B. Args: - sim_flow_a (pd.DataFrame): simulated hydrograph at point A - obs_flow_a (pd.DataFrame): observed hydrograph at point A - sim_flow_b (pd.DataFrame): simulated hydrograph at point B + sim_flow_a (pd.DataFrame): simulated hydrograph at point A. should contain a datetime index with daily values + and a single column of discharge values. + obs_flow_a (pd.DataFrame): observed hydrograph at point A. should contain a datetime index with daily values + and a single column of discharge values. + sim_flow_b (pd.DataFrame): (optional) simulated hydrograph at point B to correct using scalar flow duration + curve mapping and the bias relationship at point A. should contain a datetime index with daily values + and a single column of discharge values. fix_seasonally (bool): fix on a monthly (True) or annual (False) basis + empty_months (str): how to handle months in the simulated data where no observed data are available. Options: + "skip": ignore simulated data for months without drop_outliers (bool): flag to exclude outliers outlier_threshold (int or float): number of std deviations from mean to exclude from flow duration curve filter_scalar_fdc (bool): filter_range (tuple): - extrapolate_method (bool): + extrapolate_method (str): fill_value (int or float): fit_gumbel (bool): gumbel_range (tuple): Returns: - pd.DataFrame of the + pd.DataFrame with a DateTime index and columns with corrected flow, uncorrected flow, the scalar adjustment + factor applied to correct the discharge, and the percentile of the uncorrected flow (in the seasonal grouping, + if applicable). """ + if sim_flow_b is None: + sim_flow_b = sim_flow_a.copy() if fix_seasonally: # list of the unique months in the historical simulation. should always be 1->12 but just in case... monthly_results = [] for month in sorted(set(sim_flow_a.index.strftime('%m'))): - # filter data to only be current iteration's month - mon_sim_data = sim_flow_a[sim_flow_a.index.month == int(month)].dropna() + # filter data to current iteration's month mon_obs_data = obs_flow_a[obs_flow_a.index.month == int(month)].dropna() + + if mon_obs_data.empty: + if empty_months == 'skip': + continue + else: + raise ValueError(f'Invalid value for argument "empty_months". 
Given: {empty_months}.') + + mon_sim_data = sim_flow_a[sim_flow_a.index.month == int(month)].dropna() mon_cor_data = sim_flow_b[sim_flow_b.index.month == int(month)].dropna() - monthly_results.append(calibrate_stream( + monthly_results.append(calibrate( mon_sim_data, mon_obs_data, mon_cor_data, - fix_seasonally=False, + fix_seasonally=False, empty_months=empty_months, drop_outliers=drop_outliers, outlier_threshold=outlier_threshold, filter_scalar_fdc=filter_scalar_fdc, filter_range=filter_range, extrapolate_method=extrapolate_method, fill_value=fill_value, @@ -67,7 +86,7 @@ def calibrate_stream(sim_flow_a: pd.DataFrame, obs_flow_a: pd.DataFrame, sim_flo # compute the fdc for paired sim/obs data and compute scalar fdc, either with or without outliers if drop_outliers: - # drop outlier data + # drop outlier data before creating flow duration curves # https://stackoverflow.com/questions/23199796/detect-and-exclude-outliers-in-pandas-data-frame sim_fdc = compute_fdc( sim_flow_a[(np.abs(stats.zscore(sim_flow_a)) < outlier_threshold).all(axis=1)]) @@ -77,39 +96,16 @@ def calibrate_stream(sim_flow_a: pd.DataFrame, obs_flow_a: pd.DataFrame, sim_flo sim_fdc = compute_fdc(sim_flow_a) obs_fdc = compute_fdc(obs_flow_a) - scalar_fdc = compute_scalar_fdc(obs_fdc['flow'].values.flatten(), sim_fdc['flow'].values.flatten()) + scalar_fdc = compute_scalar_fdc(sim_fdc['flow'].values.flatten(), obs_fdc['flow'].values.flatten()) if filter_scalar_fdc: - scalar_fdc = scalar_fdc[scalar_fdc['Exceedance Probability'] >= filter_range[0]] - scalar_fdc = scalar_fdc[scalar_fdc['Exceedance Probability'] <= filter_range[1]] - - # Convert the percentiles - if extrapolate_method == 'nearest': - to_scalar = interpolate.interp1d(scalar_fdc.values[:, 0], scalar_fdc.values[:, 1], - fill_value='extrapolate', kind='nearest') - elif extrapolate_method == 'value': - to_scalar = interpolate.interp1d(scalar_fdc.values[:, 0], scalar_fdc.values[:, 1], - fill_value=fill_value, bounds_error=False) - elif extrapolate_method == 'linear': - to_scalar = interpolate.interp1d(scalar_fdc.values[:, 0], scalar_fdc.values[:, 1], - fill_value='extrapolate') - elif extrapolate_method == 'average': - to_scalar = interpolate.interp1d(scalar_fdc.values[:, 0], scalar_fdc.values[:, 1], - fill_value=np.mean(scalar_fdc.values[:, 1]), bounds_error=False) - elif extrapolate_method == 'max' or extrapolate_method == 'maximum': - to_scalar = interpolate.interp1d(scalar_fdc.values[:, 0], scalar_fdc.values[:, 1], - fill_value=np.max(scalar_fdc.values[:, 1]), bounds_error=False) - elif extrapolate_method == 'min' or extrapolate_method == 'minimum': - to_scalar = interpolate.interp1d(scalar_fdc.values[:, 0], scalar_fdc.values[:, 1], - fill_value=np.min(scalar_fdc.values[:, 1]), bounds_error=False) - else: - raise ValueError('Invalid extrapolation method provided') + scalar_fdc = scalar_fdc[scalar_fdc['p_exceed'].between(filter_range[0], filter_range[1])] # determine the percentile of each uncorrected flow using the monthly fdc values = sim_flow_b.values.flatten() percentiles = [stats.percentileofscore(values, a) for a in values] scalars = to_scalar(percentiles) - values = values * scalars + values = values / scalars if fit_gumbel: tmp = pd.DataFrame(np.transpose([values, percentiles]), columns=('q', 'p')) @@ -134,15 +130,37 @@ def calibrate_stream(sim_flow_a: pd.DataFrame, obs_flow_a: pd.DataFrame, sim_flo values = tmp['q'].values - return pd.DataFrame(data=np.transpose([values, scalars, percentiles]), + return pd.DataFrame(data=np.transpose([values, 
sim_flow_b.values.flatten(), scalars, percentiles]), index=sim_flow_b.index.to_list(), - columns=('flow', 'scalars', 'percentile')) + columns=('flow', 'uncorrected', 'scalars', 'percentile')) + + +def _make_interpolator(x_input: np.array, y_output: np.array, extrap_method: str = 'nearest', + const: int or float = None) -> interpolate.interp1d: + # make interpolator which converts the percentiles to scalars + if extrap_method == 'nearest': + interpolator = interpolate.interp1d(x_input, y_output, fill_value='extrapolate', kind='nearest') + elif extrap_method == 'const': + if const is None: + raise ValueError('Must provide the const kwarg when extrap_method="const"') + interpolator = interpolate.interp1d(x_input, y_output, fill_value=const, bounds_error=False) + elif extrap_method == 'linear': + interpolator = interpolate.interp1d(x_input, y_output, fill_value='extrapolate') + elif extrap_method == 'average': + interpolator = interpolate.interp1d(x_input, y_output, fill_value=np.mean(y_output), bounds_error=False) + elif extrap_method == 'max' or extrap_method == 'maximum': + interpolator = interpolate.interp1d(x_input, y_output, fill_value=np.max(y_output), bounds_error=False) + elif extrap_method == 'min' or extrap_method == 'minimum': + interpolator = interpolate.interp1d(x_input, y_output, fill_value=np.min(y_output), bounds_error=False) + else: + raise ValueError('Invalid extrapolation method provided') + return interpolator def calibrate_region(workdir: str, assign_table: pd.DataFrame, gauge_table: pd.DataFrame = None, obs_data_dir: str = None) -> None: """ - Creates a netCDF with the + Creates a netCDF of all corrected flows in a region. Args: workdir: path to project working directory @@ -159,7 +177,7 @@ def calibrate_region(workdir: str, assign_table: pd.DataFrame, obs_data_dir = os.path.join(workdir, 'data_inputs', 'obs_csvs') bcs_nc_path = os.path.join(workdir, cal_nc_name) - ts = pd.read_pickle(os.path.join(workdir, 'data_processed', sim_ts_pickle)) + ts = pd.read_pickle(os.path.join(workdir, 'data_processed', hindcast_archive)) # create the new netcdf bcs_nc = nc.Dataset(bcs_nc_path, 'w') @@ -238,7 +256,7 @@ def calibrate_region(workdir: str, assign_table: pd.DataFrame, continue try: - calibrated_df = calibrate_stream(ts[[asgn_mid]], obs_df, ts[[model_id]], ) + calibrated_df = calibrate(ts[[asgn_mid]], obs_df, ts[[model_id]], ) c_array[:, idx] = calibrated_df['flow'].values p_array[:, idx] = calibrated_df['percentile'].values s_array[:, idx] = calibrated_df['scalars'].values diff --git a/saber/_vocab.py b/saber/_vocab.py index 29a077d..dcde03e 100644 --- a/saber/_vocab.py +++ b/saber/_vocab.py @@ -1,5 +1,10 @@ +import os +import glob +import pandas as pd + # assign table and gis_input file required column names -mid_col = 'model_id' +# mid_col = 'model_id' +mid_col = 'COMID' gid_col = 'gauge_id' asgn_mid_col = 'assigned_model_id' asgn_gid_col = 'assigned_gauge_id' @@ -8,11 +13,26 @@ area_col = 'drain_area' order_col = 'stream_order' +# scaffolded folders +gis_inp = 'gis_inputs' +gis_out = 'gis_outputs' + # name of some files produced by the algorithm cluster_count_file = 'best-fit-cluster-count.json' cal_nc_name = 'calibrated_simulated_flow.nc' -sim_ts_pickle = 'sim_time_series.pickle' +hindcast_archive = 'hindcast.parquet' +hindcast_fdc_archive = 'hindcast_fdc.parquet' +drain_table = 'drain_table.parquet' +gauge_table = 'gauge_table.parquet' # metrics computed on validation sets -metric_list = ['ME', 'MAE', 'RMSE', 'NRMSE (Mean)', 'MAPE', 'NSE', 'KGE (2012)'] 
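The percentile-to-scalar mapping that `_make_interpolator` builds is the core of the correction step above: each uncorrected flow's exceedance percentile is looked up on the scalar flow duration curve and the flow is divided by the resulting ratio. Below is a minimal standalone sketch of that idea with made-up percentile, scalar, and flow values; only the `interp1d` call mirrors what the patch does.

```python
# Illustration only: a 'nearest' interpolator from exceedance percentiles to scalars,
# in the style of _make_interpolator, applied to a few hypothetical flows.
import numpy as np
from scipy import interpolate

p_exceed = np.array([0.0, 25.0, 50.0, 75.0, 100.0])  # hypothetical exceedance probabilities
scalars = np.array([1.8, 1.4, 1.1, 0.9, 0.7])        # hypothetical sim/obs FDC ratios

to_scalar = interpolate.interp1d(p_exceed, scalars, fill_value='extrapolate', kind='nearest')

flows = np.array([120.0, 35.0, 4.0])                 # hypothetical uncorrected flows
percentiles = np.array([10.0, 55.0, 95.0])           # their percentiles in the simulated record
corrected = flows / to_scalar(percentiles)           # ratios > 1 reduce the flow, < 1 increase it
```

Dividing by the scalar, rather than multiplying, matches the corrected `values = values / scalars` line introduced in this patch.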
-metric_nc_name_list = ['ME', 'MAE', 'RMSE', 'NRMSE', 'MAPE', 'NSE', 'KGE2012'] +metric_list = ['ME', 'MAE', 'RMSE', 'MAPE', 'NSE', 'KGE (2012)'] +metric_nc_name_list = ['ME', 'MAE', 'RMSE', 'MAPE', 'NSE', 'KGE2012'] + + +def read_drain_table(workdir: str) -> pd.DataFrame: + return pd.read_parquet(os.path.join(workdir, gis_inp, drain_table)) + + +def guess_hindcast_path(workdir: str) -> str: + return glob.glob(os.path.join(workdir, 'data_inputs', '*.nc*'))[0] diff --git a/saber/_workflow.py b/saber/_workflow.py index d313aba..0a97402 100644 --- a/saber/_workflow.py +++ b/saber/_workflow.py @@ -1,6 +1,6 @@ import pandas as pd -from .prep import historical_simulation +from .prep import hindcast from .table import gen, cache from .cluster import generate, summarize @@ -10,8 +10,7 @@ def prep_region(workdir: str) -> None: - historical_simulation(workdir) - # observed_data(workdir) + hindcast(workdir) return diff --git a/saber/analysis.py b/saber/analysis.py index b6ada4a..56a0f40 100644 --- a/saber/analysis.py +++ b/saber/analysis.py @@ -1,6 +1,6 @@ import os -import grids +# import grids import pandas as pd import plotly.graph_objects as go diff --git a/saber/cluster.py b/saber/cluster.py index 122e78b..cf6f5b7 100755 --- a/saber/cluster.py +++ b/saber/cluster.py @@ -17,7 +17,7 @@ def generate(workdir: str) -> None: """ - Creates trained kmeans model pickle files and plots of the results saved as png images + Trains DTW kmeans model pickle files and plots of the results saved as png images Args: workdir: path to the project directory @@ -27,9 +27,9 @@ def generate(workdir: str) -> None: """ best_fit = {} - for table in glob.glob(os.path.join(workdir, 'data_processed', '*.csv')): + for table in glob.glob(os.path.join(workdir, 'data_processed', '*_fdc_*.parquet.gzip')): # read the data and transform - time_series = pd.read_csv(table, index_col=0).dropna(axis=1) + time_series = pd.read_parquet(table) time_series = np.transpose(time_series.values) time_series = TimeSeriesScalerMeanVariance().fit_transform(time_series) @@ -37,7 +37,7 @@ def generate(workdir: str) -> None: dataset = os.path.splitext(os.path.basename(table))[0] inertia = {'number': [], 'inertia': []} - for num_cluster in range(2, 11): + for num_cluster in range(2, 16): # build the kmeans model km = TimeSeriesKMeans(n_clusters=num_cluster, random_state=0) km.fit_predict(time_series) @@ -50,6 +50,7 @@ def generate(workdir: str) -> None: # generate a plot of the clusters size = time_series.shape[1] fig = plt.figure(figsize=(30, 15), dpi=450) + fig.suptitle("Dynamic Time Warping Clustering") assigned_clusters = km.labels_ for i in range(num_cluster): plt.subplot(2, math.ceil(num_cluster / 2), i + 1) @@ -59,8 +60,6 @@ def generate(workdir: str) -> None: plt.xlim(0, size) plt.ylim(-3.5, 3.5) plt.text(0.55, 0.85, f'Cluster {i}', transform=plt.gca().transAxes) - if i == math.floor(num_cluster / 4): - plt.title("Euclidean $k$-means") plt.tight_layout() fig.savefig(os.path.join(workdir, 'kmeans_images', f'{dataset}-{num_cluster}-clusters.png')) plt.close(fig) diff --git a/saber/gis.py b/saber/gis.py index 5c494e8..40a2759 100644 --- a/saber/gis.py +++ b/saber/gis.py @@ -43,7 +43,7 @@ def generate_all(workdir: str, assign_table: pd.DataFrame, drain_shape: str, pre def clip_by_assignment(workdir: str, assign_table: pd.DataFrame, drain_shape: str, prefix: str = '', id_column: str = mid_col) -> None: """ - Creates geojsons (in workdir/gis_outputs) for each unique value in the assignment column + Creates Geopackage files in workdir/gis_outputs for 
each unique value in the assignment column Args: workdir: the path to the working directory for the project @@ -63,18 +63,18 @@ def clip_by_assignment(workdir: str, assign_table: pd.DataFrame, drain_shape: st for reason in set(assign_table[reason_col].dropna().values): ids = assign_table[assign_table[reason_col] == reason][mid_col].values subset = dl[dl[id_column].isin(ids)] - name = f'{prefix}{"_" if prefix else ""}assignments_{reason}.json' + name = f'{prefix}{"_" if prefix else ""}assignments_{reason}.gpkg' if subset.empty: continue else: - subset.to_file(os.path.join(save_dir, name), driver='GeoJSON') + subset.to_file(os.path.join(save_dir, name)) return def clip_by_cluster(workdir: str, assign_table: pd.DataFrame, drain_shape: str, prefix: str = '', id_column: str = mid_col) -> None: """ - Creates GIS files (in workdir/gis_outputs) of the drainage lines based on which fdc cluster they were assigned to + Creates Geopackage files in workdir/gis_outputs of the drainage lines based on the fdc cluster they were assigned to Args: workdir: the path to the working directory for the project @@ -90,19 +90,19 @@ def clip_by_cluster(workdir: str, assign_table: pd.DataFrame, drain_shape: str, cluster_types = [a for a in assign_table if 'cluster' in a] for ctype in cluster_types: for gnum in sorted(set(assign_table[ctype].dropna().values)): - savepath = os.path.join(workdir, 'gis_outputs', f'{prefix}{"_" if prefix else ""}{ctype}-{int(gnum)}.json') + savepath = os.path.join(workdir, 'gis_outputs', f'{prefix}{"_" if prefix else ""}{ctype}-{int(gnum)}.gpkg') ids = assign_table[assign_table[ctype] == gnum][mid_col].values if dl_gdf[dl_gdf[id_column].isin(ids)].empty: continue else: - dl_gdf[dl_gdf[id_column].isin(ids)].to_file(savepath, driver='GeoJSON') + dl_gdf[dl_gdf[id_column].isin(ids)].to_file(savepath) return def clip_by_unassigned(workdir: str, assign_table: pd.DataFrame, drain_shape: str, prefix: str = '', id_column: str = mid_col) -> None: """ - Creates geojsons (in workdir/gis_outputs) of the drainage lines which haven't been assigned a gauge yet + Creates Geopackage files in workdir/gis_outputs of the drainage lines which haven't been assigned a gauge yet Args: workdir: the path to the working directory for the project @@ -120,15 +120,15 @@ def clip_by_unassigned(workdir: str, assign_table: pd.DataFrame, drain_shape: st if subset.empty: warnings.warn('Empty filter: No streams are unassigned') return - savepath = os.path.join(workdir, 'gis_outputs', f'{prefix}{"_" if prefix else ""}assignments_unassigned.json') - subset.to_file(savepath, driver='GeoJSON') + savepath = os.path.join(workdir, 'gis_outputs', f'{prefix}{"_" if prefix else ""}assignments_unassigned.gpkg') + subset.to_file(savepath) return def clip_by_ids(workdir: str, ids: list, drain_shape: str, prefix: str = '', id_column: str = mid_col) -> None: """ - Creates geojsons (in workdir/gis_outputs) of the subset of 'drain_shape' with an ID in the specified list + Creates Geopackage files in workdir/gis_outputs of the subset of 'drain_shape' with an ID in the specified list Args: workdir: path to the project directory @@ -142,14 +142,14 @@ def clip_by_ids(workdir: str, ids: list, drain_shape: str, prefix: str = '', """ dl = gpd.read_file(drain_shape) save_dir = os.path.join(workdir, 'gis_outputs') - name = f'{prefix}{"_" if prefix else ""}id_subset.json' - dl[dl[id_column].isin(ids)].to_file(os.path.join(save_dir, name), driver='GeoJSON') + name = f'{prefix}{"_" if prefix else ""}id_subset.gpkg' + 
dl[dl[id_column].isin(ids)].to_file(os.path.join(save_dir, name)) return def validation_maps(workdir: str, gauge_shape: str, val_table: pd.DataFrame = None, prefix: str = '') -> None: """ - Creates geojsons (in workdir/gis_outputs) of subsets of the gauge_shape. + Creates Geopackage files in workdir/gis_outputs of subsets of the gauge_shape. 1 is the fill gauge shape with added attribute columns for all the computed stats. There are 2 for each of the 5 validation groups; 1 which shows the gauges included in the validation set and 1 which shows gauges that were excluded from the validation set. @@ -170,7 +170,7 @@ def validation_maps(workdir: str, gauge_shape: str, val_table: pd.DataFrame = No # merge gauge table with the validation table gdf = gpd.read_file(gauge_shape) gdf = gdf.merge(val_table, on=gid_col, how='inner') - gdf.to_file(os.path.join(save_dir, 'gauges_with_validation_stats.json'), driver='GeoJSON') + gdf.to_file(os.path.join(save_dir, 'gauges_with_validation_stats.gpkg')) core_columns = [mid_col, gid_col, 'geometry'] @@ -182,12 +182,12 @@ def validation_maps(workdir: str, gauge_shape: str, val_table: pd.DataFrame = No gdf_sub = gdf[cols_to_select] gdf_sub = gdf_sub.rename(columns={f'{metric}_{val_set}': metric}) - name = f'{prefix}{"_" if prefix else ""}valset_{val_set}_{metric}_included.json' - gdf_sub[gdf_sub[val_set] == 1].to_file(os.path.join(save_dir, name), driver='GeoJSON') + name = f'{prefix}{"_" if prefix else ""}valset_{val_set}_{metric}_included.gpkg' + gdf_sub[gdf_sub[val_set] == 1].to_file(os.path.join(save_dir, name)) - name = f'{prefix}{"_" if prefix else ""}valset_{val_set}_{metric}_excluded.json' + name = f'{prefix}{"_" if prefix else ""}valset_{val_set}_{metric}_excluded.gpkg' exc = gdf_sub[gdf_sub[val_set] == 0] - exc.to_file(os.path.join(save_dir, name), driver='GeoJSON') + exc.to_file(os.path.join(save_dir, name)) if metric == 'KGE2012': histomaps(exc, metric, val_set, workdir) @@ -195,6 +195,17 @@ def validation_maps(workdir: str, gauge_shape: str, val_table: pd.DataFrame = No def histomaps(gdf: gpd.GeoDataFrame, metric: str, prct: str, workdir: str) -> None: + """ + + Args: + gdf: + metric: + prct: + workdir: + + Returns: + + """ core_columns = [mid_col, gid_col, 'geometry'] # world = gpd.read_file(gpd.datasets.get_path('naturalearth_lowres')) # world.plot(ax=axm, color='white', edgecolor='black') diff --git a/saber/prep.py b/saber/prep.py index f08f795..3d09c6a 100755 --- a/saber/prep.py +++ b/saber/prep.py @@ -1,76 +1,92 @@ -import glob import os import grids import pandas as pd -import xarray as xr +import geopandas as gpd +import netCDF4 as nc +import numpy as np from .utils import compute_fdc from ._vocab import mid_col +from ._vocab import hindcast_archive +from ._vocab import hindcast_fdc_archive +from ._vocab import read_drain_table +from ._vocab import guess_hindcast_path +__all__ = ['gis_tables', 'hindcast', 'scaffold_workdir'] -def guess_hist_sim_path(workdir: str) -> str: - return glob.glob(os.path.join(workdir, 'data_inputs', '*.nc*'))[0] + +def gis_tables(workdir: str, gauge_gis: str = None, drain_gis: str = None) -> None: + """ + Generate copies of the drainage line attribute tables in parquet format using the Saber package vocabulary + + Args: + workdir: path to the working directory for the project + gauge_gis: path to the GIS dataset (e.g. geopackage) for the gauge locations (points) + drain_gis: path to the GIS dataset (e.g. 
geopackage) for the drainage line locations (polylines) + + Returns: + None + """ + if gauge_gis is not None: + gdf = gpd.read_file(gauge_gis).drop('geometry', axis=1) + gdf[['x', 'y']] = gdf.geometry.apply(lambda x: [x.x, x.y]) + pd.DataFrame(gdf.drop('geometry', axis=1)).to_parquet( + os.path.join(workdir, 'gis_inputs', 'gauge_table.parquet')) + if drain_gis is not None: + gdf = gpd.read_file(drain_gis).drop('geometry', axis=1) + gdf[['x', 'y']] = gdf.geometry.apply(lambda x: [x.x, x.y]) + pd.DataFrame(gdf.drop('geometry', axis=1)).to_parquet( + os.path.join(workdir, 'gis_inputs', 'drain_table.parquet')) + return -def historical_simulation(workdir: str, hist_nc_path: str = None) -> None: +def hindcast(workdir: str, hind_nc_path: str = None, ) -> None: """ - Creates sim-fdc.csv and sim_time_series.pickle in workdir/data_processed for geoglows historical simulation data + Creates sim-fdc.parquet and sim_time_series.parquet in workdir/data_processed for geoglows historical simulation data Args: workdir: path to the working directory for the project - hist_nc_path: path to the historical simulation netcdf if not located at workdir/data_simulated/*.nc + hind_nc_path: path to the hindcast or historical simulation netcdf if not located at workdir/data_simulated/*.nc Returns: None """ - if hist_nc_path is None: - hist_nc_path = guess_hist_sim_path(workdir) + if hind_nc_path is None: + hind_nc_path = guess_hindcast_path(workdir) # read the assignments table - drain_table = pd.read_csv(os.path.join(workdir, 'gis_inputs', 'drain_table.csv')) + drain_table = read_drain_table(workdir) model_ids = list(set(sorted(drain_table[mid_col].tolist()))) - # open the historical data netcdf file - hist_nc = xr.open_dataset(hist_nc_path) - - # start dataframes for the flow duration curve (fdc) and the monthly averages (ma) using the first comid in the list - first_id = model_ids.pop(0) - first_data = hist_nc.sel(rivid=first_id).Qout.to_dataframe()['Qout'] - fdc_df = compute_fdc(first_data.tolist(), col_name=first_id) - # ma_df = first_data.groupby(first_data.index.strftime('%m')).mean().to_frame(name=first_id) - - # for each remaining stream ID in the list, merge/append the fdc and ma with the previously created dataframes - for model_id in model_ids: - data = hist_nc.sel(rivid=model_id).Qout.to_dataframe()['Qout'] - fdc_df = fdc_df.merge(compute_fdc(data.tolist(), col_name=model_id), - how='outer', left_index=True, right_index=True) - - fdc_df.to_csv(os.path.join(workdir, 'data_processed', 'sim-fdc.csv')) - - # create the time series table of simulated flow values - coords = drain_table[[mid_col]] - coords.loc[:, 'time'] = None - coords = coords[['time', 'model_id']].values.tolist() - ts = grids.TimeSeries([hist_nc_path, ], 'Qout', ('time', 'rivid')) - ts = ts.multipoint(*coords) - ts.set_index('datetime', inplace=True) - ts.index = pd.to_datetime(ts.index, unit='s') - ts.columns = drain_table[mid_col].values.flatten() - ts.index = ts.index.tz_localize('UTC') - ts.to_pickle(os.path.join(workdir, 'data_processed', 'sim_time_series.pickle') -) + # read the hindcast netcdf and convert to dataframe + hnc = nc.Dataset(hind_nc_path) + ids = pd.Series(hnc['rivid'][:]) + ids_selector = ids.isin(model_ids) + df = pd.DataFrame( + hnc['Qout'][:, ids_selector], + columns=ids[ids_selector].astype(str), + index=pd.to_datetime(hnc.variables['time'][:], unit='s') + ) + df.index.name = 'datetime' + df.to_parquet(os.path.join(workdir, 'data_processed', 'hindcast_series_table.parquet.gzip'), compression='gzip') + + exceed_prob = 
np.linspace(0, 100, 401)[::-1] + df = df.apply(lambda x: np.transpose(np.nanpercentile(x, exceed_prob))) + df.index = exceed_prob + df.index.name = 'exceed_prob' + df.to_parquet(os.path.join(workdir, 'data_processed', 'hindcast_fdc_table.parquet.gzip'), compression='gzip') return def scaffold_workdir(path: str, include_validation: bool = True) -> None: """ - Creates the correct directories for an RBC project within the a specified directory + Creates the correct directories for a Saber project within the specified directory Args: path: the path to a directory where you want to create directories - include_validation: boolean, indicates whether or not to create the validation folder + include_validation: boolean, indicates whether to create the validation folder Returns: None diff --git a/saber/utils.py b/saber/utils.py index 7dd3b80..3732dc1 100644 --- a/saber/utils.py +++ b/saber/utils.py @@ -42,12 +42,15 @@ def compute_fdc(flows: np.array, steps: int = 500, exceed: bool = True, col_name return pd.DataFrame(flows, columns=[col_name, ], index=percentiles) -def compute_scalar_fdc(first_fdc, second_fdc): - first_fdc = compute_fdc(first_fdc) - second_fdc = compute_fdc(second_fdc) - ratios = np.divide(first_fdc['flow'].values.flatten(), second_fdc['flow'].values.flatten()) - columns = (first_fdc.columns[0], 'Scalars') - scalars_df = pd.DataFrame(np.transpose([first_fdc.values[:, 0], ratios]), columns=columns) +def compute_scalar_fdc(sim_fdc: pd.DataFrame, obs_fdc: pd.DataFrame): + sim_fdc = compute_fdc(sim_fdc) + obs_fdc = compute_fdc(obs_fdc) + ratios = np.divide(sim_fdc['flow'].values.flatten(), obs_fdc['flow'].values.flatten()) + columns = (sim_fdc.columns[0], 'p_exceed', 'scalars') + scalars_df = pd.DataFrame( + np.transpose([sim_fdc.values[:, 0], sim_fdc['p_exceed'].values.flatten(), ratios]), + columns=columns + ) scalars_df.replace(np.inf, np.nan, inplace=True) scalars_df.dropna(inplace=True) From 15c837e0e9e499acc005bebb32ec3db9326b1236 Mon Sep 17 00:00:00 2001 From: Riley Hales Date: Mon, 15 Aug 2022 11:57:15 -0600 Subject: [PATCH 02/24] improve paths --- saber/_calibrate.py | 4 ++-- saber/_vocab.py | 32 +++++++++++++++++++++++++------- saber/_workflow.py | 5 ----- saber/prep.py | 23 +++++++++++------------ saber/table.py | 17 ++++++++++------- 5 files changed, 48 insertions(+), 33 deletions(-) diff --git a/saber/_calibrate.py b/saber/_calibrate.py index 7ead055..4b848dd 100644 --- a/saber/_calibrate.py +++ b/saber/_calibrate.py @@ -15,7 +15,7 @@ from ._vocab import asgn_gid_col from ._vocab import metric_list from ._vocab import metric_nc_name_list -from ._vocab import hindcast_archive +from ._vocab import hindcast_table from ._vocab import cal_nc_name @@ -177,7 +177,7 @@ def calibrate_region(workdir: str, assign_table: pd.DataFrame, obs_data_dir = os.path.join(workdir, 'data_inputs', 'obs_csvs') bcs_nc_path = os.path.join(workdir, cal_nc_name) - ts = pd.read_pickle(os.path.join(workdir, 'data_processed', hindcast_archive)) + ts = pd.read_pickle(os.path.join(workdir, 'data_processed', hindcast_table)) # create the new netcdf bcs_nc = nc.Dataset(bcs_nc_path, 'w') diff --git a/saber/_vocab.py b/saber/_vocab.py index dcde03e..db471f4 100644 --- a/saber/_vocab.py +++ b/saber/_vocab.py @@ -14,16 +14,18 @@ order_col = 'stream_order' # scaffolded folders -gis_inp = 'gis_inputs' +tables = 'tables' +data_in = 'data_inputs' gis_out = 'gis_outputs' +kmeans_out = 'kmeans_outputs' # name of some files produced by the algorithm cluster_count_file = 'best-fit-cluster-count.json' cal_nc_name = 
'calibrated_simulated_flow.nc' -hindcast_archive = 'hindcast.parquet' -hindcast_fdc_archive = 'hindcast_fdc.parquet' -drain_table = 'drain_table.parquet' -gauge_table = 'gauge_table.parquet' +hindcast_table = 'hindcast_series_table.parquet.gzip' +hindcast_fdc_table = 'hindcast_fdc_table.parquet.gzip' +drain_table = 'drain_table.parquet.gzip' +gauge_table = 'gauge_table.parquet.gzip' # metrics computed on validation sets metric_list = ['ME', 'MAE', 'RMSE', 'MAPE', 'NSE', 'KGE (2012)'] @@ -31,8 +33,24 @@ def read_drain_table(workdir: str) -> pd.DataFrame: - return pd.read_parquet(os.path.join(workdir, gis_inp, drain_table)) + return pd.read_parquet(os.path.join(workdir, tables, drain_table)) + + +def read_gauge_table(workdir: str) -> pd.DataFrame: + return pd.read_parquet(os.path.join(workdir, tables, gauge_table)) + + +# todo: make everything in prep and other modules use this function for getting table paths +def get_table_path(workdir: str, table_name: str) -> str: + if table_name == 'hindcast_series': + return os.path.join(workdir, data_in, hindcast_table) + elif table_name == 'hindcast_fdc': + return os.path.join(workdir, data_in, hindcast_fdc_table) + elif table_name == 'drain_table': + return os.path.join(workdir, tables, drain_table) + elif table_name == 'gauge_table': + return os.path.join(workdir, tables, gauge_table) def guess_hindcast_path(workdir: str) -> str: - return glob.glob(os.path.join(workdir, 'data_inputs', '*.nc*'))[0] + return glob.glob(os.path.join(workdir, data_in, '*.nc*'))[0] diff --git a/saber/_workflow.py b/saber/_workflow.py index 0a97402..494b693 100644 --- a/saber/_workflow.py +++ b/saber/_workflow.py @@ -9,11 +9,6 @@ from ._calibrate import calibrate_region -def prep_region(workdir: str) -> None: - hindcast(workdir) - return - - def analyze_region(workdir: str, drain_shape: str, gauge_table: pd.DataFrame = None, obs_data_dir: str = None) -> None: # Generate the assignments table print("gen assign_table") diff --git a/saber/prep.py b/saber/prep.py index 3d09c6a..81478cd 100755 --- a/saber/prep.py +++ b/saber/prep.py @@ -9,8 +9,8 @@ from .utils import compute_fdc from ._vocab import mid_col -from ._vocab import hindcast_archive -from ._vocab import hindcast_fdc_archive +from ._vocab import hindcast_table +from ._vocab import hindcast_fdc_table from ._vocab import read_drain_table from ._vocab import guess_hindcast_path @@ -33,18 +33,19 @@ def gis_tables(workdir: str, gauge_gis: str = None, drain_gis: str = None) -> No gdf = gpd.read_file(gauge_gis).drop('geometry', axis=1) gdf[['x', 'y']] = gdf.geometry.apply(lambda x: [x.x, x.y]) pd.DataFrame(gdf.drop('geometry', axis=1)).to_parquet( - os.path.join(workdir, 'gis_inputs', 'gauge_table.parquet')) + os.path.join(workdir, 'tables', 'gauge_table.parquet.gzip'), compression='gzip') if drain_gis is not None: gdf = gpd.read_file(drain_gis).drop('geometry', axis=1) gdf[['x', 'y']] = gdf.geometry.apply(lambda x: [x.x, x.y]) pd.DataFrame(gdf.drop('geometry', axis=1)).to_parquet( - os.path.join(workdir, 'gis_inputs', 'drain_table.parquet')) + os.path.join(workdir, 'tables', 'drain_table.parquet.gzip'), compression='gzip') return def hindcast(workdir: str, hind_nc_path: str = None, ) -> None: """ - Creates sim-fdc.parquet and sim_time_series.parquet in workdir/data_processed for geoglows historical simulation data + Creates hindcast_series_table.parquet.gzip and hindcast_fdc_table.parquet.gzip in the workdir/tables directory + for the GEOGloWS hindcast data Args: workdir: path to the working directory for the project @@ 
-70,13 +71,13 @@ def hindcast(workdir: str, hind_nc_path: str = None, ) -> None: index=pd.to_datetime(hnc.variables['time'][:], unit='s') ) df.index.name = 'datetime' - df.to_parquet(os.path.join(workdir, 'data_processed', 'hindcast_series_table.parquet.gzip'), compression='gzip') + df.to_parquet(os.path.join(workdir, 'tables', 'hindcast_series_table.parquet.gzip'), compression='gzip') exceed_prob = np.linspace(0, 100, 401)[::-1] df = df.apply(lambda x: np.transpose(np.nanpercentile(x, exceed_prob))) df.index = exceed_prob df.index.name = 'exceed_prob' - df.to_parquet(os.path.join(workdir, 'data_processed', 'hindcast_fdc_table.parquet.gzip'), compression='gzip') + df.to_parquet(os.path.join(workdir, 'tables', 'hindcast_fdc_table.parquet.gzip'), compression='gzip') return @@ -93,12 +94,10 @@ def scaffold_workdir(path: str, include_validation: bool = True) -> None: """ if not os.path.isdir(path): os.mkdir(path) - os.mkdir(os.path.join(path, 'data_inputs')) - os.mkdir(os.path.join(path, 'data_processed')) - os.mkdir(os.path.join(path, 'gis_inputs')) + os.mkdir(os.path.join(path, 'tables')) + os.mkdir(os.path.join(path, 'inputs')) os.mkdir(os.path.join(path, 'gis_outputs')) - os.mkdir(os.path.join(path, 'kmeans_models')) - os.mkdir(os.path.join(path, 'kmeans_images')) + os.mkdir(os.path.join(path, 'kmeans_outputs')) if include_validation: os.mkdir(os.path.join(path, 'validation_runs')) return diff --git a/saber/table.py b/saber/table.py index 076da48..01d9c9b 100644 --- a/saber/table.py +++ b/saber/table.py @@ -21,7 +21,7 @@ def get_path(workdir: str) -> str: Returns: absolute path of the assign_table """ - return os.path.join(workdir, 'assign_table.csv') + return os.path.join(workdir, 'assign_table.parquet.gzip') def gen(workdir) -> pd.DataFrame: @@ -35,8 +35,8 @@ def gen(workdir) -> pd.DataFrame: None """ # read and merge the tables - drain_df = pd.read_csv(os.path.join(workdir, 'gis_inputs', 'drain_table.csv')) - gauge_df = pd.read_csv(os.path.join(workdir, 'gis_inputs', 'gauge_table.csv')) + drain_df = pd.read_parquet(os.path.join(workdir, 'tables', 'drain_table.parquet.gzip')) + gauge_df = pd.read_parquet(os.path.join(workdir, 'tables', 'gauge_table.parquet.gzip')) assign_table = pd.merge(drain_df, gauge_df, on=mid_col, how='outer') # create the new columns @@ -44,6 +44,9 @@ def gen(workdir) -> pd.DataFrame: assign_table[asgn_gid_col] = np.nan assign_table[reason_col] = np.nan + # cache the table + cache(workdir, assign_table) + return assign_table @@ -57,13 +60,13 @@ def read(workdir: str) -> pd.DataFrame: Returns: assign_table pandas.DataFrame """ - return pd.read_csv(get_path(workdir)) + return pd.read_parquet(get_path(workdir)) def cache(workdir: str, assign_table: pd.DataFrame) -> None: """ - Saves the pandas dataframe to a csv in the proper place in the project directory - A shortcut for pd.DataFrame.to_csv so you don't have to code it all the time + Saves the pandas dataframe to a parquet in the proper place in the project directory + A shortcut for pd.DataFrame.to_parquet so you don't have to code the paths every where Args: workdir: the project directory path @@ -72,6 +75,6 @@ def cache(workdir: str, assign_table: pd.DataFrame) -> None: Returns: None """ - assign_table.to_csv(get_path(workdir), index=False) + assign_table.to_parquet(get_path(workdir), compression='gzip') return From 8b96df341e9955dcea551d8e5a047863c4a1a14c Mon Sep 17 00:00:00 2001 From: rileyhales Date: Thu, 18 Aug 2022 14:56:42 -0600 Subject: [PATCH 03/24] misc code clean up --- conda-environment.yml | 7 +++++- 
environment.yml | 16 ------------- saber/__init__.py | 4 ++-- saber/_vocab.py | 5 ++-- saber/analysis.py | 1 - saber/cluster.py | 53 ++++++++++++++++++++++++------------------- saber/gis.py | 2 +- saber/prep.py | 13 ++++------- setup.py | 2 +- 9 files changed, 47 insertions(+), 56 deletions(-) delete mode 100644 environment.yml diff --git a/conda-environment.yml b/conda-environment.yml index ff491bd..ec661ed 100644 --- a/conda-environment.yml +++ b/conda-environment.yml @@ -7,9 +7,14 @@ dependencies: - geopandas - pandas - numpy + - numba - contextily - - xarray - netCDF4 - tslearn - pyarrow - kneed + - tslearn + - matplotlib + - geoglows + - requests + - scipy diff --git a/environment.yml b/environment.yml deleted file mode 100644 index f58ed11..0000000 --- a/environment.yml +++ /dev/null @@ -1,16 +0,0 @@ -name: bias -channels: - - conda-forge -dependencies: - - python>=3 - - pandas - - geopandas - - numpy - - xarray - - tslearn - - matplotlib - - xarray - - geoglows - - requests - - plotly - - scipy \ No newline at end of file diff --git a/saber/__init__.py b/saber/__init__.py index 8d13a4f..3024c69 100644 --- a/saber/__init__.py +++ b/saber/__init__.py @@ -1,4 +1,4 @@ -from saber._workflow import prep_region, analyze_region +from saber._workflow import analyze_region from saber._calibrate import calibrate, calibrate_region import saber.table @@ -13,7 +13,7 @@ import saber._vocab -__all__ = ['prep_region', 'analyze_region', +__all__ = ['analyze_region', 'calibrate', 'calibrate_region', 'table', 'prep', 'assign', 'gis', 'cluster', 'utils', 'validate', 'analysis'] __author__ = 'Riley Hales' diff --git a/saber/_vocab.py b/saber/_vocab.py index db471f4..44c4f3c 100644 --- a/saber/_vocab.py +++ b/saber/_vocab.py @@ -40,12 +40,11 @@ def read_gauge_table(workdir: str) -> pd.DataFrame: return pd.read_parquet(os.path.join(workdir, tables, gauge_table)) -# todo: make everything in prep and other modules use this function for getting table paths def get_table_path(workdir: str, table_name: str) -> str: if table_name == 'hindcast_series': - return os.path.join(workdir, data_in, hindcast_table) + return os.path.join(workdir, tables, hindcast_table) elif table_name == 'hindcast_fdc': - return os.path.join(workdir, data_in, hindcast_fdc_table) + return os.path.join(workdir, tables, hindcast_fdc_table) elif table_name == 'drain_table': return os.path.join(workdir, tables, drain_table) elif table_name == 'gauge_table': diff --git a/saber/analysis.py b/saber/analysis.py index 56a0f40..582fa3d 100644 --- a/saber/analysis.py +++ b/saber/analysis.py @@ -1,6 +1,5 @@ import os -# import grids import pandas as pd import plotly.graph_objects as go diff --git a/saber/cluster.py b/saber/cluster.py index cf6f5b7..2bfb125 100755 --- a/saber/cluster.py +++ b/saber/cluster.py @@ -7,7 +7,7 @@ import matplotlib.pyplot as plt import numpy as np import pandas as pd -from tslearn.clustering import TimeSeriesKMeans +from tslearn.clustering import KShape from tslearn.preprocessing import TimeSeriesScalerMeanVariance from ._vocab import cluster_count_file @@ -17,7 +17,7 @@ def generate(workdir: str) -> None: """ - Trains DTW kmeans model pickle files and plots of the results saved as png images + Trains kmeans clustering models, saves the model as pickle, generates images and supplementary files Args: workdir: path to the project directory @@ -27,51 +27,58 @@ def generate(workdir: str) -> None: """ best_fit = {} - for table in glob.glob(os.path.join(workdir, 'data_processed', '*_fdc_*.parquet.gzip')): + for table in 
glob.glob(os.path.join(workdir, 'tables', '*_fdc_*.parquet.gzip')): # read the data and transform time_series = pd.read_parquet(table) time_series = np.transpose(time_series.values) time_series = TimeSeriesScalerMeanVariance().fit_transform(time_series) # get the name of the data and start tracking stats - dataset = os.path.splitext(os.path.basename(table))[0] - inertia = {'number': [], 'inertia': []} + dataset = os.path.basename(table).split('.')[0] + inertia = {'number': [], 'inertia': [], 'n_iter': []} - for num_cluster in range(2, 16): + for n_clusters in range(1, 17): + print(n_clusters) # build the kmeans model - km = TimeSeriesKMeans(n_clusters=num_cluster, random_state=0) - km.fit_predict(time_series) - inertia['number'].append(num_cluster) - inertia['inertia'].append(km.inertia_) + ks = KShape(n_clusters=n_clusters, random_state=0, max_iter=150) + ks.fit_predict(time_series) + inertia['number'].append(n_clusters) + inertia['inertia'].append(ks.inertia_) + inertia['n_iter'].append(ks.n_iter_) # save the trained model - km.to_pickle(os.path.join(workdir, 'kmeans_models', f'{dataset}-{num_cluster}-clusters-model.pickle')) + ks.to_pickle(os.path.join(workdir, 'kmeans_outputs', f'{dataset}-kshape-{n_clusters}_clusters-model.pickle')) # generate a plot of the clusters size = time_series.shape[1] - fig = plt.figure(figsize=(30, 15), dpi=450) - fig.suptitle("Dynamic Time Warping Clustering") - assigned_clusters = km.labels_ - for i in range(num_cluster): - plt.subplot(2, math.ceil(num_cluster / 2), i + 1) + # up to 3 cols, rows determined by number of columns + n_cols = min(n_clusters, 3) + n_rows = math.ceil(n_clusters / n_cols) + img_size = 2.5 + fig = plt.figure(figsize=(img_size * n_cols, img_size * n_rows), dpi=750) + + fig.suptitle("KShape Clustering") + assigned_clusters = ks.labels_ + for i in range(n_clusters): + plt.subplot(n_rows, n_cols, i + 1) for j in time_series[assigned_clusters == i]: plt.plot(j.ravel(), "k-", alpha=.2) - plt.plot(km.cluster_centers_[i].ravel(), "r-") + plt.plot(ks.cluster_centers_[i].ravel(), "r-") plt.xlim(0, size) - plt.ylim(-3.5, 3.5) - plt.text(0.55, 0.85, f'Cluster {i}', transform=plt.gca().transAxes) + plt.ylim(-2, 4) + plt.text(0.55, 0.85, f'Cluster {i + 1}', transform=plt.gca().transAxes) plt.tight_layout() - fig.savefig(os.path.join(workdir, 'kmeans_images', f'{dataset}-{num_cluster}-clusters.png')) + fig.savefig(os.path.join(workdir, 'kmeans_outputs', f'{dataset}-kshape-{n_clusters}_clusters.png')) plt.close(fig) # save the inertia results as a csv - pd.DataFrame.from_dict(inertia).to_csv(os.path.join(workdir, 'kmeans_models', f'{dataset}-inertia.csv')) + pd.DataFrame.from_dict(inertia).to_csv(os.path.join(workdir, 'kmeans_outputs', f'{dataset}-inertia.csv')) # find the knee/elbow knee = kneed.KneeLocator(inertia['number'], inertia['inertia'], curve='convex', direction='decreasing').knee best_fit[dataset] = int(knee) # save the best fitting cluster counts to a csv - with open(os.path.join(workdir, 'kmeans_models', cluster_count_file), 'w') as f: + with open(os.path.join(workdir, 'kmeans_outputs', cluster_count_file), 'w') as f: f.write(json.dumps(best_fit)) return @@ -102,7 +109,7 @@ def summarize(workdir: str, assign_table: pd.DataFrame) -> pd.DataFrame: # open the optimal model pickle file optimal_model = os.path.join(workdir, 'kmeans_models', f'{dataset}-{cluster_count}-clusters-model.pickle') - sim_labels = TimeSeriesKMeans.from_pickle(optimal_model).labels_.tolist() + sim_labels = KShape.from_pickle(optimal_model).labels_.tolist() # create 
a dataframe of the ids and their labels (assigned groups) df = pd.DataFrame(np.transpose([sim_labels, ids]), columns=[f'{dataset}-cluster', merge_col]) diff --git a/saber/gis.py b/saber/gis.py index 40a2759..78512f1 100644 --- a/saber/gis.py +++ b/saber/gis.py @@ -28,7 +28,7 @@ def generate_all(workdir: str, assign_table: pd.DataFrame, drain_shape: str, pre workdir: the path to the working directory for the project assign_table: the assign_table dataframe drain_shape: path to a drainage line shapefile which can be clipped - prefix: a prefix for names of the outputs to distinguish between data generated at separate instances + prefix: a prefix for names of the outputs to distinguish between data generated in separate instances id_column: name of the id column in the attributes of the shape table Returns: diff --git a/saber/prep.py b/saber/prep.py index 81478cd..1de954d 100755 --- a/saber/prep.py +++ b/saber/prep.py @@ -1,18 +1,14 @@ import os -import grids import pandas as pd import geopandas as gpd import netCDF4 as nc import numpy as np -from .utils import compute_fdc - from ._vocab import mid_col -from ._vocab import hindcast_table -from ._vocab import hindcast_fdc_table from ._vocab import read_drain_table from ._vocab import guess_hindcast_path +from ._vocab import get_table_path __all__ = ['gis_tables', 'hindcast', 'scaffold_workdir'] @@ -70,14 +66,15 @@ def hindcast(workdir: str, hind_nc_path: str = None, ) -> None: columns=ids[ids_selector].astype(str), index=pd.to_datetime(hnc.variables['time'][:], unit='s') ) + df = df[df.index.year >= 1980] df.index.name = 'datetime' - df.to_parquet(os.path.join(workdir, 'tables', 'hindcast_series_table.parquet.gzip'), compression='gzip') + df.to_parquet(get_table_path(workdir, 'hindcast_series'), compression='gzip') - exceed_prob = np.linspace(0, 100, 401)[::-1] + exceed_prob = np.linspace(0, 100, 201)[::-1] df = df.apply(lambda x: np.transpose(np.nanpercentile(x, exceed_prob))) df.index = exceed_prob df.index.name = 'exceed_prob' - df.to_parquet(os.path.join(workdir, 'tables', 'hindcast_fdc_table.parquet.gzip'), compression='gzip') + df.to_parquet(get_table_path(workdir, 'hindcast_fdc'), compression='gzip') return diff --git a/setup.py b/setup.py index 4949ea8..e04a608 100644 --- a/setup.py +++ b/setup.py @@ -32,4 +32,4 @@ ], python_requires=">=3", install_requires=install_requires -) \ No newline at end of file +) From 98c0e18e8d38ba5e423463973ea86af9084dea1c Mon Sep 17 00:00:00 2001 From: rileyhales Date: Thu, 18 Aug 2022 14:57:26 -0600 Subject: [PATCH 04/24] increment version number --- saber/__init__.py | 2 +- setup.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/saber/__init__.py b/saber/__init__.py index 3024c69..0303a83 100644 --- a/saber/__init__.py +++ b/saber/__init__.py @@ -17,5 +17,5 @@ 'calibrate', 'calibrate_region', 'table', 'prep', 'assign', 'gis', 'cluster', 'utils', 'validate', 'analysis'] __author__ = 'Riley Hales' -__version__ = '0.3.0' +__version__ = '0.4.0' __license__ = 'BSD 3 Clause Clear' diff --git a/setup.py b/setup.py index e04a608..6ef3807 100644 --- a/setup.py +++ b/setup.py @@ -7,7 +7,7 @@ install_requires = req.read().splitlines() description = 'tools for hydrological bias correction on large models' -version = '0.3.0' +version = '0.4.0' setup( name='saber', From b3d028a56d010f9451b015db6cf3dfe24066284a Mon Sep 17 00:00:00 2001 From: rileyhales Date: Thu, 18 Aug 2022 15:00:12 -0600 Subject: [PATCH 05/24] rename distribution saberbc --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 
deletion(-) diff --git a/setup.py b/setup.py index 6ef3807..546cf3b 100644 --- a/setup.py +++ b/setup.py @@ -10,7 +10,7 @@ version = '0.4.0' setup( - name='saber', + name='saberbc', packages=['saber'], version=version, description=description, From 61c41f3fd2abcf9805018b156d0b9ab11f4c122a Mon Sep 17 00:00:00 2001 From: rileyhales Date: Sat, 20 Aug 2022 21:20:04 -0600 Subject: [PATCH 06/24] partial commit --- README.md | 76 +++++++++++++---------------- requirements.txt | 19 ++++---- saber/cluster.py | 122 +++++++++++++++++++++++------------------------ saber/prep.py | 20 ++++++-- setup.py | 2 +- 5 files changed, 119 insertions(+), 120 deletions(-) diff --git a/README.md b/README.md index dfde9bd..9951d5f 100755 --- a/README.md +++ b/README.md @@ -1,69 +1,57 @@ # Stream Analysis for Bias Estimation and Reduction ## Theory -Basins and streams will be used interchangeably to refer to the specific stream subunit. - -Large scale hydrologic models are generally not strongly physically based because of the extreme amounts of input data, -computing power, and computing time that would be required to run it. The model uses the results of other physically based -models. This is advantageous to us because we rely on the land surface model's behavior of mapping basins with similar -hydrologically important geophysical physical characteristics to similar discharge forecasts. - -This method relies on spatial analysis, machine learning, and statistical refinements. This is a post processing step which -makes it applicable to many models when the source data, code, or other inputs make the model inaccessible or impractical -to tweak. We do this in an attempt to avoid requiring the source model data or the ability to run the model. -Both of those conditions are not always available or practical when dealing with large scale models or datasets. +Large scale hydrological models are often biased despite the best calibration efforts. Bias correction is a pre- or +post-processing step applied to the model's inputs or outputs to empirically alter datasets and improve model performance. +SABER is a bias correction post processor for discharge predictions. SABER builds on frequency matching ideas used in several +other methods that match biased predictions with observed data with corresponding exceedance probabilities; that is, +using the flow duration curve. SABER expands this approach to ungauged stream reaches using spatial analysis, clustering +(machine learning), and statistical refinements and a new concept- the scalar flow duration curve. ## Python environment See requirements.txt -- python >= 3.7 -- numpy -- pandas -- geopandas (https://github.com/geopandas/geopandas) -- scikit-learn -- tslearn (https://github.com/tslearn-team/tslearn) ## Required Data/Inputs -You need the following data to follow this procedure. The instructions list Shapefile but other vector spatial data -file formats are acceptable -1. Shapefiles for the Drainage Lines (lines) and Catchments (polygons) in the watershed as used by the hydrological model. - The attribute table should contain (at least) the following entries for each feature: - - (Both) An identifier number - - (Both) The ID of the next segment/catchment downstream - - (Drainage lines, preferably both) The stream order of each segment - - (Catchments, preferably both) Cumulative upstream drainage area - - (Drainage lines) the x coordinate of the centroid of each feature - - (Drainage lines) the y coordinate of the centroid of each feature -2. 
Shapefile of points representing the location of each of the river gauging stations available. - The attribute table should contain (at least) the following entries for each point - - A unique ID number or name assigned to the gauge, preferably numeric. Randomly generate unique numeric ID's if they don't exist. +You need the following data to follow this procedure. Geopackage format GIS data may be substituted with Shapefile or +equivalent formats that can be read by `geopandas`. +1. Geopackage drainage Lines (usually delineated center lines) and catchments/subbasins (polygons) in the watershed. The + attribute tables for both should contain (at least) the following entries for each feature: + - An identifier column (alphanumeric) labeled `model_id`. + - The ID of the next downstream reach/subbasin + - The stream order of each reach/subbasin + - Cumulative upstream drainage area + - The x coordinate of the centroid of each feature (precalculated for faster results later) + - The y coordinate of the centroid of each feature (precalculated for faster results later) +2. Geopackage points representing the location of each of the river gauging stations available. The attribute table + should contain (at least) the following entries for each point + - A unique ID number or name assigned to the gauge, preferably short alphanumeric. You can randomly generate them if convenient - The ID of the stream segment in the model which corresponds to that gauge. -3. Shapefile of the boundary (polygon) of the target region for bias calibration. -4. The Shapefiles for the Drainage Lines, Catchments, Gauges/Stations, and boundary are all in the same projection. -5. Historical simulated discharge for each stream segment and for as long (temporally) as is available. -6. Observed discharge data for as many stream reaches as possible within the target region. -7. The units of the simulation and observation data must be in the same units. -8. A working directory on the computer where the scripts are going to be run. +3. Hindcast/historical simulation discharge for each stream reach +4. Observed discharge data for each gauge + +Things to check when preparing your data +1. The units of the simulated and observed data are in the same units +2. The GIS datasets are all in the same projection. +3. The GIS datasets should only contain gauges and reaches/subbasins within the area of interest. Clip/delete anything + else. You might find it helpful to keep a watershed boundary geopackage. ## Process ### 1 Create a Working Directory ```python -import saber as saber +import saber -path_to_working_directory = '/my/file/path' -saber.prep.scaffold_workdir(path_to_working_directory) +workdir = '/my/file/path' +saber.prep.scaffold_workdir(workdir) ``` Your working directory should exactly like this. 
``` working_directory - kmeans_models/ - kmeans_images/ data_inputs/ - data_processed/ - gis_inputs/ + tables/ + kmeans_outputs/ gis_outputs/ - validation_sets/ ``` ### 2 Prepare Spatial Data (scripts not provided) diff --git a/requirements.txt b/requirements.txt index 4c71968..b833ca8 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,17 +1,14 @@ -pandas +python=3 geopandas +pandas numpy -xarray +numba +contextily netCDF4 tslearn +pyarrow +kneed +tslearn matplotlib -geoglows -grids requests -plotly -h5py -scipy -hydrostats -kneed -hydrostats -matplotlib \ No newline at end of file +scipy \ No newline at end of file diff --git a/saber/cluster.py b/saber/cluster.py index 2bfb125..0836748 100755 --- a/saber/cluster.py +++ b/saber/cluster.py @@ -8,7 +8,6 @@ import numpy as np import pandas as pd from tslearn.clustering import KShape -from tslearn.preprocessing import TimeSeriesScalerMeanVariance from ._vocab import cluster_count_file from ._vocab import mid_col @@ -25,82 +24,83 @@ def generate(workdir: str) -> None: Returns: None """ - best_fit = {} - - for table in glob.glob(os.path.join(workdir, 'tables', '*_fdc_*.parquet.gzip')): - # read the data and transform - time_series = pd.read_parquet(table) - time_series = np.transpose(time_series.values) - time_series = TimeSeriesScalerMeanVariance().fit_transform(time_series) - - # get the name of the data and start tracking stats - dataset = os.path.basename(table).split('.')[0] - inertia = {'number': [], 'inertia': [], 'n_iter': []} - - for n_clusters in range(1, 17): - print(n_clusters) - # build the kmeans model - ks = KShape(n_clusters=n_clusters, random_state=0, max_iter=150) - ks.fit_predict(time_series) - inertia['number'].append(n_clusters) - inertia['inertia'].append(ks.inertia_) - inertia['n_iter'].append(ks.n_iter_) - - # save the trained model - ks.to_pickle(os.path.join(workdir, 'kmeans_outputs', f'{dataset}-kshape-{n_clusters}_clusters-model.pickle')) - - # generate a plot of the clusters - size = time_series.shape[1] - # up to 3 cols, rows determined by number of columns - n_cols = min(n_clusters, 3) - n_rows = math.ceil(n_clusters / n_cols) - img_size = 2.5 - fig = plt.figure(figsize=(img_size * n_cols, img_size * n_rows), dpi=750) - - fig.suptitle("KShape Clustering") - assigned_clusters = ks.labels_ - for i in range(n_clusters): - plt.subplot(n_rows, n_cols, i + 1) - for j in time_series[assigned_clusters == i]: - plt.plot(j.ravel(), "k-", alpha=.2) - plt.plot(ks.cluster_centers_[i].ravel(), "r-") - plt.xlim(0, size) - plt.ylim(-2, 4) - plt.text(0.55, 0.85, f'Cluster {i + 1}', transform=plt.gca().transAxes) - plt.tight_layout() - fig.savefig(os.path.join(workdir, 'kmeans_outputs', f'{dataset}-kshape-{n_clusters}_clusters.png')) - plt.close(fig) - - # save the inertia results as a csv - pd.DataFrame.from_dict(inertia).to_csv(os.path.join(workdir, 'kmeans_outputs', f'{dataset}-inertia.csv')) - # find the knee/elbow - knee = kneed.KneeLocator(inertia['number'], inertia['inertia'], curve='convex', direction='decreasing').knee - best_fit[dataset] = int(knee) - - # save the best fitting cluster counts to a csv - with open(os.path.join(workdir, 'kmeans_outputs', cluster_count_file), 'w') as f: - f.write(json.dumps(best_fit)) + labels = [] + inertia = {'number': [], 'inertia': [], 'n_iter': []} + + # read the prepared data (array x) + x = pd.read_parquet(os.path.join(workdir, 'tables', '')) + + for n_clusters in range(1, 17): + print(n_clusters) + # build the kmeans model + ks = KShape(n_clusters=n_clusters, random_state=0, 
max_iter=150) + ks.fit_predict(x) + inertia['number'].append(n_clusters) + inertia['inertia'].append(ks.inertia_) + inertia['n_iter'].append(ks.n_iter_) + labels.append(ks.labels_) + + # save the inertia results as a csv + pd.DataFrame.from_dict(inertia).to_parquet( + os.path.join(workdir, 'kmeans_outputs', f'{dataset}-inertia.parquet.gzip'), compression='gzip') + + df = pd.DataFrame(labels, columns=pd.read_parquet( + os.path.join(workdir, 'tables', 'model_ids.parquet.gzip')).values.flatten()) + df['count'] = df.nunique(axis=1) + df = df.sort_values('count') + df = df.set_index('count') + df.to_parquet(os.path.join(workdir, 'kmeans_outputs', 'cluster_num_summary.parquet.gzip')) + + # # find the knee/elbow + # knee = kneed.KneeLocator(inertia['number'], inertia['inertia'], curve='convex', direction='decreasing').knee + # best_fit[dataset] = int(knee) + + # # save the best fitting cluster counts to a csv + # with open(os.path.join(workdir, 'kmeans_outputs', cluster_count_file), 'w') as f: + # f.write(json.dumps(best_fit)) return -def summarize(workdir: str, assign_table: pd.DataFrame) -> pd.DataFrame: +def plot(workdir: str): + # generate a plot of the clusters + size = time_series.shape[1] + # up to 3 cols, rows determined by number of columns + n_cols = min(n_clusters, 3) + n_rows = math.ceil(n_clusters / n_cols) + img_size = 2.5 + fig = plt.figure(figsize=(img_size * n_cols, img_size * n_rows), dpi=750) + + fig.suptitle("KShape Clustering") + assigned_clusters = ks.labels_ + for i in range(n_clusters): + plt.subplot(n_rows, n_cols, i + 1) + for j in time_series[assigned_clusters == i]: + plt.plot(j.ravel(), "k-", alpha=.2) + plt.plot(ks.cluster_centers_[i].ravel(), "r-") + plt.xlim(0, size) + plt.ylim(-2, 4) + plt.text(0.55, 0.85, f'Cluster {i + 1}', transform=plt.gca().transAxes) + plt.tight_layout() + fig.savefig(os.path.join(workdir, 'kmeans_outputs', f'{dataset}-kshape-{n_clusters}_clusters.png')) + plt.close(fig) + return + + +def summarize(workdir: str) -> pd.DataFrame: """ Creates a csv listing the streams assigned to each cluster in workdir/kmeans_models and also adds that information to assign_table.csv Args: workdir: path to the project directory - assign_table: the assign_table dataframe Returns: None """ # read the cluster results csv - with open(os.path.join(workdir, 'kmeans_models', cluster_count_file), 'r') as f: + with open(os.path.join(workdir, 'kmeans_outputs', cluster_count_file), 'r') as f: clusters = json.loads(f.read()) - assign_table[mid_col] = assign_table[mid_col].astype(int) - for dataset, cluster_count in clusters.items(): # read the list of simulated id's, pair them with their cluster label, save to df merge_col = mid_col if "sim" in dataset else gid_col diff --git a/saber/prep.py b/saber/prep.py index 1de954d..56df8d7 100755 --- a/saber/prep.py +++ b/saber/prep.py @@ -4,6 +4,7 @@ import geopandas as gpd import netCDF4 as nc import numpy as np +from tslearn.preprocessing import TimeSeriesScalerMeanVariance from ._vocab import mid_col from ._vocab import read_drain_table @@ -27,7 +28,6 @@ def gis_tables(workdir: str, gauge_gis: str = None, drain_gis: str = None) -> No """ if gauge_gis is not None: gdf = gpd.read_file(gauge_gis).drop('geometry', axis=1) - gdf[['x', 'y']] = gdf.geometry.apply(lambda x: [x.x, x.y]) pd.DataFrame(gdf.drop('geometry', axis=1)).to_parquet( os.path.join(workdir, 'tables', 'gauge_table.parquet.gzip'), compression='gzip') if drain_gis is not None: @@ -57,24 +57,38 @@ def hindcast(workdir: str, hind_nc_path: str = None, ) -> None: 
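# Illustrative, standalone sketch of the flow duration curve (FDC) preparation idea in prep.hindcast:
# evaluate a long discharge series on a fixed grid of percentiles, then standardize the result so
# rivers of very different sizes can be clustered together. `discharge` and all names below are
# made up for the example and are not part of the package.
import numpy as np
import pandas as pd

rng = np.random.default_rng(0)
discharge = pd.Series(rng.gamma(shape=2.0, scale=50.0, size=365 * 40))  # ~40 years of daily flow

exceed_prob = np.linspace(0, 100, 101)[::-1]                # probability grid from 100 down to 0
fdc = pd.Series(np.nanpercentile(discharge, exceed_prob),   # flow percentiles along that grid
                index=exceed_prob, name='flow')

fdc_z = (fdc - fdc.mean()) / fdc.std()                      # zero mean, unit variance for clustering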
drain_table = read_drain_table(workdir) model_ids = list(set(sorted(drain_table[mid_col].tolist()))) - # read the hindcast netcdf and convert to dataframe + # read the hindcast netcdf, convert to dataframe, store as parquet hnc = nc.Dataset(hind_nc_path) ids = pd.Series(hnc['rivid'][:]) ids_selector = ids.isin(model_ids) + ids = ids[ids_selector].astype(str).values.flatten() + + # save the model ids to table for reference + pd.DataFrame(ids, columns=['model_id', ]).to_parquet( + os.path.join(workdir, 'tables', 'model_ids.parquet.gzip'), compression='gzip') + + # save the hindcast series to parquet df = pd.DataFrame( hnc['Qout'][:, ids_selector], - columns=ids[ids_selector].astype(str), + columns=ids, index=pd.to_datetime(hnc.variables['time'][:], unit='s') ) df = df[df.index.year >= 1980] df.index.name = 'datetime' df.to_parquet(get_table_path(workdir, 'hindcast_series'), compression='gzip') + # calculate the FDC and save to parquet exceed_prob = np.linspace(0, 100, 201)[::-1] df = df.apply(lambda x: np.transpose(np.nanpercentile(x, exceed_prob))) df.index = exceed_prob df.index.name = 'exceed_prob' df.to_parquet(get_table_path(workdir, 'hindcast_fdc'), compression='gzip') + + # transform and prepare for clustering + df = pd.DataFrame(TimeSeriesScalerMeanVariance().fit_transform(np.transpose(df.values))) + df.index = ids + df.to_parquet(os.path.join(workdir, 'tables', 'hindcast_fdc_transformed.parquet.gzip'), compression='gzip') + return diff --git a/setup.py b/setup.py index 546cf3b..6ef3807 100644 --- a/setup.py +++ b/setup.py @@ -10,7 +10,7 @@ version = '0.4.0' setup( - name='saberbc', + name='saber', packages=['saber'], version=version, description=description, From f62ee080dac1e10e61dd89bdface98d8044b9d41 Mon Sep 17 00:00:00 2001 From: Riley Hales Date: Sat, 20 Aug 2022 22:09:56 -0600 Subject: [PATCH 07/24] split clustering functions --- saber/cluster.py | 88 +++++++++++++++++++++--------------------------- saber/prep.py | 2 +- 2 files changed, 40 insertions(+), 50 deletions(-) diff --git a/saber/cluster.py b/saber/cluster.py index 0836748..4827c8b 100755 --- a/saber/cluster.py +++ b/saber/cluster.py @@ -1,4 +1,3 @@ -import glob import math import os import json @@ -7,7 +6,7 @@ import matplotlib.pyplot as plt import numpy as np import pandas as pd -from tslearn.clustering import KShape +from tslearn.clustering import TimeSeriesKMeans from ._vocab import cluster_count_file from ._vocab import mid_col @@ -24,65 +23,29 @@ def generate(workdir: str) -> None: Returns: None """ - labels = [] inertia = {'number': [], 'inertia': [], 'n_iter': []} # read the prepared data (array x) - x = pd.read_parquet(os.path.join(workdir, 'tables', '')) + x = pd.read_parquet(os.path.join(workdir, 'tables', 'hindcast_fdc_transformed.parquet.gzip')) + # build the kmeans model for a range of cluster numbers for n_clusters in range(1, 17): - print(n_clusters) - # build the kmeans model - ks = KShape(n_clusters=n_clusters, random_state=0, max_iter=150) + ks = TimeSeriesKMeans(n_clusters=n_clusters, max_iter=150) ks.fit_predict(x) + ks.to_pickle(os.path.join(workdir, 'kmeans_outputs', f'kshape-{n_clusters}.pickle')) inertia['number'].append(n_clusters) inertia['inertia'].append(ks.inertia_) inertia['n_iter'].append(ks.n_iter_) - labels.append(ks.labels_) # save the inertia results as a csv - pd.DataFrame.from_dict(inertia).to_parquet( - os.path.join(workdir, 'kmeans_outputs', f'{dataset}-inertia.parquet.gzip'), compression='gzip') - - df = pd.DataFrame(labels, columns=pd.read_parquet( - os.path.join(workdir, 
'tables', 'model_ids.parquet.gzip')).values.flatten()) - df['count'] = df.nunique(axis=1) - df = df.sort_values('count') - df = df.set_index('count') - df.to_parquet(os.path.join(workdir, 'kmeans_outputs', 'cluster_num_summary.parquet.gzip')) - - # # find the knee/elbow - # knee = kneed.KneeLocator(inertia['number'], inertia['inertia'], curve='convex', direction='decreasing').knee - # best_fit[dataset] = int(knee) - - # # save the best fitting cluster counts to a csv - # with open(os.path.join(workdir, 'kmeans_outputs', cluster_count_file), 'w') as f: - # f.write(json.dumps(best_fit)) - return - + pd.DataFrame.from_dict(inertia).to_parquet(os.path.join(workdir, 'kmeans_outputs', f'cluster-inertia.csv')) -def plot(workdir: str): - # generate a plot of the clusters - size = time_series.shape[1] - # up to 3 cols, rows determined by number of columns - n_cols = min(n_clusters, 3) - n_rows = math.ceil(n_clusters / n_cols) - img_size = 2.5 - fig = plt.figure(figsize=(img_size * n_cols, img_size * n_rows), dpi=750) + # find the knee/elbow + knee = kneed.KneeLocator(inertia['number'], inertia['inertia'], curve='convex', direction='decreasing').knee - fig.suptitle("KShape Clustering") - assigned_clusters = ks.labels_ - for i in range(n_clusters): - plt.subplot(n_rows, n_cols, i + 1) - for j in time_series[assigned_clusters == i]: - plt.plot(j.ravel(), "k-", alpha=.2) - plt.plot(ks.cluster_centers_[i].ravel(), "r-") - plt.xlim(0, size) - plt.ylim(-2, 4) - plt.text(0.55, 0.85, f'Cluster {i + 1}', transform=plt.gca().transAxes) - plt.tight_layout() - fig.savefig(os.path.join(workdir, 'kmeans_outputs', f'{dataset}-kshape-{n_clusters}_clusters.png')) - plt.close(fig) + # save the best fitting cluster counts to a csv + with open(os.path.join(workdir, 'kmeans_outputs', cluster_count_file), 'w') as f: + f.write(json.dumps({'historical': int(knee)})) return @@ -109,7 +72,7 @@ def summarize(workdir: str) -> pd.DataFrame: # open the optimal model pickle file optimal_model = os.path.join(workdir, 'kmeans_models', f'{dataset}-{cluster_count}-clusters-model.pickle') - sim_labels = KShape.from_pickle(optimal_model).labels_.tolist() + sim_labels = TimeSeriesKMeans.from_pickle(optimal_model).labels_.tolist() # create a dataframe of the ids and their labels (assigned groups) df = pd.DataFrame(np.transpose([sim_labels, ids]), columns=[f'{dataset}-cluster', merge_col]) @@ -120,3 +83,30 @@ def summarize(workdir: str) -> pd.DataFrame: assign_table = assign_table.merge(df, how='outer', on=merge_col) return assign_table + + +def plot(workdir: str): + # read the fdc's + x = pd.read_parquet(os.path.join(workdir, 'kmeans_outputs')) + # generate a plot of the clusters + size = time_series.shape[1] + # up to 3 cols, rows determined by number of columns + n_cols = min(n_clusters, 3) + n_rows = math.ceil(n_clusters / n_cols) + img_size = 2.5 + fig = plt.figure(figsize=(img_size * n_cols, img_size * n_rows), dpi=750) + + fig.suptitle("TimeSeriesKMeans Clustering") + assigned_clusters = ks.labels_ + for i in range(n_clusters): + plt.subplot(n_rows, n_cols, i + 1) + for j in time_series[assigned_clusters == i]: + plt.plot(j.ravel(), "k-", alpha=.2) + plt.plot(ks.cluster_centers_[i].ravel(), "r-") + plt.xlim(0, size) + plt.ylim(-2, 4) + plt.text(0.55, 0.85, f'Cluster {i + 1}', transform=plt.gca().transAxes) + plt.tight_layout() + fig.savefig(os.path.join(workdir, 'kmeans_outputs', f'{dataset}-kshape-{n_clusters}_clusters.png')) + plt.close(fig) + return \ No newline at end of file diff --git a/saber/prep.py b/saber/prep.py index 
56df8d7..04882c5 100755 --- a/saber/prep.py +++ b/saber/prep.py @@ -32,7 +32,7 @@ def gis_tables(workdir: str, gauge_gis: str = None, drain_gis: str = None) -> No os.path.join(workdir, 'tables', 'gauge_table.parquet.gzip'), compression='gzip') if drain_gis is not None: gdf = gpd.read_file(drain_gis).drop('geometry', axis=1) - gdf[['x', 'y']] = gdf.geometry.apply(lambda x: [x.x, x.y]) + gdf[['centroid_x', 'centroid_y']] = gdf.geometry.apply(lambda x: [x.x, x.y]) pd.DataFrame(gdf.drop('geometry', axis=1)).to_parquet( os.path.join(workdir, 'tables', 'drain_table.parquet.gzip'), compression='gzip') return From ac3392fc869b701b009dd1c063499bb00672942b Mon Sep 17 00:00:00 2001 From: rileyhales Date: Sun, 21 Aug 2022 15:40:04 -0600 Subject: [PATCH 08/24] plot cluster figures in separate function --- conda-environment.yml | 1 + requirements.txt | 3 +- saber/_vocab.py | 8 +-- saber/cluster.py | 110 +++++++++++++++++++++++++++--------------- saber/prep.py | 24 ++++----- 5 files changed, 91 insertions(+), 55 deletions(-) diff --git a/conda-environment.yml b/conda-environment.yml index ec661ed..2a9783c 100644 --- a/conda-environment.yml +++ b/conda-environment.yml @@ -18,3 +18,4 @@ dependencies: - geoglows - requests - scipy + - natsort diff --git a/requirements.txt b/requirements.txt index b833ca8..99ea19d 100644 --- a/requirements.txt +++ b/requirements.txt @@ -11,4 +11,5 @@ kneed tslearn matplotlib requests -scipy \ No newline at end of file +scipy +natsort \ No newline at end of file diff --git a/saber/_vocab.py b/saber/_vocab.py index 44c4f3c..923c3f1 100644 --- a/saber/_vocab.py +++ b/saber/_vocab.py @@ -22,10 +22,10 @@ # name of some files produced by the algorithm cluster_count_file = 'best-fit-cluster-count.json' cal_nc_name = 'calibrated_simulated_flow.nc' -hindcast_table = 'hindcast_series_table.parquet.gzip' -hindcast_fdc_table = 'hindcast_fdc_table.parquet.gzip' -drain_table = 'drain_table.parquet.gzip' -gauge_table = 'gauge_table.parquet.gzip' +hindcast_table = 'hindcast_series_table.parquet' +hindcast_fdc_table = 'hindcast_fdc_table.parquet' +drain_table = 'drain_table.parquet' +gauge_table = 'gauge_table.parquet' # metrics computed on validation sets metric_list = ['ME', 'MAE', 'RMSE', 'MAPE', 'NSE', 'KGE (2012)'] diff --git a/saber/cluster.py b/saber/cluster.py index 4827c8b..a3e1582 100755 --- a/saber/cluster.py +++ b/saber/cluster.py @@ -1,12 +1,14 @@ import math import os import json +import glob import kneed import matplotlib.pyplot as plt import numpy as np import pandas as pd -from tslearn.clustering import TimeSeriesKMeans +from tslearn.clustering import TimeSeriesKMeans as Cluster +from natsort import natsorted from ._vocab import cluster_count_file from ._vocab import mid_col @@ -26,19 +28,21 @@ def generate(workdir: str) -> None: inertia = {'number': [], 'inertia': [], 'n_iter': []} # read the prepared data (array x) - x = pd.read_parquet(os.path.join(workdir, 'tables', 'hindcast_fdc_transformed.parquet.gzip')) + x = pd.read_parquet(os.path.join(workdir, 'tables', 'hindcast_fdc_transformed.parquet')) + x = x.values # build the kmeans model for a range of cluster numbers for n_clusters in range(1, 17): - ks = TimeSeriesKMeans(n_clusters=n_clusters, max_iter=150) + print(n_clusters) + ks = Cluster(n_clusters=n_clusters, max_iter=150) ks.fit_predict(x) - ks.to_pickle(os.path.join(workdir, 'kmeans_outputs', f'kshape-{n_clusters}.pickle')) + ks.to_pickle(os.path.join(workdir, 'kmeans_outputs', f'kmeans-{n_clusters}.pickle')) inertia['number'].append(n_clusters) 
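# Quick sketch of how an inertia curve like the one collected above can be reduced to a single
# suggested cluster count with kneed; the inertia values below are invented for illustration.
import kneed

n_clusters_tried = list(range(2, 13))
inertias = [900, 510, 330, 250, 205, 185, 172, 165, 160, 157, 155]
best = kneed.KneeLocator(n_clusters_tried, inertias, curve='convex', direction='decreasing').knee
print(f'elbow suggests {best} clusters')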
inertia['inertia'].append(ks.inertia_) inertia['n_iter'].append(ks.n_iter_) # save the inertia results as a csv - pd.DataFrame.from_dict(inertia).to_parquet(os.path.join(workdir, 'kmeans_outputs', f'cluster-inertia.csv')) + pd.DataFrame.from_dict(inertia).to_csv(os.path.join(workdir, 'kmeans_outputs', f'cluster-inertia.csv')) # find the knee/elbow knee = kneed.KneeLocator(inertia['number'], inertia['inertia'], curve='convex', direction='decreasing').knee @@ -49,21 +53,78 @@ def generate(workdir: str) -> None: return -def summarize(workdir: str) -> pd.DataFrame: +def plot(workdir: str) -> None: + """ + Generate figures of the clustered FDC's + + Args: + workdir: path to the project directory + + Returns: + None + """ + # image generating params + img_width = 3 + img_height = 3 + max_cols = 3 + + # read the fdc's + x = pd.read_parquet(os.path.join(workdir, 'tables', 'hindcast_fdc_transformed.parquet')).values + size = x.shape[1] + x_values = np.linspace(0, size, 5) + x_ticks = np.linspace(0, 100, 5).astype(int) + + for model_pickle in natsorted(glob.glob(os.path.join(workdir, 'kmeans_outputs', 'kmeans-*.pickle'))): + kmeans = Cluster.from_pickle(model_pickle) + n_clusters = int(kmeans.n_clusters) + n_cols = min(n_clusters, max_cols) + n_rows = math.ceil(n_clusters / n_cols) + + fig, axs = plt.subplots(n_rows, n_cols, figsize=(img_width * n_cols + 1, img_height * n_rows + 1), dpi=800, + squeeze=False, tight_layout=True, sharey=True) + fig.suptitle("KMeans FDC Clustering") + fig.supxlabel('Exceedance Probability (%)') + fig.supylabel('Discharge Z-Score') + + for i, ax in enumerate(fig.axes[:n_clusters]): + ax.set_title(f'Cluster {i + 1}') + ax.set_xlim(0, size) + ax.set_xticks(x_values, x_ticks) + ax.set_ylim(-2, 4) + for j in x[kmeans.labels_ == i]: + ax.plot(j.ravel(), "k-", alpha=.15) + ax.plot(kmeans.cluster_centers_[i].flatten(), "r-") + # turn off plotting axes which are blank - made for the square grid but > n_clusters + for ax in fig.axes[n_clusters:]: + ax.axis('off') + + fig.savefig(os.path.join(workdir, 'kmeans_outputs', f'kmeans-{n_clusters}.png')) + plt.close(fig) + return + + +def summarize(workdir: str, assign_table: pd.DataFrame, n_clusters: int = None) -> pd.DataFrame: """ Creates a csv listing the streams assigned to each cluster in workdir/kmeans_models and also adds that information to assign_table.csv Args: workdir: path to the project directory + assign_table: the assignment table DataFrame + n_clusters: number of clusters to use when applying the labels to the assign_table Returns: None """ - # read the cluster results csv - with open(os.path.join(workdir, 'kmeans_outputs', cluster_count_file), 'r') as f: - clusters = json.loads(f.read()) - + if n_clusters is None: + # read the cluster results csv + with open(os.path.join(workdir, 'kmeans_outputs', cluster_count_file), 'r') as f: + n_clusters = int(json.loads(f.read())['historical']) + + pd.DataFrame({ + 'cluster': Cluster.from_pickle(os.path.join(workdir, 'kmeans_outputs', f'kmeans-{n_clusters}.pickle')).labels_.flatten(), + 'model_id': pd.read_parquet(os.path.join(workdir, 'tables', 'model_ids.parquet')).values.flatten() + }) for dataset, cluster_count in clusters.items(): # read the list of simulated id's, pair them with their cluster label, save to df merge_col = mid_col if "sim" in dataset else gid_col @@ -72,7 +133,7 @@ def summarize(workdir: str) -> pd.DataFrame: # open the optimal model pickle file optimal_model = os.path.join(workdir, 'kmeans_models', f'{dataset}-{cluster_count}-clusters-model.pickle') - 
sim_labels = TimeSeriesKMeans.from_pickle(optimal_model).labels_.tolist() + sim_labels = Cluster.from_pickle(optimal_model).labels_.tolist() # create a dataframe of the ids and their labels (assigned groups) df = pd.DataFrame(np.transpose([sim_labels, ids]), columns=[f'{dataset}-cluster', merge_col]) @@ -83,30 +144,3 @@ def summarize(workdir: str) -> pd.DataFrame: assign_table = assign_table.merge(df, how='outer', on=merge_col) return assign_table - - -def plot(workdir: str): - # read the fdc's - x = pd.read_parquet(os.path.join(workdir, 'kmeans_outputs')) - # generate a plot of the clusters - size = time_series.shape[1] - # up to 3 cols, rows determined by number of columns - n_cols = min(n_clusters, 3) - n_rows = math.ceil(n_clusters / n_cols) - img_size = 2.5 - fig = plt.figure(figsize=(img_size * n_cols, img_size * n_rows), dpi=750) - - fig.suptitle("TimeSeriesKMeans Clustering") - assigned_clusters = ks.labels_ - for i in range(n_clusters): - plt.subplot(n_rows, n_cols, i + 1) - for j in time_series[assigned_clusters == i]: - plt.plot(j.ravel(), "k-", alpha=.2) - plt.plot(ks.cluster_centers_[i].ravel(), "r-") - plt.xlim(0, size) - plt.ylim(-2, 4) - plt.text(0.55, 0.85, f'Cluster {i + 1}', transform=plt.gca().transAxes) - plt.tight_layout() - fig.savefig(os.path.join(workdir, 'kmeans_outputs', f'{dataset}-kshape-{n_clusters}_clusters.png')) - plt.close(fig) - return \ No newline at end of file diff --git a/saber/prep.py b/saber/prep.py index 04882c5..0890bd3 100755 --- a/saber/prep.py +++ b/saber/prep.py @@ -29,12 +29,13 @@ def gis_tables(workdir: str, gauge_gis: str = None, drain_gis: str = None) -> No if gauge_gis is not None: gdf = gpd.read_file(gauge_gis).drop('geometry', axis=1) pd.DataFrame(gdf.drop('geometry', axis=1)).to_parquet( - os.path.join(workdir, 'tables', 'gauge_table.parquet.gzip'), compression='gzip') + os.path.join(workdir, 'tables', 'gauge_table.parquet')) if drain_gis is not None: - gdf = gpd.read_file(drain_gis).drop('geometry', axis=1) - gdf[['centroid_x', 'centroid_y']] = gdf.geometry.apply(lambda x: [x.x, x.y]) - pd.DataFrame(gdf.drop('geometry', axis=1)).to_parquet( - os.path.join(workdir, 'tables', 'drain_table.parquet.gzip'), compression='gzip') + gdf = gpd.read_file(drain_gis) + gdf['centroid_x'] = gdf.geometry.centroid.x + gdf['centroid_y'] = gdf.geometry.centroid.y + gdf = gdf.drop('geometry', axis=1) + pd.DataFrame(gdf).to_parquet(os.path.join(workdir, 'tables', 'drain_table.parquet')) return @@ -64,8 +65,7 @@ def hindcast(workdir: str, hind_nc_path: str = None, ) -> None: ids = ids[ids_selector].astype(str).values.flatten() # save the model ids to table for reference - pd.DataFrame(ids, columns=['model_id', ]).to_parquet( - os.path.join(workdir, 'tables', 'model_ids.parquet.gzip'), compression='gzip') + pd.DataFrame(ids, columns=['model_id', ]).to_parquet(os.path.join(workdir, 'tables', 'model_ids.parquet')) # save the hindcast series to parquet df = pd.DataFrame( @@ -75,20 +75,20 @@ def hindcast(workdir: str, hind_nc_path: str = None, ) -> None: ) df = df[df.index.year >= 1980] df.index.name = 'datetime' - df.to_parquet(get_table_path(workdir, 'hindcast_series'), compression='gzip') + df.to_parquet(get_table_path(workdir, 'hindcast_series')) # calculate the FDC and save to parquet exceed_prob = np.linspace(0, 100, 201)[::-1] df = df.apply(lambda x: np.transpose(np.nanpercentile(x, exceed_prob))) df.index = exceed_prob df.index.name = 'exceed_prob' - df.to_parquet(get_table_path(workdir, 'hindcast_fdc'), compression='gzip') + 
df.to_parquet(get_table_path(workdir, 'hindcast_fdc')) # transform and prepare for clustering - df = pd.DataFrame(TimeSeriesScalerMeanVariance().fit_transform(np.transpose(df.values))) + df = pd.DataFrame(np.squeeze(TimeSeriesScalerMeanVariance().fit_transform(np.squeeze(np.transpose(df.values))))) df.index = ids - df.to_parquet(os.path.join(workdir, 'tables', 'hindcast_fdc_transformed.parquet.gzip'), compression='gzip') - + df.columns = df.columns.astype(str) + df.to_parquet(os.path.join(workdir, 'tables', 'hindcast_fdc_transformed.parquet')) return From 8b60508a2534f94c5a78bda9c753befe0ab6e7af Mon Sep 17 00:00:00 2001 From: rileyhales Date: Tue, 23 Aug 2022 20:39:23 -0600 Subject: [PATCH 09/24] remove gauge clustering --- saber/__init__.py | 4 +--- saber/_workflow.py | 43 ------------------------------------------- saber/assign.py | 28 ++++------------------------ saber/cluster.py | 30 +++++++----------------------- saber/gis.py | 40 ++++++++++++++-------------------------- 5 files changed, 26 insertions(+), 119 deletions(-) delete mode 100644 saber/_workflow.py diff --git a/saber/__init__.py b/saber/__init__.py index 0303a83..d6a5a09 100644 --- a/saber/__init__.py +++ b/saber/__init__.py @@ -1,4 +1,3 @@ -from saber._workflow import analyze_region from saber._calibrate import calibrate, calibrate_region import saber.table @@ -13,8 +12,7 @@ import saber._vocab -__all__ = ['analyze_region', - 'calibrate', 'calibrate_region', +__all__ = ['calibrate', 'calibrate_region', 'table', 'prep', 'assign', 'gis', 'cluster', 'utils', 'validate', 'analysis'] __author__ = 'Riley Hales' __version__ = '0.4.0' diff --git a/saber/_workflow.py b/saber/_workflow.py deleted file mode 100644 index 494b693..0000000 --- a/saber/_workflow.py +++ /dev/null @@ -1,43 +0,0 @@ -import pandas as pd - -from .prep import hindcast - -from .table import gen, cache -from .cluster import generate, summarize -from .assign import gauged, propagation, clusters_by_dist -from .gis import clip_by_assignment, clip_by_cluster, clip_by_unassigned -from ._calibrate import calibrate_region - - -def analyze_region(workdir: str, drain_shape: str, gauge_table: pd.DataFrame = None, obs_data_dir: str = None) -> None: - # Generate the assignments table - print("gen assign_table") - assign_table = gen(workdir) - cache(workdir, assign_table) - - # Generate the clusters using the historical simulation data - print("working on clustering") - generate(workdir) - assign_table = summarize(workdir, assign_table) - cache(workdir, assign_table) - - # Assign basins which are gauged and propagate those gauges - print("make assignments") - assign_table = gauged(assign_table) - assign_table = propagation(assign_table) - assign_table = clusters_by_dist(assign_table) - - # Cache the assignments table with the updates - print("cache table results") - cache(workdir, assign_table) - - # Generate GIS files so you can go explore your progress graphically - print("generate gis files") - clip_by_assignment(workdir, assign_table, drain_shape) - clip_by_cluster(workdir, assign_table, drain_shape) - clip_by_unassigned(workdir, assign_table, drain_shape) - - # Compute the corrected simulation data - print("calibrating region netcdf") - calibrate_region(workdir, assign_table, gauge_table, obs_data_dir) - return diff --git a/saber/assign.py b/saber/assign.py index 54428b7..2f29353 100644 --- a/saber/assign.py +++ b/saber/assign.py @@ -24,9 +24,10 @@ def gauged(df: pd.DataFrame) -> pd.DataFrame: Copy of df with assignments made """ _df = df.copy() - 
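# Minimal illustration of the boolean-mask pattern gauged() relies on: rows that already have a
# gauge id get their ids copied into the "assigned" columns in one vectorized step. The frame and
# values below are stand-ins, not project data.
import numpy as np
import pandas as pd

df = pd.DataFrame({'model_id': ['r1', 'r2', 'r3'], 'gauge_id': ['g1', np.nan, 'g3']})
selector = ~df['gauge_id'].isna()
df.loc[selector, 'assigned_model_id'] = df['model_id']
df.loc[selector, 'assigned_gauge_id'] = df['gauge_id']
df.loc[selector, 'reason'] = 'gauged'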
_df.loc[~_df[gid_col].isna(), asgn_mid_col] = _df[mid_col] - _df.loc[~_df[gid_col].isna(), asgn_gid_col] = _df[gid_col] - _df.loc[~_df[gid_col].isna(), reason_col] = 'gauged' + selector = ~_df[gid_col].isna() + _df.loc[selector, asgn_mid_col] = _df[mid_col] + _df.loc[selector, asgn_gid_col] = _df[gid_col] + _df.loc[selector, reason_col] = 'gauged' return _df @@ -54,26 +55,6 @@ def propagation(df: pd.DataFrame, max_prop: int = 5) -> pd.DataFrame: return _df -def clusters_by_monavg(df: pd.DataFrame) -> pd.DataFrame: - """ - Assigns all possible ungauged basins a gauge that is - (1) of the same stream order - (2) in the same simulated fdc cluster - (3) in the same simulated monavg cluster (monthly averages) - (4) closest in total drainage area - - This requires matching 2 clusters. Basins in both of the same groups have high likelihood of behaving similarly. - - Args: - df: the assignments table dataframe - - Returns: - Copy of df with assignments made - """ - _df = df.copy() - return _df - - def clusters_by_dist(df: pd.DataFrame) -> pd.DataFrame: """ Assigns all possible ungauged basins a gauge that is @@ -87,7 +68,6 @@ def clusters_by_dist(df: pd.DataFrame) -> pd.DataFrame: Returns: Copy of df with assignments made """ - # todo assign based on closest upstream drainage area?? _df = df.copy() # first filter by cluster number for c_num in sorted(set(_df['sim-fdc-cluster'].values)): diff --git a/saber/cluster.py b/saber/cluster.py index a3e1582..a57ecc8 100755 --- a/saber/cluster.py +++ b/saber/cluster.py @@ -12,7 +12,6 @@ from ._vocab import cluster_count_file from ._vocab import mid_col -from ._vocab import gid_col def generate(workdir: str) -> None: @@ -121,26 +120,11 @@ def summarize(workdir: str, assign_table: pd.DataFrame, n_clusters: int = None) with open(os.path.join(workdir, 'kmeans_outputs', cluster_count_file), 'r') as f: n_clusters = int(json.loads(f.read())['historical']) - pd.DataFrame({ + # create a dataframe with the optimal model's labels and the model_id's + df = pd.DataFrame({ 'cluster': Cluster.from_pickle(os.path.join(workdir, 'kmeans_outputs', f'kmeans-{n_clusters}.pickle')).labels_.flatten(), - 'model_id': pd.read_parquet(os.path.join(workdir, 'tables', 'model_ids.parquet')).values.flatten() - }) - for dataset, cluster_count in clusters.items(): - # read the list of simulated id's, pair them with their cluster label, save to df - merge_col = mid_col if "sim" in dataset else gid_col - csv_path = os.path.join(workdir, f'data_processed', f'{dataset}.csv') - ids = pd.read_csv(csv_path, index_col=0).columns - - # open the optimal model pickle file - optimal_model = os.path.join(workdir, 'kmeans_models', f'{dataset}-{cluster_count}-clusters-model.pickle') - sim_labels = Cluster.from_pickle(optimal_model).labels_.tolist() - - # create a dataframe of the ids and their labels (assigned groups) - df = pd.DataFrame(np.transpose([sim_labels, ids]), columns=[f'{dataset}-cluster', merge_col]) - df.to_csv(os.path.join(workdir, 'kmeans_models', f'optimal-assigns-{dataset}.csv'), index=False) - - # merge the dataframes - df[merge_col] = df[merge_col].astype(int) - assign_table = assign_table.merge(df, how='outer', on=merge_col) - - return assign_table + mid_col: pd.read_parquet(os.path.join(workdir, 'tables', 'model_ids.parquet')).values.flatten() + }, dtype=str) + + # merge the dataframes + return assign_table.merge(df, how='outer', on=mid_col) diff --git a/saber/gis.py b/saber/gis.py index 78512f1..2328171 100644 --- a/saber/gis.py +++ b/saber/gis.py @@ -18,8 +18,7 @@ 
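# A rough, self-contained sketch of the clip-and-export pattern the gis helpers below follow:
# filter the drainage lines to a list of model ids and write the subset as a GeoPackage.
# The paths and id list are placeholders.
import geopandas as gpd

drain_lines = gpd.read_file('/path/to/drainage_lines.gpkg')
ids_to_keep = ['101', '102', '103']
subset = drain_lines[drain_lines['model_id'].isin(ids_to_keep)]
if not subset.empty:
    subset.to_file('/path/to/gis_outputs/assignments_gauged.gpkg', driver='GPKG')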
'validation_maps'] -def generate_all(workdir: str, assign_table: pd.DataFrame, drain_shape: str, prefix: str = '', - id_column: str = mid_col) -> None: +def generate_all(workdir: str, assign_table: pd.DataFrame, drain_shape: str, prefix: str = '') -> None: """ Runs all the clip functions which create subsets of the drainage lines GIS dataset based on how they were assigned for bias correction. @@ -29,19 +28,17 @@ def generate_all(workdir: str, assign_table: pd.DataFrame, drain_shape: str, pre assign_table: the assign_table dataframe drain_shape: path to a drainage line shapefile which can be clipped prefix: a prefix for names of the outputs to distinguish between data generated in separate instances - id_column: name of the id column in the attributes of the shape table Returns: None """ - clip_by_assignment(workdir, assign_table, drain_shape, prefix, id_column) - clip_by_cluster(workdir, assign_table, drain_shape, prefix, id_column) - clip_by_unassigned(workdir, assign_table, drain_shape, prefix, id_column) + clip_by_assignment(workdir, assign_table, drain_shape, prefix) + clip_by_cluster(workdir, assign_table, drain_shape, prefix) + clip_by_unassigned(workdir, assign_table, drain_shape, prefix) return -def clip_by_assignment(workdir: str, assign_table: pd.DataFrame, drain_shape: str, prefix: str = '', - id_column: str = mid_col) -> None: +def clip_by_assignment(workdir: str, assign_table: pd.DataFrame, drain_shape: str, prefix: str = '') -> None: """ Creates Geopackage files in workdir/gis_outputs for each unique value in the assignment column @@ -50,7 +47,6 @@ def clip_by_assignment(workdir: str, assign_table: pd.DataFrame, drain_shape: st assign_table: the assign_table dataframe drain_shape: path to a drainage line shapefile which can be clipped prefix: a prefix for names of the outputs to distinguish between data generated at separate instances - id_column: name of the id column in the attributes of the shape table Returns: None @@ -62,7 +58,7 @@ def clip_by_assignment(workdir: str, assign_table: pd.DataFrame, drain_shape: st # get the unique list of assignment reasons for reason in set(assign_table[reason_col].dropna().values): ids = assign_table[assign_table[reason_col] == reason][mid_col].values - subset = dl[dl[id_column].isin(ids)] + subset = dl[dl[mid_col].isin(ids)] name = f'{prefix}{"_" if prefix else ""}assignments_{reason}.gpkg' if subset.empty: continue @@ -71,8 +67,7 @@ def clip_by_assignment(workdir: str, assign_table: pd.DataFrame, drain_shape: st return -def clip_by_cluster(workdir: str, assign_table: pd.DataFrame, drain_shape: str, prefix: str = '', - id_column: str = mid_col) -> None: +def clip_by_cluster(workdir: str, assign_table: pd.DataFrame, drain_shape: str, prefix: str = '') -> None: """ Creates Geopackage files in workdir/gis_outputs of the drainage lines based on the fdc cluster they were assigned to @@ -81,26 +76,20 @@ def clip_by_cluster(workdir: str, assign_table: pd.DataFrame, drain_shape: str, assign_table: the assign_table dataframe drain_shape: path to a drainage line shapefile which can be clipped prefix: optional, a prefix to prepend to each created file's name - id_column: name of the id column in the attributes of the shape table Returns: None """ dl_gdf = gpd.read_file(drain_shape) - cluster_types = [a for a in assign_table if 'cluster' in a] - for ctype in cluster_types: - for gnum in sorted(set(assign_table[ctype].dropna().values)): - savepath = os.path.join(workdir, 'gis_outputs', f'{prefix}{"_" if prefix else ""}{ctype}-{int(gnum)}.gpkg') - 
ids = assign_table[assign_table[ctype] == gnum][mid_col].values - if dl_gdf[dl_gdf[id_column].isin(ids)].empty: - continue - else: - dl_gdf[dl_gdf[id_column].isin(ids)].to_file(savepath) + for num in sorted(set(assign_table[mid_col].dropna().values)): + gdf = dl_gdf[dl_gdf[mid_col].isin(assign_table[assign_table[mid_col] == num][mid_col])] + if gdf.empty: + continue + gdf.to_file(os.path.join(workdir, 'gis_outputs', f'{prefix}{"_" if prefix else ""}-{int(num)}.gpkg')) return -def clip_by_unassigned(workdir: str, assign_table: pd.DataFrame, drain_shape: str, prefix: str = '', - id_column: str = mid_col) -> None: +def clip_by_unassigned(workdir: str, assign_table: pd.DataFrame, drain_shape: str, prefix: str = '') -> None: """ Creates Geopackage files in workdir/gis_outputs of the drainage lines which haven't been assigned a gauge yet @@ -109,14 +98,13 @@ def clip_by_unassigned(workdir: str, assign_table: pd.DataFrame, drain_shape: st assign_table: the assign_table dataframe drain_shape: path to a drainage line shapefile which can be clipped prefix: optional, a prefix to prepend to each created file's name - id_column: name of the id column in the attributes of the shape table Returns: None """ dl_gdf = gpd.read_file(drain_shape) ids = assign_table[assign_table[reason_col].isna()][mid_col].values - subset = dl_gdf[dl_gdf[id_column].isin(ids)] + subset = dl_gdf[dl_gdf[mid_col].isin(ids)] if subset.empty: warnings.warn('Empty filter: No streams are unassigned') return From a63de91acab4a2560cbd0903407f04a7ab6e0674 Mon Sep 17 00:00:00 2001 From: rileyhales Date: Tue, 23 Aug 2022 20:39:49 -0600 Subject: [PATCH 10/24] delete example scripts --- .../colombia-magdalena/magdalena_example.py | 73 ------------------- examples/example_inputs.py | 16 ---- examples/example_script.py | 70 ------------------ 3 files changed, 159 deletions(-) delete mode 100644 examples/colombia-magdalena/magdalena_example.py delete mode 100644 examples/example_inputs.py delete mode 100644 examples/example_script.py diff --git a/examples/colombia-magdalena/magdalena_example.py b/examples/colombia-magdalena/magdalena_example.py deleted file mode 100644 index 2b0bf39..0000000 --- a/examples/colombia-magdalena/magdalena_example.py +++ /dev/null @@ -1,73 +0,0 @@ -import os - -import numpy as np - -import saber - - -np.seterr(all="ignore") - -workdir = '/Users/rchales/data/regional-bias-correction/colombia-magdalena' -drain_shape = os.path.join(workdir, 'gis_inputs', 'magdalena_dl_attrname_xy.json') -gauge_shape = os.path.join(workdir, 'gis_inputs', 'ideam_stations.json') -obs_data_dir = os.path.join(workdir, 'data_inputs', 'obs_csvs') - -# Only need to do this step 1x ever -# saber.prep.scaffold_working_directory(workdir) - -# Create the gauge_table and drain_table.csv -# Scripts not provided, check readme for instructions - -# Generate the assignments table -# assign_table = saber.table.gen(workdir) -# saber.table.cache(workdir, assign_table) -# Or read the existing table -# assign_table = saber.table.read(workdir) - -# Prepare the observation and simulation data -# Only need to do this step 1x ever -# saber.prep.historical_simulation(os.path.join(workdir, 'data_simulated', 'south_america_era5_qout.nc'), workdir) -# saber.prep.observation_data(workdir) - -# Generate the clusters using the historical simulation data -# saber.cluster.generate(workdir) -# assign_table = saber.cluster.summarize(workdir, assign_table) -# saber.table.cache(workdir, assign_table) - -# Assign basins which are gauged and propagate those gauges -# 
assign_table = saber.assign.gauged(assign_table) -# assign_table = saber.assign.propagation(assign_table) -# assign_table = saber.assign.clusters_by_dist(assign_table) -# todo assign_table = saber.assign.clusters_by_monavg(assign_table) - -# Cache the assignments table with the updates -# saber.table.cache(workdir, assign_table) - -# Generate GIS files so you can go explore your progress graphically -# saber.gis.clip_by_assignment(workdir, assign_table, drain_shape) -# saber.gis.clip_by_cluster(workdir, assign_table, drain_shape) -# saber.gis.clip_by_unassigned(workdir, assign_table, drain_shape) - -# Compute the corrected simulation data -# assign_table = saber.table.read(workdir) -# saber.calibrate_region(workdir, assign_table) -# vtab = saber.validate.gen_val_table(workdir) -saber.gis.validation_maps(workdir, gauge_shape) -saber.analysis.plot(workdir, obs_data_dir, 9007721) - - -# import pandas as pd -# path_to_your_pickle_file = '/Users/rchales/data/regional-bias-correction/colombia-magdalena/validation_runs/90/data_processed/subset_time_series.pickle' -# a = pd.read_pickle(path_to_your_pickle_file) -# a.index = a.index.tz_localize('UTC') -# a.to_pickle(path_to_your_pickle_file) - - -# import netCDF4 as nc -# import numpy as np -# path = '/Users/rchales/data/regional-bias-correction/colombia-magdalena/validation_runs/90/calibrated_simulated_flow.nc' -# a = nc.Dataset(path, 'r+') -# a.createVariable('corrected', 'i4', ('corrected',), zlib=True, shuffle=True, fill_value=-1) -# a['corrected'][:] = np.array((0, 1)) -# a.sync() -# a.close() \ No newline at end of file diff --git a/examples/example_inputs.py b/examples/example_inputs.py deleted file mode 100644 index 4163d24..0000000 --- a/examples/example_inputs.py +++ /dev/null @@ -1,16 +0,0 @@ -import os - - -# COLOMBIA -workdir = '/Users/rchales/data/saber/colombia-magdalena' -drain_shape = os.path.join(workdir, 'gis_inputs', 'magdalena_dl_attrname_xy.json') -gauge_shape = os.path.join(workdir, 'gis_inputs', 'ideam_stations.json') -obs_data_dir = os.path.join(workdir, 'data_inputs', 'obs_csvs') -hist_sim_nc = os.path.join(workdir, 'data_inputs', 'south_america_era5_qout.nc') - -# TEXAS -workdir = '/Users/rchales/data/saber/texas' -drain_shape = os.path.join(workdir, 'shapefiles', 'texas-dl.json') -gauge_shape = os.path.join(workdir, 'shapefiles', 'texas-gauges.shp') -obs_data_dir = os.path.join(workdir, 'data_inputs', 'obs_csvs') -hist_sim_nc = os.path.join(workdir, 'data_inputs', 'Qout_era5_t640_24hr_19790101to20210630.nc.nc') diff --git a/examples/example_script.py b/examples/example_script.py deleted file mode 100644 index b114050..0000000 --- a/examples/example_script.py +++ /dev/null @@ -1,70 +0,0 @@ -import os - -import numpy as np - -import saber - - -np.seterr(all="ignore") - -# workdir = '' -# drain_shape = os.path.join(workdir, 'gis_inputs', '') -# gauge_shape = '' -# obs_data_dir = '' -# hist_sim_nc = '' - -# COLOMBIA -# workdir = '/Users/rchales/data/saber/colombia-magdalena' -# drain_shape = os.path.join(workdir, 'gis_inputs', 'magdalena_dl_attrname_xy.json') -# gauge_shape = os.path.join(workdir, 'gis_inputs', 'ideam_stations.json') -# obs_data_dir = os.path.join(workdir, 'data_inputs', 'obs_csvs') -# hist_sim_nc = os.path.join(workdir, 'data_inputs', 'south_america_era5_qout.nc') - - -# Prepare the working directory - only need to do this step 1x ever -# saber.prep.scaffold_working_directory(workdir) -# Scripts not provided. 
Consult README.md for instructions -# Create the gauge_table.csv and drain_table.csv -# Put the historical simulation netCDF in the right folder -# Put the observed data csv files in the data_inputs/obs_csvs folder - -# Prepare the observation and simulation data - Only need to do this step 1x ever -print('Preparing data') -saber.prep.hindcast(workdir) - -# Generate the assignments table -print('Generate Assignment Table') -assign_table = saber.table.gen(workdir) -saber.table.cache(workdir, assign_table) - -# Generate the clusters using the historical simulation data -print('Generate Clusters') -saber.cluster.generate(workdir) -assign_table = saber.cluster.summarize(workdir, assign_table) -saber.table.cache(workdir, assign_table) - -# Assign basins which are gauged and propagate those gauges -print('Making Assignments') -assign_table = saber.assign.gauged(assign_table) -assign_table = saber.assign.propagation(assign_table) -assign_table = saber.assign.clusters_by_dist(assign_table) - -# Cache the assignments table with the updates -saber.table.cache(workdir, assign_table) - -# Generate GIS files so you can go explore your progress graphically -print('Generate GIS files') -saber.gis.clip_by_assignment(workdir, assign_table, drain_shape) -saber.gis.clip_by_cluster(workdir, assign_table, drain_shape) -saber.gis.clip_by_unassigned(workdir, assign_table, drain_shape) - -# Compute the corrected simulation data -print('Starting Calibration') -saber.calibrate_region(workdir, assign_table) - -# run the validation study -print('Performing Validation') -saber.validate.sample_gauges(workdir, overwrite=True) -saber.validate.run_series(workdir, drain_shape, obs_data_dir) -vtab = saber.validate.gen_val_table(workdir) -saber.gis.validation_maps(workdir, gauge_shape, vtab) From 13d0f5d97894ec952a6dcd76e5c3885267bfe312 Mon Sep 17 00:00:00 2001 From: rileyhales Date: Tue, 23 Aug 2022 20:50:55 -0600 Subject: [PATCH 11/24] new example script --- README.md | 8 ++------ examples/saber_example.py | 43 +++++++++++++++++++++++++++++++++++++++ 2 files changed, 45 insertions(+), 6 deletions(-) create mode 100644 examples/saber_example.py diff --git a/README.md b/README.md index 9951d5f..7d639d0 100755 --- a/README.md +++ b/README.md @@ -104,7 +104,7 @@ Your table should look like this: | unique_stream_# | unique_stream_# | area in km^2 | stream_order | ## | ## | | ... | ... | ... | ... | ... | ... | -2. Prepare a csv of the attribute table of the gauge locations shapefile. +1. Prepare a csv of the attribute table of the gauge locations shapefile. 
- You need the columns: - model_id - gauge_id @@ -157,7 +157,7 @@ looks like this: ```python import saber as saber workdir = '/path/to/project/directory/' -saber.prep.gen_assignments_table(workdir) +saber.table.gen(workdir) ``` Your project's working directory now looks like @@ -230,10 +230,6 @@ saber.prep.hindcast( workdir, '/path/to/historical/simulation/netcdf.nc' # optional - if nc not stored in data_inputs folder ) -saber.prep.hist_sim_table( - workdir, - '/path/to/historical/simulation/netcdf.nc' # optional - if nc not stored in data_inputs folder -) saber.prep.observed_data( workdir, '/path/to/obs/csv/directory' # optional - if csvs not stored in workdir/data_inputs/obs_csvs diff --git a/examples/saber_example.py b/examples/saber_example.py new file mode 100644 index 0000000..44c57a3 --- /dev/null +++ b/examples/saber_example.py @@ -0,0 +1,43 @@ +import os + +import numpy as np +import saber + + +np.seterr(all="ignore") + +workdir = '' +hist_sim_nc = os.path.join(workdir, '') +obs_data_dir = os.path.join(workdir, '') +drain_gis = os.path.join(workdir, '') +gauge_gis = os.path.join(workdir, '') + +print('Prepare inputs') +saber.prep.gis_tables(workdir, drain_gis=drain_gis, gauge_gis=gauge_gis) +saber.prep.hindcast(workdir) + +# Generate the assignments table +print('Generate Assignment Table') +assign_table = saber.table.gen(workdir) +saber.table.cache(workdir, assign_table) + +# Generate clusters using the historical simulation data +print('Generate Clusters') +saber.cluster.generate(workdir) +saber.cluster.plot(workdir) +saber.cluster.summarize(workdir, assign_table) + +# Assign basins which are gauged and propagate those gauges +print('Making Assignments') +assign_table = saber.assign.gauged(assign_table) +assign_table = saber.assign.propagation(assign_table) +assign_table = saber.assign.clusters_by_dist(assign_table) + +# Cache the assignments table with the updates +saber.table.cache(workdir, assign_table) + +# Generate GIS files to explore your progress graphically +print('Generate GIS files') +saber.gis.clip_by_assignment(workdir, assign_table, drain_gis) +saber.gis.clip_by_cluster(workdir, assign_table, drain_gis) +saber.gis.clip_by_unassigned(workdir, assign_table, drain_gis) From b22d6dee4db2e963c4a1a6640b01d13fdda36a81 Mon Sep 17 00:00:00 2001 From: rileyhales Date: Thu, 25 Aug 2022 00:47:52 -0600 Subject: [PATCH 12/24] drop tslearn, fdc calculated in 1 percent intervals --- README.md | 9 +----- conda-environment.yml | 1 - requirements.txt | 1 - saber/cluster.py | 30 ++++++++++--------- saber/prep.py | 28 +++++++++--------- saber/table.py | 6 ++-- saber/tests/calibrate.py | 63 ---------------------------------------- saber/validate.py | 25 ++-------------- 8 files changed, 36 insertions(+), 127 deletions(-) delete mode 100644 saber/tests/calibrate.py diff --git a/README.md b/README.md index 7d639d0..19a7c1e 100755 --- a/README.md +++ b/README.md @@ -38,18 +38,11 @@ Things to check when preparing your data ## Process ### 1 Create a Working Directory -```python -import saber - -workdir = '/my/file/path' -saber.prep.scaffold_workdir(workdir) -``` - Your working directory should exactly like this. 
``` working_directory - data_inputs/ tables/ + data_inputs/ kmeans_outputs/ gis_outputs/ ``` diff --git a/conda-environment.yml b/conda-environment.yml index 2a9783c..be858a9 100644 --- a/conda-environment.yml +++ b/conda-environment.yml @@ -10,7 +10,6 @@ dependencies: - numba - contextily - netCDF4 - - tslearn - pyarrow - kneed - tslearn diff --git a/requirements.txt b/requirements.txt index 99ea19d..6cf092b 100644 --- a/requirements.txt +++ b/requirements.txt @@ -8,7 +8,6 @@ netCDF4 tslearn pyarrow kneed -tslearn matplotlib requests scipy diff --git a/saber/cluster.py b/saber/cluster.py index a57ecc8..f83aa9e 100755 --- a/saber/cluster.py +++ b/saber/cluster.py @@ -7,7 +7,8 @@ import matplotlib.pyplot as plt import numpy as np import pandas as pd -from tslearn.clustering import TimeSeriesKMeans as Cluster +from sklearn.cluster import KMeans as Clusterer +import joblib from natsort import natsorted from ._vocab import cluster_count_file @@ -24,27 +25,29 @@ def generate(workdir: str) -> None: Returns: None """ - inertia = {'number': [], 'inertia': [], 'n_iter': []} + summary = {'number': [], 'inertia': [], 'n_iter': [], 'pseudo_aic': []} # read the prepared data (array x) x = pd.read_parquet(os.path.join(workdir, 'tables', 'hindcast_fdc_transformed.parquet')) x = x.values + n_feat = x.shape[0] # build the kmeans model for a range of cluster numbers - for n_clusters in range(1, 17): + for n_clusters in range(1, 16): print(n_clusters) - ks = Cluster(n_clusters=n_clusters, max_iter=150) - ks.fit_predict(x) - ks.to_pickle(os.path.join(workdir, 'kmeans_outputs', f'kmeans-{n_clusters}.pickle')) - inertia['number'].append(n_clusters) - inertia['inertia'].append(ks.inertia_) - inertia['n_iter'].append(ks.n_iter_) + kmeans = Clusterer(n_clusters=n_clusters) + kmeans.fit_predict(x) + joblib.dump(kmeans, os.path.join(workdir, 'kmeans_outputs', f'kmeans-{n_clusters}.pickle')) + summary['number'].append(n_clusters) + summary['inertia'].append(kmeans.inertia_) + summary['pseudo_aic'].append(np.log(kmeans.inertia_ / n_feat) + ((n_feat + 2 * n_clusters) / (n_feat))) + summary['n_iter'].append(kmeans.n_iter_) # save the inertia results as a csv - pd.DataFrame.from_dict(inertia).to_csv(os.path.join(workdir, 'kmeans_outputs', f'cluster-inertia.csv')) + pd.DataFrame.from_dict(summary).to_csv(os.path.join(workdir, 'kmeans_outputs', f'clustering-summary-stats.csv')) # find the knee/elbow - knee = kneed.KneeLocator(inertia['number'], inertia['inertia'], curve='convex', direction='decreasing').knee + knee = kneed.KneeLocator(summary['number'], summary['inertia'], curve='convex', direction='decreasing').knee # save the best fitting cluster counts to a csv with open(os.path.join(workdir, 'kmeans_outputs', cluster_count_file), 'w') as f: @@ -74,7 +77,7 @@ def plot(workdir: str) -> None: x_ticks = np.linspace(0, 100, 5).astype(int) for model_pickle in natsorted(glob.glob(os.path.join(workdir, 'kmeans_outputs', 'kmeans-*.pickle'))): - kmeans = Cluster.from_pickle(model_pickle) + kmeans = joblib.load(model_pickle) n_clusters = int(kmeans.n_clusters) n_cols = min(n_clusters, max_cols) n_rows = math.ceil(n_clusters / n_cols) @@ -122,7 +125,8 @@ def summarize(workdir: str, assign_table: pd.DataFrame, n_clusters: int = None) # create a dataframe with the optimal model's labels and the model_id's df = pd.DataFrame({ - 'cluster': Cluster.from_pickle(os.path.join(workdir, 'kmeans_outputs', f'kmeans-{n_clusters}.pickle')).labels_.flatten(), + 'cluster': Clusterer.from_pickle( + os.path.join(workdir, 'kmeans_outputs', 
f'kmeans-{n_clusters}.pickle')).labels_.flatten(), mid_col: pd.read_parquet(os.path.join(workdir, 'tables', 'model_ids.parquet')).values.flatten() }, dtype=str) diff --git a/saber/prep.py b/saber/prep.py index 0890bd3..a40f332 100755 --- a/saber/prep.py +++ b/saber/prep.py @@ -4,14 +4,14 @@ import geopandas as gpd import netCDF4 as nc import numpy as np -from tslearn.preprocessing import TimeSeriesScalerMeanVariance +from sklearn.preprocessing import StandardScaler as Scalar from ._vocab import mid_col from ._vocab import read_drain_table from ._vocab import guess_hindcast_path from ._vocab import get_table_path -__all__ = ['gis_tables', 'hindcast', 'scaffold_workdir'] +__all__ = ['gis_tables', 'hindcast', 'workdir'] def gis_tables(workdir: str, gauge_gis: str = None, drain_gis: str = None) -> None: @@ -27,8 +27,7 @@ def gis_tables(workdir: str, gauge_gis: str = None, drain_gis: str = None) -> No None """ if gauge_gis is not None: - gdf = gpd.read_file(gauge_gis).drop('geometry', axis=1) - pd.DataFrame(gdf.drop('geometry', axis=1)).to_parquet( + pd.DataFrame(gpd.read_file(gauge_gis).drop('geometry', axis=1)).to_parquet( os.path.join(workdir, 'tables', 'gauge_table.parquet')) if drain_gis is not None: gdf = gpd.read_file(drain_gis) @@ -78,37 +77,36 @@ def hindcast(workdir: str, hind_nc_path: str = None, ) -> None: df.to_parquet(get_table_path(workdir, 'hindcast_series')) # calculate the FDC and save to parquet - exceed_prob = np.linspace(0, 100, 201)[::-1] + exceed_prob = np.linspace(0, 100, 101)[::-1] df = df.apply(lambda x: np.transpose(np.nanpercentile(x, exceed_prob))) df.index = exceed_prob df.index.name = 'exceed_prob' df.to_parquet(get_table_path(workdir, 'hindcast_fdc')) # transform and prepare for clustering - df = pd.DataFrame(np.squeeze(TimeSeriesScalerMeanVariance().fit_transform(np.squeeze(np.transpose(df.values))))) + df = pd.DataFrame(np.transpose(Scalar().fit_transform(np.squeeze(df.values)))) df.index = ids df.columns = df.columns.astype(str) df.to_parquet(os.path.join(workdir, 'tables', 'hindcast_fdc_transformed.parquet')) return -def scaffold_workdir(path: str, include_validation: bool = True) -> None: +def workdir(path: str, include_validation: bool = True) -> None: """ Creates the correct directories for a Saber project within the specified directory Args: - path: the path to a directory where you want to create directories + path: the path to a directory where you want to create workdir subdirectories include_validation: boolean, indicates whether to create the validation folder Returns: None """ - if not os.path.isdir(path): - os.mkdir(path) - os.mkdir(os.path.join(path, 'tables')) - os.mkdir(os.path.join(path, 'inputs')) - os.mkdir(os.path.join(path, 'gis_outputs')) - os.mkdir(os.path.join(path, 'kmeans_outputs')) + dir_list = ['tables', 'data_inputs', 'gis_outputs', 'kmeans_outputs'] if include_validation: - os.mkdir(os.path.join(path, 'validation_runs')) + dir_list.append('validation_runs') + for d in dir_list: + p = os.path.join(path, d) + if not os.path.exists(p): + os.mkdir(p) return diff --git a/saber/table.py b/saber/table.py index 01d9c9b..f1aafe4 100644 --- a/saber/table.py +++ b/saber/table.py @@ -35,8 +35,8 @@ def gen(workdir) -> pd.DataFrame: None """ # read and merge the tables - drain_df = pd.read_parquet(os.path.join(workdir, 'tables', 'drain_table.parquet.gzip')) - gauge_df = pd.read_parquet(os.path.join(workdir, 'tables', 'gauge_table.parquet.gzip')) + drain_df = pd.read_parquet(os.path.join(workdir, 'tables', 'drain_table.parquet')) + gauge_df = 
pd.read_parquet(os.path.join(workdir, 'tables', 'gauge_table.parquet')) assign_table = pd.merge(drain_df, gauge_df, on=mid_col, how='outer') # create the new columns @@ -75,6 +75,6 @@ def cache(workdir: str, assign_table: pd.DataFrame) -> None: Returns: None """ - assign_table.to_parquet(get_path(workdir), compression='gzip') + assign_table.to_parquet(get_path(workdir)) return diff --git a/saber/tests/calibrate.py b/saber/tests/calibrate.py deleted file mode 100644 index c58fa40..0000000 --- a/saber/tests/calibrate.py +++ /dev/null @@ -1,63 +0,0 @@ -from io import StringIO - -import geoglows -import pandas as pd -import requests - - -def collect_data(start_id, start_ideam_id, downstream_id, downstream_ideam_id): - # Upstream simulated flow - start = geoglows.streamflow.historic_simulation(start_id) - # Upstream observed flow - start_ideam = get_ideam_flow(start_ideam_id) - start_ideam.dropna(inplace=True) - - # Downstream simulated flow - downstream = geoglows.streamflow.historic_simulation(downstream_id) - # Downstream observed flow - downstream_ideam = get_ideam_flow(downstream_ideam_id) - downstream_ideam.dropna(inplace=True) - # Downstream bias corrected flow (for comparison to the data_4_assign_propagation method - downstream_bc = geoglows.bias.correct_historical(downstream, downstream_ideam) - - # Export all as csv - start.to_csv('start_flow.csv') - start_ideam.to_csv('start_ideam_flow.csv') - downstream.to_csv('downstream_flow.csv') - downstream_ideam.to_csv('downstream_ideam_flow.csv') - downstream_bc.to_csv('downstream_bc_flow.csv') - return - - -def get_ideam_flow(id): - # get the gauged data - url = f'https://www.hydroshare.org/resource/d222676fbd984a81911761ca1ba936bf/' \ - f'data/contents/Discharge_Data/{id}.csv' - df = pd.read_csv(StringIO(requests.get(url).text), index_col=0) - df.index = pd.to_datetime(df.index).tz_localize('UTC') - return df - -# collect_data(9012999, 22057070, 9012650, 22057010) -# collect_data(9017261, 32037030, 9015333, 32097010) # really long range down stream -# collect_data(9009660, 21237020, 9007292, 23097040) # large river -# collect_data(9007292, 23097040, 9009660, 21237020) # large river backwards (going upstream) - -# Read all as csv -start_flow = pd.read_csv('data_4_assign_propagation/start_flow.csv', index_col=0) -start_ideam_flow = pd.read_csv('data_4_assign_propagation/start_ideam_flow.csv', index_col=0) -downstream_flow = pd.read_csv('data_4_assign_propagation/downstream_flow.csv', index_col=0) -downstream_ideam_flow = pd.read_csv('data_4_assign_propagation/downstream_ideam_flow.csv', index_col=0) -downstream_bc_flow = pd.read_csv('data_4_assign_propagation/downstream_bc_flow.csv', index_col=0) -start_flow.index = pd.to_datetime(start_flow.index) -start_ideam_flow.index = pd.to_datetime(start_ideam_flow.index) -downstream_flow.index = pd.to_datetime(downstream_flow.index) -downstream_ideam_flow.index = pd.to_datetime(downstream_ideam_flow.index) -downstream_bc_flow.index = pd.to_datetime(downstream_bc_flow.index) - -downstream_prop_correct = rbc_calibrate(start_flow, start_ideam_flow, downstream_flow, - fit_gumbel=True, gumbel_range=(25, 75)) -plot_results(downstream_flow, downstream_ideam_flow, downstream_bc_flow, downstream_prop_correct, - f'Correct Monthly - Force Gumbel Distribution') -del downstream_prop_correct['Scalars'], downstream_prop_correct['Percentile'] -statistics_tables(downstream_prop_correct, downstream_flow, downstream_ideam_flow).to_csv( - 'data_4_assign_propagation/stats_test.csv') diff --git a/saber/validate.py 
b/saber/validate.py index f4969a2..c27198f 100644 --- a/saber/validate.py +++ b/saber/validate.py @@ -6,8 +6,7 @@ import netCDF4 as nc import numpy as np -from .prep import scaffold_workdir -from ._workflow import analyze_region +from .prep import workdir from ._vocab import mid_col from ._vocab import gid_col from ._vocab import metric_nc_name_list @@ -45,7 +44,7 @@ def sample_gauges(workdir: str, overwrite: bool = False) -> None: # create the new project working directory os.mkdir(subpath) - scaffold_workdir(subpath, include_validation=False) + workdir(subpath, include_validation=False) # overwrite the processed data directory so we don't need to redo this each time shutil.copytree( @@ -70,26 +69,6 @@ def sample_gauges(workdir: str, overwrite: bool = False) -> None: return -def run_series(workdir: str, drain_shape: str, obs_data_dir: str = None) -> None: - """ - Runs saber.analyze_region on each project in the validation_runs directory - - Args: - workdir: the project working directory - drain_shape: path to the drainage line gis file - obs_data_dir: path to the directory containing the observed data - - Returns: - None - """ - gauge_table = pd.read_csv(os.path.join(workdir, 'gis_inputs', 'gauge_table.csv')) - val_workdirs = [i for i in glob.glob(os.path.join(workdir, 'validation_runs', '*')) if os.path.isdir(i)] - for val_workdir in val_workdirs: - print(f'\n\n\t\t\tworking on {val_workdir}\n\n') - analyze_region(val_workdir, drain_shape, gauge_table, obs_data_dir) - return - - def gen_val_table(workdir: str) -> pd.DataFrame: """ Prepares the validation summary table that contains the list of gauged rivers plus their statistics computed in From b7f3402c7b49758f4d399b18b193fd44e3c1d1f6 Mon Sep 17 00:00:00 2001 From: rileyhales Date: Thu, 25 Aug 2022 00:49:31 -0600 Subject: [PATCH 13/24] drop geoglows dependency --- conda-environment.yml | 3 +-- requirements.txt | 4 ++-- 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/conda-environment.yml b/conda-environment.yml index be858a9..5111979 100644 --- a/conda-environment.yml +++ b/conda-environment.yml @@ -12,9 +12,8 @@ dependencies: - netCDF4 - pyarrow - kneed - - tslearn - matplotlib - - geoglows - requests - scipy - natsort + - scikit-learn diff --git a/requirements.txt b/requirements.txt index 6cf092b..7d43255 100644 --- a/requirements.txt +++ b/requirements.txt @@ -5,10 +5,10 @@ numpy numba contextily netCDF4 -tslearn pyarrow kneed matplotlib requests scipy -natsort \ No newline at end of file +natsort +scikit-learn \ No newline at end of file From 14c24a0fc71a6c20fdfb25c47b33fd6f4a02345c Mon Sep 17 00:00:00 2001 From: rileyhales Date: Thu, 25 Aug 2022 16:15:28 -0600 Subject: [PATCH 14/24] split cluster functions, code formatting, mark todo's --- README.md | 34 +++++----- saber/_vocab.py | 5 +- saber/assign.py | 11 ++-- saber/cluster.py | 162 ++++++++++++++++++++++++++++++++++++++-------- saber/gis.py | 55 ++++++++-------- saber/prep.py | 18 ++++-- saber/table.py | 4 +- saber/validate.py | 7 +- 8 files changed, 203 insertions(+), 93 deletions(-) diff --git a/README.md b/README.md index 19a7c1e..e4016a9 100755 --- a/README.md +++ b/README.md @@ -16,10 +16,10 @@ You need the following data to follow this procedure. Geopackage format GIS data equivalent formats that can be read by `geopandas`. 1. Geopackage drainage Lines (usually delineated center lines) and catchments/subbasins (polygons) in the watershed. 
The attribute tables for both should contain (at least) the following entries for each feature: - - An identifier column (alphanumeric) labeled `model_id`. - - The ID of the next downstream reach/subbasin - - The stream order of each reach/subbasin - - Cumulative upstream drainage area + - An identifier column (alphanumeric) labeled `model_id` + - The ID of the next downstream reach/subbasin labeled `downstream_model_id` + - The stream order of each reach/subbasin labeled `order` + - Cumulative upstream drainage area labeled - The x coordinate of the centroid of each feature (precalculated for faster results later) - The y coordinate of the centroid of each feature (precalculated for faster results later) 2. Geopackage points representing the location of each of the river gauging stations available. The attribute table @@ -48,7 +48,7 @@ working_directory ``` ### 2 Prepare Spatial Data (scripts not provided) -This step instructs you to collect 3 gis files and use them to generate 2 csv tables. All 5 files (3 gis files and 2 +This step instructs you to collect 3 gis files and use them to generate 2 tables. All 5 files (3 gis files and 2 tables) should go in the `gis_inputs` directory 1. Clip model drainage lines and catchments shapefile to extents of the region of interest. @@ -90,12 +90,12 @@ gdf.to_file('/file/path/to/save', driver='GeoJSON') Your table should look like this: -| downstream_model_id | model_id | drainage_area_mod | stream_order | x | y | -|---------------------|-----------------|-------------------|--------------|-----|-----| -| unique_stream_# | unique_stream_# | area in km^2 | stream_order | ## | ## | -| unique_stream_# | unique_stream_# | area in km^2 | stream_order | ## | ## | -| unique_stream_# | unique_stream_# | area in km^2 | stream_order | ## | ## | -| ... | ... | ... | ... | ... | ... | +| downstream_model_id | model_id | model_drain_area | stream_order | x | y | +|---------------------|-----------------|------------------|--------------|-----|-----| +| unique_stream_# | unique_stream_# | area in km^2 | stream_order | ## | ## | +| unique_stream_# | unique_stream_# | area in km^2 | stream_order | ## | ## | +| unique_stream_# | unique_stream_# | area in km^2 | stream_order | ## | ## | +| ... | ... | ... | ... | ... | ... | 1. Prepare a csv of the attribute table of the gauge locations shapefile. - You need the columns: @@ -105,12 +105,12 @@ Your table should look like this: Your table should look like this (column order is irrelevant): -| model_id | drainage_area_obs | gauge_id | -|-------------------|-------------------|------------------| -| unique_stream_num | area in km^2 | unique_gauge_num | -| unique_stream_num | area in km^2 | unique_gauge_num | -| unique_stream_num | area in km^2 | unique_gauge_num | -| ... | ... | ... | +| model_id | gauge_drain_area | gauge_id | +|-------------------|------------------|------------------| +| unique_stream_num | area in km^2 | unique_gauge_num | +| unique_stream_num | area in km^2 | unique_gauge_num | +| unique_stream_num | area in km^2 | unique_gauge_num | +| ... | ... | ... | Your project's working directory now looks like ``` diff --git a/saber/_vocab.py b/saber/_vocab.py index 923c3f1..43d130b 100644 --- a/saber/_vocab.py +++ b/saber/_vocab.py @@ -2,9 +2,10 @@ import glob import pandas as pd +# todo enforce using this module for retrieving names?? 
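As a rough illustration of the attribute tables described above, the drainage-line table could be assembled with `geopandas` along these lines; the input path, the drainage-area column name, and the output location are placeholders rather than part of the package:

```python
import geopandas as gpd
import pandas as pd

# Hypothetical input: the clipped drainage-line geopackage for the region
drain_gdf = gpd.read_file('/path/to/drainageline_subset.gpkg')

# Precompute centroid coordinates so later steps can work without the geometry
drain_gdf['x'] = drain_gdf.geometry.centroid.x
drain_gdf['y'] = drain_gdf.geometry.centroid.y

# Keep only the attribute columns the workflow expects and drop the geometry;
# 'model_drain_area' stands in for whatever the drainage-area field is called
cols = ['model_id', 'downstream_model_id', 'order', 'model_drain_area', 'x', 'y']
pd.DataFrame(drain_gdf[cols]).to_parquet('/path/to/working/directory/tables/drain_table.parquet')
```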
+ # assign table and gis_input file required column names -# mid_col = 'model_id' -mid_col = 'COMID' +mid_col = 'model_id' gid_col = 'gauge_id' asgn_mid_col = 'assigned_model_id' asgn_gid_col = 'assigned_gauge_id' diff --git a/saber/assign.py b/saber/assign.py index 2f29353..9205793 100644 --- a/saber/assign.py +++ b/saber/assign.py @@ -1,16 +1,15 @@ import numpy as np import pandas as pd +from ._propagation import propagate_in_table from ._propagation import walk_downstream from ._propagation import walk_upstream -from ._propagation import propagate_in_table - -from ._vocab import mid_col -from ._vocab import gid_col -from ._vocab import asgn_mid_col from ._vocab import asgn_gid_col -from ._vocab import reason_col +from ._vocab import asgn_mid_col +from ._vocab import gid_col +from ._vocab import mid_col from ._vocab import order_col +from ._vocab import reason_col def gauged(df: pd.DataFrame) -> pd.DataFrame: diff --git a/saber/cluster.py b/saber/cluster.py index f83aa9e..b3c4f96 100755 --- a/saber/cluster.py +++ b/saber/cluster.py @@ -1,50 +1,97 @@ +import glob +import json import math import os -import json -import glob +from collections.abc import Iterable +import joblib import kneed import matplotlib.pyplot as plt import numpy as np import pandas as pd -from sklearn.cluster import KMeans as Clusterer -import joblib from natsort import natsorted +from sklearn.cluster import KMeans as Clusterer +from sklearn.metrics import silhouette_samples from ._vocab import cluster_count_file from ._vocab import mid_col +__all__ = ['generate', 'summarize', 'plot_clusters', 'plot_silhouette', 'merge_assign_table'] -def generate(workdir: str) -> None: + +def generate(workdir: str, max_clusters: int = 12) -> None: """ Trains kmeans clustering models, saves the model as pickle, generates images and supplementary files Args: workdir: path to the project directory + max_clusters: maximum number of clusters to train Returns: None """ - summary = {'number': [], 'inertia': [], 'n_iter': [], 'pseudo_aic': []} - # read the prepared data (array x) x = pd.read_parquet(os.path.join(workdir, 'tables', 'hindcast_fdc_transformed.parquet')) x = x.values - n_feat = x.shape[0] # build the kmeans model for a range of cluster numbers - for n_clusters in range(1, 16): + for n_clusters in range(2, max_clusters + 1): print(n_clusters) kmeans = Clusterer(n_clusters=n_clusters) kmeans.fit_predict(x) joblib.dump(kmeans, os.path.join(workdir, 'kmeans_outputs', f'kmeans-{n_clusters}.pickle')) + return + + +def summarize(workdir: str) -> None: + """ + Generate a summary of the clustering results, calculate the silhouette score, save the centers and labels to parquet + + Args: + workdir: path to the project directory + + Returns: + + """ + summary = {'number': [], 'inertia': [], 'n_iter': [], 'silhouette': []} + silhouette_scores = [] + labels = [] + + # read the prepared data (array x) + x = pd.read_parquet(os.path.join(workdir, 'tables', 'hindcast_fdc_transformed.parquet')) + x = x.values + + for model_file in natsorted(glob.glob(os.path.join(workdir, 'kmeans_outputs', 'kmeans-*.pickle'))): + kmeans = joblib.load(model_file) + n_clusters = int(kmeans.n_clusters) + + # save the cluster centroids to table - columns are the cluster number, rows are the centroid FDC values + pd.DataFrame( + np.transpose(kmeans.cluster_centers_), + columns=np.array(range(n_clusters)).astype(str) + ).to_parquet(os.path.join(workdir, 'kmeans_outputs', f'kmeans-{n_clusters}-centers.parquet')) + + # save the silhouette score for each cluster + 
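The clustering workflow above trains one `KMeans` model per candidate cluster count and compares them with per-sample silhouette scores; a self-contained sketch of that comparison, using random data in place of the transformed flow duration curves, looks like this:

```python
import numpy as np
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_samples

# Random stand-in for the transformed FDC array (rows = river reaches)
rng = np.random.default_rng(0)
x = rng.normal(size=(500, 40))

mean_silhouette = {}
for n_clusters in range(2, 9):
    labels = KMeans(n_clusters=n_clusters).fit_predict(x)
    # Higher mean silhouette -> tighter, better separated clusters
    mean_silhouette[n_clusters] = silhouette_samples(x, labels).mean()

best_k = max(mean_silhouette, key=mean_silhouette.get)
print(f'highest mean silhouette score at k = {best_k}')
```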
silhouette_scores.append(silhouette_samples(x, kmeans.labels_).flatten()) + labels.append(kmeans.labels_.flatten()) + + # save the summary stats from this model summary['number'].append(n_clusters) summary['inertia'].append(kmeans.inertia_) - summary['pseudo_aic'].append(np.log(kmeans.inertia_ / n_feat) + ((n_feat + 2 * n_clusters) / (n_feat))) summary['n_iter'].append(kmeans.n_iter_) - - # save the inertia results as a csv - pd.DataFrame.from_dict(summary).to_csv(os.path.join(workdir, 'kmeans_outputs', f'clustering-summary-stats.csv')) + summary['silhouette'].append(np.mean(silhouette_scores[-1])) + + # save the summary results as a csv + pd.DataFrame.from_dict(summary) \ + .to_csv(os.path.join(workdir, 'kmeans_outputs', f'clustering-summary-stats.csv')) + # save the silhouette scores as a parquet + silhouette_scores = np.transpose(np.array(silhouette_scores)) + pd.DataFrame(silhouette_scores, columns=np.array(range(2, silhouette_scores.shape[1] + 2)).astype(str)) \ + .to_parquet(os.path.join(workdir, 'kmeans_outputs', 'kmeans-silhouette_scores.parquet')) + # save the labels as a parquet + labels = np.transpose(np.array(labels)) + pd.DataFrame(labels, columns=np.array(range(2, labels.shape[1] + 2)).astype(str)) \ + .to_parquet(os.path.join(workdir, 'kmeans_outputs', 'kmeans-labels.parquet')) # find the knee/elbow knee = kneed.KneeLocator(summary['number'], summary['inertia'], curve='convex', direction='decreasing').knee @@ -55,35 +102,52 @@ def generate(workdir: str) -> None: return -def plot(workdir: str) -> None: +def plot_clusters(workdir: str, clusters: int or Iterable = 'all', + max_cols: int = 3, plt_width: int = 3, plt_height: int = 3) -> None: """ Generate figures of the clustered FDC's Args: workdir: path to the project directory + clusters: number of clusters to create figures for + max_cols: maximum number of columns (subplots) in the figure + plt_width: width of each subplot in inches + plt_height: height of each subplot in inches Returns: None """ - # image generating params - img_width = 3 - img_height = 3 - max_cols = 3 - - # read the fdc's x = pd.read_parquet(os.path.join(workdir, 'tables', 'hindcast_fdc_transformed.parquet')).values size = x.shape[1] x_values = np.linspace(0, size, 5) x_ticks = np.linspace(0, 100, 5).astype(int) - for model_pickle in natsorted(glob.glob(os.path.join(workdir, 'kmeans_outputs', 'kmeans-*.pickle'))): - kmeans = joblib.load(model_pickle) + kmeans_dir = os.path.join(workdir, 'kmeans_outputs') + if clusters == 'all': + model_files = natsorted(glob.glob(os.path.join(kmeans_dir, 'kmeans-*.pickle'))) + elif isinstance(clusters, int): + model_files = glob.glob(os.path.join(kmeans_dir, f'kmeans-{clusters}.pickle')) + elif isinstance(clusters, Iterable): + model_files = natsorted([os.path.join(kmeans_dir, f'kmeans-{i}.pickle') for i in clusters]) + else: + raise TypeError('n_clusters should be of type int or an iterable') + + for model_file in model_files: + # todo read the cluster centers from the parquet file instead of the model pickle + kmeans = joblib.load(model_file) n_clusters = int(kmeans.n_clusters) n_cols = min(n_clusters, max_cols) n_rows = math.ceil(n_clusters / n_cols) - fig, axs = plt.subplots(n_rows, n_cols, figsize=(img_width * n_cols + 1, img_height * n_rows + 1), dpi=800, - squeeze=False, tight_layout=True, sharey=True) + fig, axs = plt.subplots( + n_rows, + n_cols, + figsize=(plt_width * n_cols + 1, plt_height * n_rows + 1), + dpi=500, + squeeze=False, + tight_layout=True, + sharey=True + ) fig.suptitle("KMeans FDC Clustering") 
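The best-fit cluster count above comes from `kneed.KneeLocator` applied to the inertia curve; a minimal example of that call with made-up inertia values:

```python
import kneed

# Hypothetical inertia values for k = 1..8: inertia always falls as k grows,
# and the knee is where the marginal improvement levels off
cluster_counts = list(range(1, 9))
inertias = [1000, 420, 230, 150, 120, 105, 95, 90]

knee = kneed.KneeLocator(cluster_counts, inertias, curve='convex', direction='decreasing').knee
print(f'suggested number of clusters: {knee}')
```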
fig.supxlabel('Exceedance Probability (%)') fig.supylabel('Discharge Z-Score') @@ -94,7 +158,7 @@ def plot(workdir: str) -> None: ax.set_xticks(x_values, x_ticks) ax.set_ylim(-2, 4) for j in x[kmeans.labels_ == i]: - ax.plot(j.ravel(), "k-", alpha=.15) + ax.plot(j.ravel(), "k-") ax.plot(kmeans.cluster_centers_[i].flatten(), "r-") # turn off plotting axes which are blank - made for the square grid but > n_clusters for ax in fig.axes[n_clusters:]: @@ -105,7 +169,50 @@ def plot(workdir: str) -> None: return -def summarize(workdir: str, assign_table: pd.DataFrame, n_clusters: int = None) -> pd.DataFrame: +def plot_silhouette(workdir: str, clusters: int or Iterable = 'all', + plt_width: int = 3, plt_height: int = 3) -> None: + """ + Plot the silhouette scores for each cluster. + Based on https://scikit-learn.org/stable/auto_examples/cluster/plot_kmeans_silhouette_analysis.html + + Args: + workdir: path to the project directory + clusters: number of clusters to create figures for + plt_width: width of each subplot in inches + plt_height: height of each subplot in inches + + Returns: + None + """ + kmeans_dir = os.path.join(workdir, 'kmeans_outputs') + if clusters == 'all': + sscore_files = natsorted(glob.glob(os.path.join(kmeans_dir, 'kmeans-*-silscores.parquet'))) + elif isinstance(clusters, int): + sscore_files = glob.glob(os.path.join(kmeans_dir, f'kmeans-{clusters}-silscores.parquet')) + elif isinstance(clusters, Iterable): + sscore_files = natsorted([os.path.join(kmeans_dir, f'kmeans-{i}-silscores.parquet') for i in clusters]) + else: + raise TypeError('n_clusters should be of type int or an iterable') + + for sscore_file in sscore_files: + # todo plot the silhouette scores + n_clusters = int(os.path.basename(sscore_file).split('-')[1]) + centers_df = pd.read_parquet(os.path.join(kmeans_dir, f'kmeans-{n_clusters}-centers.parquet')) + silscores_df = pd.read_parquet(sscore_file) + + for cluster_num in centers_df.columns: + # Create a subplot with 1 row and 2 columns + fig, (ax1, ax2) = plt.subplots( + 1, + 2, + figsize=(plt_width * 2 + 1, plt_height + 1), + dpi=500, + squeeze=False, + tight_layout=True, + ) + + +def merge_assign_table(workdir: str, assign_table: pd.DataFrame, n_clusters: int = None) -> pd.DataFrame: """ Creates a csv listing the streams assigned to each cluster in workdir/kmeans_models and also adds that information to assign_table.csv @@ -118,6 +225,7 @@ def summarize(workdir: str, assign_table: pd.DataFrame, n_clusters: int = None) Returns: None """ + # todo move to assign module if n_clusters is None: # read the cluster results csv with open(os.path.join(workdir, 'kmeans_outputs', cluster_count_file), 'r') as f: @@ -125,7 +233,7 @@ def summarize(workdir: str, assign_table: pd.DataFrame, n_clusters: int = None) # create a dataframe with the optimal model's labels and the model_id's df = pd.DataFrame({ - 'cluster': Clusterer.from_pickle( + 'cluster': joblib.load( os.path.join(workdir, 'kmeans_outputs', f'kmeans-{n_clusters}.pickle')).labels_.flatten(), mid_col: pd.read_parquet(os.path.join(workdir, 'tables', 'model_ids.parquet')).values.flatten() }, dtype=str) diff --git a/saber/gis.py b/saber/gis.py index 2328171..14d8cc3 100644 --- a/saber/gis.py +++ b/saber/gis.py @@ -1,24 +1,23 @@ import os import warnings +import contextily as cx import geopandas as gpd +import matplotlib as mpl +import matplotlib.pyplot as plt import numpy as np import pandas as pd -from ._vocab import mid_col from ._vocab import gid_col -from ._vocab import reason_col from ._vocab import 
metric_nc_name_list - -import matplotlib as mpl -import matplotlib.pyplot as plt -import contextily as cx +from ._vocab import mid_col +from ._vocab import reason_col __all__ = ['generate_all', 'clip_by_assignment', 'clip_by_cluster', 'clip_by_unassigned', 'clip_by_ids', 'validation_maps'] -def generate_all(workdir: str, assign_table: pd.DataFrame, drain_shape: str, prefix: str = '') -> None: +def generate_all(workdir: str, assign_table: pd.DataFrame, drain_gis: str, prefix: str = '') -> None: """ Runs all the clip functions which create subsets of the drainage lines GIS dataset based on how they were assigned for bias correction. @@ -26,33 +25,33 @@ def generate_all(workdir: str, assign_table: pd.DataFrame, drain_shape: str, pre Args: workdir: the path to the working directory for the project assign_table: the assign_table dataframe - drain_shape: path to a drainage line shapefile which can be clipped + drain_gis: path to a drainage line shapefile which can be clipped prefix: a prefix for names of the outputs to distinguish between data generated in separate instances Returns: None """ - clip_by_assignment(workdir, assign_table, drain_shape, prefix) - clip_by_cluster(workdir, assign_table, drain_shape, prefix) - clip_by_unassigned(workdir, assign_table, drain_shape, prefix) + clip_by_assignment(workdir, assign_table, drain_gis, prefix) + clip_by_cluster(workdir, assign_table, drain_gis, prefix) + clip_by_unassigned(workdir, assign_table, drain_gis, prefix) return -def clip_by_assignment(workdir: str, assign_table: pd.DataFrame, drain_shape: str, prefix: str = '') -> None: +def clip_by_assignment(workdir: str, assign_table: pd.DataFrame, drain_gis: str, prefix: str = '') -> None: """ Creates Geopackage files in workdir/gis_outputs for each unique value in the assignment column Args: workdir: the path to the working directory for the project assign_table: the assign_table dataframe - drain_shape: path to a drainage line shapefile which can be clipped + drain_gis: path to a drainage line shapefile which can be clipped prefix: a prefix for names of the outputs to distinguish between data generated at separate instances Returns: None """ # read the drainage line shapefile - dl = gpd.read_file(drain_shape) + dl = gpd.read_file(drain_gis) save_dir = os.path.join(workdir, 'gis_outputs') # get the unique list of assignment reasons @@ -67,20 +66,20 @@ def clip_by_assignment(workdir: str, assign_table: pd.DataFrame, drain_shape: st return -def clip_by_cluster(workdir: str, assign_table: pd.DataFrame, drain_shape: str, prefix: str = '') -> None: +def clip_by_cluster(workdir: str, assign_table: pd.DataFrame, drain_gis: str, prefix: str = '') -> None: """ Creates Geopackage files in workdir/gis_outputs of the drainage lines based on the fdc cluster they were assigned to Args: workdir: the path to the working directory for the project assign_table: the assign_table dataframe - drain_shape: path to a drainage line shapefile which can be clipped + drain_gis: path to a drainage line shapefile which can be clipped prefix: optional, a prefix to prepend to each created file's name Returns: None """ - dl_gdf = gpd.read_file(drain_shape) + dl_gdf = gpd.read_file(drain_gis) for num in sorted(set(assign_table[mid_col].dropna().values)): gdf = dl_gdf[dl_gdf[mid_col].isin(assign_table[assign_table[mid_col] == num][mid_col])] if gdf.empty: @@ -89,20 +88,20 @@ def clip_by_cluster(workdir: str, assign_table: pd.DataFrame, drain_shape: str, return -def clip_by_unassigned(workdir: str, assign_table: pd.DataFrame, 
drain_shape: str, prefix: str = '') -> None: +def clip_by_unassigned(workdir: str, assign_table: pd.DataFrame, drain_gis: str, prefix: str = '') -> None: """ Creates Geopackage files in workdir/gis_outputs of the drainage lines which haven't been assigned a gauge yet Args: workdir: the path to the working directory for the project assign_table: the assign_table dataframe - drain_shape: path to a drainage line shapefile which can be clipped + drain_gis: path to a drainage line shapefile which can be clipped prefix: optional, a prefix to prepend to each created file's name Returns: None """ - dl_gdf = gpd.read_file(drain_shape) + dl_gdf = gpd.read_file(drain_gis) ids = assign_table[assign_table[reason_col].isna()][mid_col].values subset = dl_gdf[dl_gdf[mid_col].isin(ids)] if subset.empty: @@ -113,29 +112,28 @@ def clip_by_unassigned(workdir: str, assign_table: pd.DataFrame, drain_shape: st return -def clip_by_ids(workdir: str, ids: list, drain_shape: str, prefix: str = '', - id_column: str = mid_col) -> None: +def clip_by_ids(workdir: str, ids: list, drain_gis: str, prefix: str = '', id_column: str = mid_col) -> None: """ Creates Geopackage files in workdir/gis_outputs of the subset of 'drain_shape' with an ID in the specified list Args: workdir: path to the project directory ids: any iterable containing a series of model_ids - drain_shape: path to the drainage shapefile to be clipped + drain_gis: path to the drainage shapefile to be clipped prefix: optional, a prefix to prepend to each created file's name id_column: name of the id column in the attributes of the shape table Returns: None """ - dl = gpd.read_file(drain_shape) + dl = gpd.read_file(drain_gis) save_dir = os.path.join(workdir, 'gis_outputs') name = f'{prefix}{"_" if prefix else ""}id_subset.gpkg' dl[dl[id_column].isin(ids)].to_file(os.path.join(save_dir, name)) return -def validation_maps(workdir: str, gauge_shape: str, val_table: pd.DataFrame = None, prefix: str = '') -> None: +def validation_maps(workdir: str, gauge_gis: str, val_table: pd.DataFrame = None, prefix: str = '') -> None: """ Creates Geopackage files in workdir/gis_outputs of subsets of the gauge_shape. 1 is the fill gauge shape with added attribute columns for all the computed stats. 
There are 2 for each of the 5 @@ -145,7 +143,7 @@ def validation_maps(workdir: str, gauge_shape: str, val_table: pd.DataFrame = No Args: workdir: path to the project directory val_table: the validation table produced by saber.validate - gauge_shape: path to the gauge locations shapefile + gauge_gis: path to the gauge locations shapefile prefix: optional, a prefix to prepend to each created file's name Returns: @@ -156,7 +154,7 @@ def validation_maps(workdir: str, gauge_shape: str, val_table: pd.DataFrame = No save_dir = os.path.join(workdir, 'gis_outputs') # merge gauge table with the validation table - gdf = gpd.read_file(gauge_shape) + gdf = gpd.read_file(gauge_gis) gdf = gdf.merge(val_table, on=gid_col, how='inner') gdf.to_file(os.path.join(save_dir, 'gauges_with_validation_stats.gpkg')) @@ -230,8 +228,7 @@ def histomaps(gdf: gpd.GeoDataFrame, metric: str, prct: str, workdir: str) -> No axm.set_xlabel('Longitude') axm.set_xticks([]) axm.set_yticks([]) - gdf[core_columns + [metric, ]].to_crs(epsg=3857).plot( - metric, ax=axm, cmap=cmap, norm=norm, legend=True, markersize=10) + gdf[core_columns + [metric, ]].to_crs(epsg=3857).plot(metric) cx.add_basemap(ax=axm, zoom=9, source=cx.providers.Esri.WorldTopoMap, attribution='') fig.savefig(os.path.join(workdir, 'gis_outputs', f'{metric}_{prct}.png')) diff --git a/saber/prep.py b/saber/prep.py index a40f332..c0af844 100755 --- a/saber/prep.py +++ b/saber/prep.py @@ -1,15 +1,15 @@ import os -import pandas as pd import geopandas as gpd import netCDF4 as nc import numpy as np +import pandas as pd from sklearn.preprocessing import StandardScaler as Scalar +from ._vocab import get_table_path +from ._vocab import guess_hindcast_path from ._vocab import mid_col from ._vocab import read_drain_table -from ._vocab import guess_hindcast_path -from ._vocab import get_table_path __all__ = ['gis_tables', 'hindcast', 'workdir'] @@ -27,10 +27,16 @@ def gis_tables(workdir: str, gauge_gis: str = None, drain_gis: str = None) -> No None """ if gauge_gis is not None: - pd.DataFrame(gpd.read_file(gauge_gis).drop('geometry', axis=1)).to_parquet( - os.path.join(workdir, 'tables', 'gauge_table.parquet')) + if drain_gis.endswith('.parquet'): + gdf = gpd.read_parquet(gauge_gis) + else: + gdf = gpd.read_file(gauge_gis) + pd.DataFrame(gdf.drop('geometry', axis=1)).to_parquet(os.path.join(workdir, 'tables', 'gauge_table.parquet')) if drain_gis is not None: - gdf = gpd.read_file(drain_gis) + if drain_gis.endswith('.parquet'): + gdf = gpd.read_parquet(drain_gis) + else: + gdf = gpd.read_file(drain_gis) gdf['centroid_x'] = gdf.geometry.centroid.x gdf['centroid_y'] = gdf.geometry.centroid.y gdf = gdf.drop('geometry', axis=1) diff --git a/saber/table.py b/saber/table.py index f1aafe4..a1eaae5 100644 --- a/saber/table.py +++ b/saber/table.py @@ -3,9 +3,9 @@ import numpy as np import pandas as pd -from ._vocab import mid_col -from ._vocab import asgn_mid_col from ._vocab import asgn_gid_col +from ._vocab import asgn_mid_col +from ._vocab import mid_col from ._vocab import reason_col __all__ = ['get_path', 'read', 'cache', 'gen'] diff --git a/saber/validate.py b/saber/validate.py index c27198f..1de3c9b 100644 --- a/saber/validate.py +++ b/saber/validate.py @@ -2,15 +2,14 @@ import os import shutil -import pandas as pd import netCDF4 as nc import numpy as np +import pandas as pd -from .prep import workdir -from ._vocab import mid_col +from ._vocab import cal_nc_name from ._vocab import gid_col from ._vocab import metric_nc_name_list -from ._vocab import cal_nc_name +from ._vocab 
import mid_col def sample_gauges(workdir: str, overwrite: bool = False) -> None: From b03b98a0c92cb72aa7175a0d79665cb044857136 Mon Sep 17 00:00:00 2001 From: rileyhales Date: Thu, 25 Aug 2022 21:24:33 -0600 Subject: [PATCH 15/24] add silhouette plots --- saber/cluster.py | 94 +++++++++++++++++++++++++++++++----------------- 1 file changed, 62 insertions(+), 32 deletions(-) diff --git a/saber/cluster.py b/saber/cluster.py index b3c4f96..400f9ae 100755 --- a/saber/cluster.py +++ b/saber/cluster.py @@ -7,10 +7,11 @@ import joblib import kneed import matplotlib.pyplot as plt +import matplotlib.cm as cm import numpy as np import pandas as pd from natsort import natsorted -from sklearn.cluster import KMeans as Clusterer +from sklearn.cluster import KMeans from sklearn.metrics import silhouette_samples from ._vocab import cluster_count_file @@ -37,7 +38,7 @@ def generate(workdir: str, max_clusters: int = 12) -> None: # build the kmeans model for a range of cluster numbers for n_clusters in range(2, max_clusters + 1): print(n_clusters) - kmeans = Clusterer(n_clusters=n_clusters) + kmeans = KMeans(n_clusters=n_clusters) kmeans.fit_predict(x) joblib.dump(kmeans, os.path.join(workdir, 'kmeans_outputs', f'kmeans-{n_clusters}.pickle')) return @@ -69,7 +70,7 @@ def summarize(workdir: str) -> None: pd.DataFrame( np.transpose(kmeans.cluster_centers_), columns=np.array(range(n_clusters)).astype(str) - ).to_parquet(os.path.join(workdir, 'kmeans_outputs', f'kmeans-{n_clusters}-centers.parquet')) + ).to_parquet(os.path.join(workdir, 'kmeans_outputs', f'kmeans-centers-{n_clusters}.parquet')) # save the silhouette score for each cluster silhouette_scores.append(silhouette_samples(x, kmeans.labels_).flatten()) @@ -153,7 +154,7 @@ def plot_clusters(workdir: str, clusters: int or Iterable = 'all', fig.supylabel('Discharge Z-Score') for i, ax in enumerate(fig.axes[:n_clusters]): - ax.set_title(f'Cluster {i + 1}') + ax.set_title(f'Cluster {i + 1} (n = {np.sum(kmeans.labels_ == i)})') ax.set_xlim(0, size) ax.set_xticks(x_values, x_ticks) ax.set_ylim(-2, 4) @@ -164,52 +165,81 @@ def plot_clusters(workdir: str, clusters: int or Iterable = 'all', for ax in fig.axes[n_clusters:]: ax.axis('off') - fig.savefig(os.path.join(workdir, 'kmeans_outputs', f'kmeans-{n_clusters}.png')) + fig.savefig(os.path.join(workdir, 'kmeans_outputs', f'kmeans-clusters-{n_clusters}.png')) plt.close(fig) return -def plot_silhouette(workdir: str, clusters: int or Iterable = 'all', - plt_width: int = 3, plt_height: int = 3) -> None: +def plot_silhouette(workdir: str, plt_width: int = 3, plt_height: int = 3) -> None: """ Plot the silhouette scores for each cluster. 
Based on https://scikit-learn.org/stable/auto_examples/cluster/plot_kmeans_silhouette_analysis.html Args: workdir: path to the project directory - clusters: number of clusters to create figures for plt_width: width of each subplot in inches plt_height: height of each subplot in inches Returns: None """ - kmeans_dir = os.path.join(workdir, 'kmeans_outputs') - if clusters == 'all': - sscore_files = natsorted(glob.glob(os.path.join(kmeans_dir, 'kmeans-*-silscores.parquet'))) - elif isinstance(clusters, int): - sscore_files = glob.glob(os.path.join(kmeans_dir, f'kmeans-{clusters}-silscores.parquet')) - elif isinstance(clusters, Iterable): - sscore_files = natsorted([os.path.join(kmeans_dir, f'kmeans-{i}-silscores.parquet') for i in clusters]) - else: - raise TypeError('n_clusters should be of type int or an iterable') + labels_df = pd.read_parquet(os.path.join(workdir, 'kmeans_outputs', 'kmeans-labels.parquet')) + silhouette_df = pd.read_parquet(os.path.join(workdir, 'kmeans_outputs', 'kmeans-silhouette_scores.parquet')) + + for tot_clusters in silhouette_df.columns: + centers_df = pd.read_parquet(os.path.join(workdir, 'kmeans_outputs', f'kmeans-{tot_clusters}-centers.parquet')) + # Create a subplot with 1 row and 2 columns + fig, (ax1, ax2) = plt.subplots( + nrows=1, + ncols=2, + figsize=(plt_width * 2 + 1, plt_height + 1), + dpi=500, + tight_layout=True, + ) - for sscore_file in sscore_files: - # todo plot the silhouette scores - n_clusters = int(os.path.basename(sscore_file).split('-')[1]) - centers_df = pd.read_parquet(os.path.join(kmeans_dir, f'kmeans-{n_clusters}-centers.parquet')) - silscores_df = pd.read_parquet(sscore_file) - - for cluster_num in centers_df.columns: - # Create a subplot with 1 row and 2 columns - fig, (ax1, ax2) = plt.subplots( - 1, - 2, - figsize=(plt_width * 2 + 1, plt_height + 1), - dpi=500, - squeeze=False, - tight_layout=True, + ax1.set_title("Silhouette Plot") + ax1.set_xlabel("Silhouette Score") + ax1.set_ylabel("Cluster Label") + ax1.set_yticks([]) # Clear the yaxis labels / ticks + # ax1.set_xticks([-0.1, 0, 0.2, 0.4, 0.6, 0.8, 1]) + + ax2.set_title("Cluster Centers") + ax2.set_xlabel("Exceedance Probability (%)") + ax2.set_ylabel("Discharge Z-Score") + + # The vertical line for average silhouette score of all the values + ax1.axvline(x=silhouette_df[tot_clusters].mean(), color="red", linestyle="--") + + y_lower = 10 + for sub_cluster in range(int(tot_clusters)): + # select the rows applicable to the current sub cluster + cluster_silhouettes = silhouette_df[labels_df[tot_clusters] == sub_cluster][tot_clusters].values.flatten() + cluster_silhouettes.sort() + + n = cluster_silhouettes.shape[0] + y_upper = y_lower + n + + color = cm.nipy_spectral(sub_cluster / int(tot_clusters)) + ax1.fill_betweenx( + np.arange(y_lower, y_upper), + 0, + cluster_silhouettes, + facecolor=color, + edgecolor=color, + alpha=0.7, ) + # Label the silhouette plots with their cluster numbers at the middle + ax1.text(-0.05, y_lower + 0.5 * n, str(sub_cluster + 1)) + + # plot the cluster center + ax2.plot(centers_df[f'{sub_cluster}'].values, alpha=0.7, c=color, label=f'Cluster {sub_cluster + 1}') + + # add some buffer before the next cluster + y_lower = y_upper + 10 + + fig.savefig(os.path.join(workdir, 'kmeans_outputs', f'kmeans-silhouettes-{tot_clusters}.png')) + plt.close(fig) + return def merge_assign_table(workdir: str, assign_table: pd.DataFrame, n_clusters: int = None) -> pd.DataFrame: From 0fffc2c8ed994a9a203ea459bc4e6d011ac5c2ee Mon Sep 17 00:00:00 2001 From: rileyhales Date: Fri, 
26 Aug 2022 00:05:52 -0600 Subject: [PATCH 16/24] merge assign and table modules --- examples/saber_example.py | 8 +-- saber/__init__.py | 15 ++--- saber/assign.py | 99 -------------------------------- saber/cluster.py | 40 ++----------- saber/table.py | 118 +++++++++++++++++++++++++++++++++++++- 5 files changed, 132 insertions(+), 148 deletions(-) delete mode 100644 saber/assign.py diff --git a/examples/saber_example.py b/examples/saber_example.py index 44c57a3..843f349 100644 --- a/examples/saber_example.py +++ b/examples/saber_example.py @@ -25,13 +25,13 @@ print('Generate Clusters') saber.cluster.generate(workdir) saber.cluster.plot(workdir) -saber.cluster.summarize(workdir, assign_table) # Assign basins which are gauged and propagate those gauges print('Making Assignments') -assign_table = saber.assign.gauged(assign_table) -assign_table = saber.assign.propagation(assign_table) -assign_table = saber.assign.clusters_by_dist(assign_table) +assign_table = saber.table.merge_clusters(workdir, assign_table, n_clusters=5) +assign_table = saber.table.assign_gauged(assign_table) +assign_table = saber.table.assign_propagation(assign_table) +assign_table = saber.table.assign_by_distance(assign_table) # Cache the assignments table with the updates saber.table.cache(workdir, assign_table) diff --git a/saber/__init__.py b/saber/__init__.py index d6a5a09..d05d5dc 100644 --- a/saber/__init__.py +++ b/saber/__init__.py @@ -1,19 +1,14 @@ -from saber._calibrate import calibrate, calibrate_region - -import saber.table -import saber.prep +import saber.analysis import saber.cluster -import saber.assign import saber.gis +import saber.prep +import saber.table import saber.utils import saber.validate -import saber.analysis - -import saber._vocab - +from saber._calibrate import calibrate, calibrate_region __all__ = ['calibrate', 'calibrate_region', - 'table', 'prep', 'assign', 'gis', 'cluster', 'utils', 'validate', 'analysis'] + 'table', 'prep', 'gis', 'cluster', 'utils', 'validate', 'analysis'] __author__ = 'Riley Hales' __version__ = '0.4.0' __license__ = 'BSD 3 Clause Clear' diff --git a/saber/assign.py b/saber/assign.py deleted file mode 100644 index 9205793..0000000 --- a/saber/assign.py +++ /dev/null @@ -1,99 +0,0 @@ -import numpy as np -import pandas as pd - -from ._propagation import propagate_in_table -from ._propagation import walk_downstream -from ._propagation import walk_upstream -from ._vocab import asgn_gid_col -from ._vocab import asgn_mid_col -from ._vocab import gid_col -from ._vocab import mid_col -from ._vocab import order_col -from ._vocab import reason_col - - -def gauged(df: pd.DataFrame) -> pd.DataFrame: - """ - Assigns basins a gauge for correction which contain a gauge - - Args: - df: the assignments table dataframe - - Returns: - Copy of df with assignments made - """ - _df = df.copy() - selector = ~_df[gid_col].isna() - _df.loc[selector, asgn_mid_col] = _df[mid_col] - _df.loc[selector, asgn_gid_col] = _df[gid_col] - _df.loc[selector, reason_col] = 'gauged' - return _df - - -def propagation(df: pd.DataFrame, max_prop: int = 5) -> pd.DataFrame: - """ - - Args: - df: the assignments table dataframe - max_prop: the max number of stream segments to propagate downstream - - Returns: - Copy of df with assignments made - """ - _df = df.copy() - for gauged_stream in _df.loc[~_df[gid_col].isna(), mid_col]: - subset = _df.loc[_df[mid_col] == gauged_stream, gid_col] - if subset.empty: - continue - start_gid = subset.values[0] - connected_segments = walk_upstream(df, gauged_stream, 
same_order=True) - _df = propagate_in_table(_df, gauged_stream, start_gid, connected_segments, max_prop, 'upstream') - connected_segments = walk_downstream(df, gauged_stream, same_order=True) - _df = propagate_in_table(_df, gauged_stream, start_gid, connected_segments, max_prop, 'downstream') - - return _df - - -def clusters_by_dist(df: pd.DataFrame) -> pd.DataFrame: - """ - Assigns all possible ungauged basins a gauge that is - (1) is closer than any other gauge - (2) is of same stream order as ungauged basin - (3) in the same simulated fdc cluster as ungauged basin - - Args: - df: the assignments table dataframe - - Returns: - Copy of df with assignments made - """ - _df = df.copy() - # first filter by cluster number - for c_num in sorted(set(_df['sim-fdc-cluster'].values)): - c_sub = _df[_df['sim-fdc-cluster'] == c_num] - # next filter by stream order - for so_num in sorted(set(c_sub[order_col])): - c_so_sub = c_sub[c_sub[order_col] == so_num] - - # determine which ids **need** to be assigned - ids_to_assign = c_so_sub[c_so_sub[asgn_mid_col].isna()][mid_col].values - avail_assigns = c_so_sub[c_so_sub[asgn_mid_col].notna()] - if ids_to_assign.size == 0 or avail_assigns.empty: - print(f'unable to assign cluster {c_num} to stream order {so_num}') - continue - # now you find the closest gauge to each unassigned - for id in ids_to_assign: - subset = c_so_sub.loc[c_so_sub[mid_col] == id, ['x', 'y']] - - dx = avail_assigns.x.values - subset.x.values - dy = avail_assigns.y.values - subset.y.values - avail_assigns['dist'] = np.sqrt(dx * dx + dy * dy) - row_idx_to_assign = avail_assigns['dist'].idxmin() - - mid_to_assign = avail_assigns.loc[row_idx_to_assign].assigned_model_id - gid_to_assign = avail_assigns.loc[row_idx_to_assign].assigned_gauge_id - - _df.loc[_df[mid_col] == id, [asgn_mid_col, asgn_gid_col, reason_col]] = \ - [mid_to_assign, gid_to_assign, f'cluster-{c_num}-dist'] - - return _df diff --git a/saber/cluster.py b/saber/cluster.py index 400f9ae..fb002af 100755 --- a/saber/cluster.py +++ b/saber/cluster.py @@ -65,6 +65,7 @@ def summarize(workdir: str) -> None: for model_file in natsorted(glob.glob(os.path.join(workdir, 'kmeans_outputs', 'kmeans-*.pickle'))): kmeans = joblib.load(model_file) n_clusters = int(kmeans.n_clusters) + print(n_clusters) # save the cluster centroids to table - columns are the cluster number, rows are the centroid FDC values pd.DataFrame( @@ -161,7 +162,7 @@ def plot_clusters(workdir: str, clusters: int or Iterable = 'all', for j in x[kmeans.labels_ == i]: ax.plot(j.ravel(), "k-") ax.plot(kmeans.cluster_centers_[i].flatten(), "r-") - # turn off plotting axes which are blank - made for the square grid but > n_clusters + # turn off plotting axes which are blank (when ax number > n_clusters) for ax in fig.axes[n_clusters:]: ax.axis('off') @@ -187,8 +188,7 @@ def plot_silhouette(workdir: str, plt_width: int = 3, plt_height: int = 3) -> No silhouette_df = pd.read_parquet(os.path.join(workdir, 'kmeans_outputs', 'kmeans-silhouette_scores.parquet')) for tot_clusters in silhouette_df.columns: - centers_df = pd.read_parquet(os.path.join(workdir, 'kmeans_outputs', f'kmeans-{tot_clusters}-centers.parquet')) - # Create a subplot with 1 row and 2 columns + centers_df = pd.read_parquet(os.path.join(workdir, 'kmeans_outputs', f'kmeans-centers-{tot_clusters}.parquet')) fig, (ax1, ax2) = plt.subplots( nrows=1, ncols=2, @@ -197,12 +197,14 @@ def plot_silhouette(workdir: str, plt_width: int = 3, plt_height: int = 3) -> No tight_layout=True, ) + # Plot 1 titles and labels 
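The distance-based assignment above chooses, for each ungauged reach, the nearest gauged reach of the same stream order and cluster using the precomputed centroid coordinates; the core lookup reduces to a Euclidean nearest-neighbour search, sketched here with made-up values and the assignment-table column names:

```python
import numpy as np
import pandas as pd

# Hypothetical gauged reaches already filtered to one cluster and stream order
gauged = pd.DataFrame({
    'assigned_model_id': ['101', '102'],
    'assigned_gauge_id': ['g-101', 'g-102'],
    'x': [0.0, 10.0],
    'y': [0.0, 0.0],
})

# Centroid of the ungauged reach to be assigned
ungauged_x, ungauged_y = 7.0, 1.0

# Euclidean distance in the (projected) centroid coordinates
dx = gauged['x'].values - ungauged_x
dy = gauged['y'].values - ungauged_y
nearest = gauged.iloc[int(np.argmin(np.sqrt(dx * dx + dy * dy)))]
print(nearest['assigned_model_id'], nearest['assigned_gauge_id'])  # closest gauge wins: 102 g-102
```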
ax1.set_title("Silhouette Plot") ax1.set_xlabel("Silhouette Score") ax1.set_ylabel("Cluster Label") ax1.set_yticks([]) # Clear the yaxis labels / ticks - # ax1.set_xticks([-0.1, 0, 0.2, 0.4, 0.6, 0.8, 1]) + ax1.set_xticks([-0.2, 0, 0.2, 0.4, 0.6, 0.8, 1]) + # Plot 2 titles and labels ax2.set_title("Cluster Centers") ax2.set_xlabel("Exceedance Probability (%)") ax2.set_ylabel("Discharge Z-Score") @@ -240,33 +242,3 @@ def plot_silhouette(workdir: str, plt_width: int = 3, plt_height: int = 3) -> No fig.savefig(os.path.join(workdir, 'kmeans_outputs', f'kmeans-silhouettes-{tot_clusters}.png')) plt.close(fig) return - - -def merge_assign_table(workdir: str, assign_table: pd.DataFrame, n_clusters: int = None) -> pd.DataFrame: - """ - Creates a csv listing the streams assigned to each cluster in workdir/kmeans_models and also adds that information - to assign_table.csv - - Args: - workdir: path to the project directory - assign_table: the assignment table DataFrame - n_clusters: number of clusters to use when applying the labels to the assign_table - - Returns: - None - """ - # todo move to assign module - if n_clusters is None: - # read the cluster results csv - with open(os.path.join(workdir, 'kmeans_outputs', cluster_count_file), 'r') as f: - n_clusters = int(json.loads(f.read())['historical']) - - # create a dataframe with the optimal model's labels and the model_id's - df = pd.DataFrame({ - 'cluster': joblib.load( - os.path.join(workdir, 'kmeans_outputs', f'kmeans-{n_clusters}.pickle')).labels_.flatten(), - mid_col: pd.read_parquet(os.path.join(workdir, 'tables', 'model_ids.parquet')).values.flatten() - }, dtype=str) - - # merge the dataframes - return assign_table.merge(df, how='outer', on=mid_col) diff --git a/saber/table.py b/saber/table.py index a1eaae5..9edaa33 100644 --- a/saber/table.py +++ b/saber/table.py @@ -1,14 +1,21 @@ import os +import joblib import numpy as np import pandas as pd +from ._propagation import propagate_in_table +from ._propagation import walk_downstream +from ._propagation import walk_upstream from ._vocab import asgn_gid_col from ._vocab import asgn_mid_col +from ._vocab import gid_col from ._vocab import mid_col +from ._vocab import order_col from ._vocab import reason_col -__all__ = ['get_path', 'read', 'cache', 'gen'] +__all__ = ['get_path', 'gen', 'read', 'cache', 'merge_clusters', 'assign_gauged', 'assign_propagation', + 'assign_by_distance', ] def get_path(workdir: str) -> str: @@ -78,3 +85,112 @@ def cache(workdir: str, assign_table: pd.DataFrame) -> None: assign_table.to_parquet(get_path(workdir)) return + +def merge_clusters(workdir: str, assign_table: pd.DataFrame, n_clusters: int = None) -> pd.DataFrame: + """ + Creates a csv listing the streams assigned to each cluster in workdir/kmeans_models and also adds that information + to assign_table.csv + + Args: + workdir: path to the project directory + assign_table: the assignment table DataFrame + n_clusters: number of clusters to use when applying the labels to the assign_table + + Returns: + None + """ + # create a dataframe with the optimal model's labels and the model_id's + df = pd.DataFrame({ + 'cluster': joblib.load(os.path.join(workdir, 'kmeans_outputs', f'kmeans-{n_clusters}.pickle')).labels_, + mid_col: pd.read_parquet(os.path.join(workdir, 'tables', 'model_ids.parquet')).values.flatten() + }, dtype=str) + + # merge the dataframes + return assign_table.merge(df, how='outer', on=mid_col) + + +def assign_gauged(df: pd.DataFrame) -> pd.DataFrame: + """ + Assigns basins a gauge for correction which 
contain a gauge + + Args: + df: the assignments table dataframe + + Returns: + Copy of df with assignments made + """ + _df = df.copy() + selector = ~_df[gid_col].isna() + _df.loc[selector, asgn_mid_col] = _df[mid_col] + _df.loc[selector, asgn_gid_col] = _df[gid_col] + _df.loc[selector, reason_col] = 'gauged' + return _df + + +def assign_propagation(df: pd.DataFrame, max_prop: int = 5) -> pd.DataFrame: + """ + + Args: + df: the assignments table dataframe + max_prop: the max number of stream segments to propagate downstream + + Returns: + Copy of df with assignments made + """ + _df = df.copy() + for gauged_stream in _df.loc[~_df[gid_col].isna(), mid_col]: + subset = _df.loc[_df[mid_col] == gauged_stream, gid_col] + if subset.empty: + continue + start_gid = subset.values[0] + connected_segments = walk_upstream(df, gauged_stream, same_order=True) + _df = propagate_in_table(_df, gauged_stream, start_gid, connected_segments, max_prop, 'upstream') + connected_segments = walk_downstream(df, gauged_stream, same_order=True) + _df = propagate_in_table(_df, gauged_stream, start_gid, connected_segments, max_prop, 'downstream') + + return _df + + +def assign_by_distance(df: pd.DataFrame) -> pd.DataFrame: + """ + Assigns all possible ungauged basins a gauge that is + (1) is closer than any other gauge + (2) is of same stream order as ungauged basin + (3) in the same simulated fdc cluster as ungauged basin + + Args: + df: the assignments table dataframe + + Returns: + Copy of df with assignments made + """ + _df = df.copy() + # first filter by cluster number + for c_num in sorted(set(_df['sim-fdc-cluster'].values)): + c_sub = _df[_df['sim-fdc-cluster'] == c_num] + # next filter by stream order + for so_num in sorted(set(c_sub[order_col])): + c_so_sub = c_sub[c_sub[order_col] == so_num] + + # determine which ids **need** to be assigned + ids_to_assign = c_so_sub[c_so_sub[asgn_mid_col].isna()][mid_col].values + avail_assigns = c_so_sub[c_so_sub[asgn_mid_col].notna()] + if ids_to_assign.size == 0 or avail_assigns.empty: + print(f'unable to assign cluster {c_num} to stream order {so_num}') + continue + # now you find the closest gauge to each unassigned + for id in ids_to_assign: + subset = c_so_sub.loc[c_so_sub[mid_col] == id, ['x', 'y']] + + dx = avail_assigns.x.values - subset.x.values + dy = avail_assigns.y.values - subset.y.values + avail_assigns['dist'] = np.sqrt(dx * dx + dy * dy) + row_idx_to_assign = avail_assigns['dist'].idxmin() + + mid_to_assign = avail_assigns.loc[row_idx_to_assign].assigned_model_id + gid_to_assign = avail_assigns.loc[row_idx_to_assign].assigned_gauge_id + + _df.loc[_df[mid_col] == id, [asgn_mid_col, asgn_gid_col, reason_col]] = \ + [mid_to_assign, gid_to_assign, f'cluster-{c_num}-dist'] + + return _df From c57a5fdb6d2ef314ccad24ec5b26efb169a3ddf2 Mon Sep 17 00:00:00 2001 From: rileyhales Date: Fri, 26 Aug 2022 00:09:40 -0600 Subject: [PATCH 17/24] add joblib to dependency lists --- conda-environment.yml | 1 + requirements.txt | 3 ++- saber/cluster.py | 5 ++--- 3 files changed, 5 insertions(+), 4 deletions(-) diff --git a/conda-environment.yml b/conda-environment.yml index 5111979..725c053 100644 --- a/conda-environment.yml +++ b/conda-environment.yml @@ -17,3 +17,4 @@ dependencies: - scipy - natsort - scikit-learn + - joblib diff --git a/requirements.txt b/requirements.txt index 7d43255..67c3de9 100644 --- a/requirements.txt +++ b/requirements.txt @@ -11,4 +11,5 @@ matplotlib requests scipy natsort -scikit-learn \ No newline at end of file +scikit-learn +joblib \ No newline 
at end of file diff --git a/saber/cluster.py b/saber/cluster.py index fb002af..e93a1c6 100755 --- a/saber/cluster.py +++ b/saber/cluster.py @@ -6,8 +6,8 @@ import joblib import kneed -import matplotlib.pyplot as plt import matplotlib.cm as cm +import matplotlib.pyplot as plt import numpy as np import pandas as pd from natsort import natsorted @@ -15,9 +15,8 @@ from sklearn.metrics import silhouette_samples from ._vocab import cluster_count_file -from ._vocab import mid_col -__all__ = ['generate', 'summarize', 'plot_clusters', 'plot_silhouette', 'merge_assign_table'] +__all__ = ['generate', 'summarize', 'plot_clusters', 'plot_silhouette'] def generate(workdir: str, max_clusters: int = 12) -> None: From 5198270e1acb4a73e014bea728b9c834456456c2 Mon Sep 17 00:00:00 2001 From: rileyhales Date: Fri, 26 Aug 2022 00:11:03 -0600 Subject: [PATCH 18/24] drop analysis module with outdated plot function --- saber/__init__.py | 3 +- saber/analysis.py | 72 ----------------------------------------------- 2 files changed, 1 insertion(+), 74 deletions(-) delete mode 100644 saber/analysis.py diff --git a/saber/__init__.py b/saber/__init__.py index d05d5dc..1b0ea72 100644 --- a/saber/__init__.py +++ b/saber/__init__.py @@ -1,4 +1,3 @@ -import saber.analysis import saber.cluster import saber.gis import saber.prep @@ -8,7 +7,7 @@ from saber._calibrate import calibrate, calibrate_region __all__ = ['calibrate', 'calibrate_region', - 'table', 'prep', 'gis', 'cluster', 'utils', 'validate', 'analysis'] + 'table', 'prep', 'gis', 'cluster', 'utils', 'validate'] __author__ = 'Riley Hales' __version__ = '0.4.0' __license__ = 'BSD 3 Clause Clear' diff --git a/saber/analysis.py b/saber/analysis.py deleted file mode 100644 index 582fa3d..0000000 --- a/saber/analysis.py +++ /dev/null @@ -1,72 +0,0 @@ -import os - -import pandas as pd -import plotly.graph_objects as go - - -def plot(workdir, obs_data_dir, model_id) -> go.Figure: - files = [os.path.join(workdir, 'calibrated_simulated_flow.nc'), ] - variables = ('flow_sim', 'flow_bc') - dim_order = ('time', 'model_id') - a = grids.TimeSeries(files, variables, dim_order) - ts = a.point(None, model_id) - obs = pd.read_csv(os.path.join(obs_data_dir, )) - a = go.Figure( - [ - go.Scatter( - name='Original Simulation', - x=ts.index, - y=ts['flow_sim'] - ), - go.Scatter( - name='Adjusted Simulation', - x=ts.index, - y=ts['flow_bc'] - ), - go.Scatter( - name='Adjusted Simulation', - x=ts.index, - y=ts['flow_bc'] - ) - ] - ) - a.show() - return - - -def plot_results(sim, obs, bc, bcp, title): - sim_dates = sim.index.tolist() - scatters = [ - go.Scatter( - name='Propagated Corrected (Experimental)', - x=bcp.index.tolist(), - y=bcp['Propagated Corrected Streamflow'].values.flatten(), - ), - go.Scatter( - name='Bias Corrected (Jorges Method)', - x=sim_dates, - y=bc.values.flatten(), - ), - go.Scatter( - name='Simulated (ERA 5)', - x=sim_dates, - y=sim.values.flatten(), - ), - go.Scatter( - name='Observed', - x=obs.index.tolist(), - y=obs.values.flatten(), - ), - go.Scatter( - name='Percentile', - x=sim_dates, - y=bcp['Percentile'].values.flatten(), - ), - go.Scatter( - name='Scalar', - x=sim_dates, - y=bcp['Scalars'].values.flatten(), - ), - ] - a = go.Figure(scatters, layout={'title': title}) - return a From 1578cc726e1fd54c9ed819cdbc962583af483831 Mon Sep 17 00:00:00 2001 From: rileyhales Date: Sat, 27 Aug 2022 20:41:56 -0600 Subject: [PATCH 19/24] use the vocab module for table names, paths, formats --- saber/_calibrate.py | 5 ++-- saber/_vocab.py | 70 
+++++++++++++++++++++++++++++---------------- saber/cluster.py | 35 +++++++++++------------ saber/gis.py | 12 ++++---- saber/prep.py | 35 +++++++++++------------ saber/table.py | 6 ++-- saber/validate.py | 18 ++++++------ 7 files changed, 100 insertions(+), 81 deletions(-) diff --git a/saber/_calibrate.py b/saber/_calibrate.py index 4b848dd..037a642 100644 --- a/saber/_calibrate.py +++ b/saber/_calibrate.py @@ -15,7 +15,7 @@ from ._vocab import asgn_gid_col from ._vocab import metric_list from ._vocab import metric_nc_name_list -from ._vocab import hindcast_table +from ._vocab import table_hindcast from ._vocab import cal_nc_name @@ -171,13 +171,14 @@ def calibrate_region(workdir: str, assign_table: pd.DataFrame, Returns: None """ + # todo create a parquet instead of a netcdf if gauge_table is None: gauge_table = pd.read_csv(os.path.join(workdir, 'gis_inputs', 'gauge_table.csv')) if obs_data_dir is None: obs_data_dir = os.path.join(workdir, 'data_inputs', 'obs_csvs') bcs_nc_path = os.path.join(workdir, cal_nc_name) - ts = pd.read_pickle(os.path.join(workdir, 'data_processed', hindcast_table)) + ts = pd.read_pickle(os.path.join(workdir, 'data_processed', table_hindcast)) # create the new netcdf bcs_nc = nc.Dataset(bcs_nc_path, 'w') diff --git a/saber/_vocab.py b/saber/_vocab.py index 43d130b..4c293f4 100644 --- a/saber/_vocab.py +++ b/saber/_vocab.py @@ -1,8 +1,7 @@ import os -import glob + import pandas as pd -# todo enforce using this module for retrieving names?? # assign table and gis_input file required column names mid_col = 'model_id' @@ -16,41 +15,64 @@ # scaffolded folders tables = 'tables' -data_in = 'data_inputs' -gis_out = 'gis_outputs' -kmeans_out = 'kmeans_outputs' +gis_out = 'gis' +kmeans_out = 'clusters' # name of some files produced by the algorithm cluster_count_file = 'best-fit-cluster-count.json' cal_nc_name = 'calibrated_simulated_flow.nc' -hindcast_table = 'hindcast_series_table.parquet' -hindcast_fdc_table = 'hindcast_fdc_table.parquet' -drain_table = 'drain_table.parquet' -gauge_table = 'gauge_table.parquet' + +# name of the required input tables and the outputs +table_hindcast = 'hindcast.parquet' +table_hindcast_fdc = 'hindcast_fdc.parquet' +table_hindcast_fdc_trans = 'hindcast_fdc_transformed.parquet' +table_mids = 'mid_table.parquet' +table_drain = 'drain_table.parquet' +table_gauge = 'gauge_table.parquet' # metrics computed on validation sets metric_list = ['ME', 'MAE', 'RMSE', 'MAPE', 'NSE', 'KGE (2012)'] metric_nc_name_list = ['ME', 'MAE', 'RMSE', 'MAPE', 'NSE', 'KGE2012'] -def read_drain_table(workdir: str) -> pd.DataFrame: - return pd.read_parquet(os.path.join(workdir, tables, drain_table)) - - -def read_gauge_table(workdir: str) -> pd.DataFrame: - return pd.read_parquet(os.path.join(workdir, tables, gauge_table)) - - def get_table_path(workdir: str, table_name: str) -> str: - if table_name == 'hindcast_series': - return os.path.join(workdir, tables, hindcast_table) + if table_name == 'hindcast': + return os.path.join(workdir, tables, table_hindcast) elif table_name == 'hindcast_fdc': - return os.path.join(workdir, tables, hindcast_fdc_table) + return os.path.join(workdir, tables, table_hindcast_fdc) + elif table_name == 'hindcast_fdc_trans': + return os.path.join(workdir, tables, table_hindcast_fdc) + elif table_name == 'mid_list': + return os.path.join(workdir, tables, table_mids) elif table_name == 'drain_table': - return os.path.join(workdir, tables, drain_table) + return os.path.join(workdir, tables, table_drain) elif table_name == 'gauge_table': - 
return os.path.join(workdir, tables, gauge_table) + return os.path.join(workdir, tables, table_gauge) + else: + raise ValueError(f'Unknown table name: {table_name}') + + +def read_table(workdir: str, table_name: str) -> pd.DataFrame: + table_path = get_table_path(workdir, table_name) + table_format = os.path.splitext(table_path)[-1] + if table_format == 'parquet': + return pd.read_parquet(table_path) + elif table_format == 'feather': + return pd.read_feather(table_path) + elif table_format == 'csv': + return pd.read_csv(table_path) + else: + raise ValueError(f'Unknown table format: {table_format}') -def guess_hindcast_path(workdir: str) -> str: - return glob.glob(os.path.join(workdir, data_in, '*.nc*'))[0] +def write_table(table: pd.DataFrame, workdir: str, table_name: str) -> None: + table_path = get_table_path(workdir, table_name) + table_format = os.path.splitext(table_path)[-1] + if table_format == 'parquet': + table.to_parquet(table_path) + elif table_format == 'feather': + table.to_feather(table_path) + elif table_format == 'csv': + table.to_csv(table_path) + else: + raise ValueError(f'Unknown table format: {table_format}') diff --git a/saber/cluster.py b/saber/cluster.py index e93a1c6..b26318e 100755 --- a/saber/cluster.py +++ b/saber/cluster.py @@ -15,6 +15,7 @@ from sklearn.metrics import silhouette_samples from ._vocab import cluster_count_file +from ._vocab import read_table __all__ = ['generate', 'summarize', 'plot_clusters', 'plot_silhouette'] @@ -31,15 +32,14 @@ def generate(workdir: str, max_clusters: int = 12) -> None: None """ # read the prepared data (array x) - x = pd.read_parquet(os.path.join(workdir, 'tables', 'hindcast_fdc_transformed.parquet')) + x = read_table(workdir, 'hindcast_fdc_trans') x = x.values # build the kmeans model for a range of cluster numbers for n_clusters in range(2, max_clusters + 1): - print(n_clusters) kmeans = KMeans(n_clusters=n_clusters) kmeans.fit_predict(x) - joblib.dump(kmeans, os.path.join(workdir, 'kmeans_outputs', f'kmeans-{n_clusters}.pickle')) + joblib.dump(kmeans, os.path.join(workdir, 'clusters', f'kmeans-{n_clusters}.pickle')) return @@ -58,10 +58,10 @@ def summarize(workdir: str) -> None: labels = [] # read the prepared data (array x) - x = pd.read_parquet(os.path.join(workdir, 'tables', 'hindcast_fdc_transformed.parquet')) + x = read_table(workdir, 'hindcast_fdc_trans') x = x.values - for model_file in natsorted(glob.glob(os.path.join(workdir, 'kmeans_outputs', 'kmeans-*.pickle'))): + for model_file in natsorted(glob.glob(os.path.join(workdir, 'clusters', 'kmeans-*.pickle'))): kmeans = joblib.load(model_file) n_clusters = int(kmeans.n_clusters) print(n_clusters) @@ -70,7 +70,7 @@ def summarize(workdir: str) -> None: pd.DataFrame( np.transpose(kmeans.cluster_centers_), columns=np.array(range(n_clusters)).astype(str) - ).to_parquet(os.path.join(workdir, 'kmeans_outputs', f'kmeans-centers-{n_clusters}.parquet')) + ).to_parquet(os.path.join(workdir, 'clusters', f'kmeans-centers-{n_clusters}.parquet')) # save the silhouette score for each cluster silhouette_scores.append(silhouette_samples(x, kmeans.labels_).flatten()) @@ -84,21 +84,21 @@ def summarize(workdir: str) -> None: # save the summary results as a csv pd.DataFrame.from_dict(summary) \ - .to_csv(os.path.join(workdir, 'kmeans_outputs', f'clustering-summary-stats.csv')) + .to_csv(os.path.join(workdir, 'clusters', f'clustering-summary-stats.csv')) # save the silhouette scores as a parquet silhouette_scores = np.transpose(np.array(silhouette_scores)) 
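The table helpers above pick a reader and writer from the file extension; since `os.path.splitext` returns the extension with its leading dot, a sketch of that dispatch needs to strip the dot (or compare against `'.parquet'` and friends) before matching:

```python
import os
import pandas as pd

def read_table(table_path: str) -> pd.DataFrame:
    # splitext('/x/drain_table.parquet') -> ('/x/drain_table', '.parquet')
    table_format = os.path.splitext(table_path)[-1].lstrip('.')
    if table_format == 'parquet':
        return pd.read_parquet(table_path)
    elif table_format == 'feather':
        return pd.read_feather(table_path)
    elif table_format == 'csv':
        return pd.read_csv(table_path)
    raise ValueError(f'Unknown table format: {table_format}')
```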
pd.DataFrame(silhouette_scores, columns=np.array(range(2, silhouette_scores.shape[1] + 2)).astype(str)) \ - .to_parquet(os.path.join(workdir, 'kmeans_outputs', 'kmeans-silhouette_scores.parquet')) + .to_parquet(os.path.join(workdir, 'clusters', 'kmeans-silhouette_scores.parquet')) # save the labels as a parquet labels = np.transpose(np.array(labels)) pd.DataFrame(labels, columns=np.array(range(2, labels.shape[1] + 2)).astype(str)) \ - .to_parquet(os.path.join(workdir, 'kmeans_outputs', 'kmeans-labels.parquet')) + .to_parquet(os.path.join(workdir, 'clusters', 'kmeans-labels.parquet')) # find the knee/elbow knee = kneed.KneeLocator(summary['number'], summary['inertia'], curve='convex', direction='decreasing').knee # save the best fitting cluster counts to a csv - with open(os.path.join(workdir, 'kmeans_outputs', cluster_count_file), 'w') as f: + with open(os.path.join(workdir, 'clusters', cluster_count_file), 'w') as f: f.write(json.dumps({'historical': int(knee)})) return @@ -118,12 +118,12 @@ def plot_clusters(workdir: str, clusters: int or Iterable = 'all', Returns: None """ - x = pd.read_parquet(os.path.join(workdir, 'tables', 'hindcast_fdc_transformed.parquet')).values + x = read_table(workdir, 'hindcast_fdc_trans').values size = x.shape[1] x_values = np.linspace(0, size, 5) x_ticks = np.linspace(0, 100, 5).astype(int) - kmeans_dir = os.path.join(workdir, 'kmeans_outputs') + kmeans_dir = os.path.join(workdir, 'clusters') if clusters == 'all': model_files = natsorted(glob.glob(os.path.join(kmeans_dir, 'kmeans-*.pickle'))) elif isinstance(clusters, int): @@ -134,7 +134,6 @@ def plot_clusters(workdir: str, clusters: int or Iterable = 'all', raise TypeError('n_clusters should be of type int or an iterable') for model_file in model_files: - # todo read the cluster centers from the parquet file instead of the model pickle kmeans = joblib.load(model_file) n_clusters = int(kmeans.n_clusters) n_cols = min(n_clusters, max_cols) @@ -165,7 +164,7 @@ def plot_clusters(workdir: str, clusters: int or Iterable = 'all', for ax in fig.axes[n_clusters:]: ax.axis('off') - fig.savefig(os.path.join(workdir, 'kmeans_outputs', f'kmeans-clusters-{n_clusters}.png')) + fig.savefig(os.path.join(workdir, 'clusters', f'kmeans-clusters-{n_clusters}.png')) plt.close(fig) return @@ -183,11 +182,11 @@ def plot_silhouette(workdir: str, plt_width: int = 3, plt_height: int = 3) -> No Returns: None """ - labels_df = pd.read_parquet(os.path.join(workdir, 'kmeans_outputs', 'kmeans-labels.parquet')) - silhouette_df = pd.read_parquet(os.path.join(workdir, 'kmeans_outputs', 'kmeans-silhouette_scores.parquet')) + labels_df = pd.read_parquet(os.path.join(workdir, 'clusters', 'kmeans-labels.parquet')) + silhouette_df = pd.read_parquet(os.path.join(workdir, 'clusters', 'kmeans-silhouette_scores.parquet')) for tot_clusters in silhouette_df.columns: - centers_df = pd.read_parquet(os.path.join(workdir, 'kmeans_outputs', f'kmeans-centers-{tot_clusters}.parquet')) + centers_df = pd.read_parquet(os.path.join(workdir, 'clusters', f'kmeans-centers-{tot_clusters}.parquet')) fig, (ax1, ax2) = plt.subplots( nrows=1, ncols=2, @@ -238,6 +237,6 @@ def plot_silhouette(workdir: str, plt_width: int = 3, plt_height: int = 3) -> No # add some buffer before the next cluster y_lower = y_upper + 10 - fig.savefig(os.path.join(workdir, 'kmeans_outputs', f'kmeans-silhouettes-{tot_clusters}.png')) + fig.savefig(os.path.join(workdir, 'clusters', f'kmeans-silhouettes-{tot_clusters}.png')) plt.close(fig) return diff --git a/saber/gis.py b/saber/gis.py index 
14d8cc3..e90fa85 100644 --- a/saber/gis.py +++ b/saber/gis.py @@ -52,7 +52,7 @@ def clip_by_assignment(workdir: str, assign_table: pd.DataFrame, drain_gis: str, """ # read the drainage line shapefile dl = gpd.read_file(drain_gis) - save_dir = os.path.join(workdir, 'gis_outputs') + save_dir = os.path.join(workdir, 'gis') # get the unique list of assignment reasons for reason in set(assign_table[reason_col].dropna().values): @@ -84,7 +84,7 @@ def clip_by_cluster(workdir: str, assign_table: pd.DataFrame, drain_gis: str, pr gdf = dl_gdf[dl_gdf[mid_col].isin(assign_table[assign_table[mid_col] == num][mid_col])] if gdf.empty: continue - gdf.to_file(os.path.join(workdir, 'gis_outputs', f'{prefix}{"_" if prefix else ""}-{int(num)}.gpkg')) + gdf.to_file(os.path.join(workdir, 'gis', f'{prefix}{"_" if prefix else ""}-{int(num)}.gpkg')) return @@ -107,7 +107,7 @@ def clip_by_unassigned(workdir: str, assign_table: pd.DataFrame, drain_gis: str, if subset.empty: warnings.warn('Empty filter: No streams are unassigned') return - savepath = os.path.join(workdir, 'gis_outputs', f'{prefix}{"_" if prefix else ""}assignments_unassigned.gpkg') + savepath = os.path.join(workdir, 'gis', f'{prefix}{"_" if prefix else ""}assignments_unassigned.gpkg') subset.to_file(savepath) return @@ -127,7 +127,7 @@ def clip_by_ids(workdir: str, ids: list, drain_gis: str, prefix: str = '', id_co None """ dl = gpd.read_file(drain_gis) - save_dir = os.path.join(workdir, 'gis_outputs') + save_dir = os.path.join(workdir, 'gis') name = f'{prefix}{"_" if prefix else ""}id_subset.gpkg' dl[dl[id_column].isin(ids)].to_file(os.path.join(save_dir, name)) return @@ -151,7 +151,7 @@ def validation_maps(workdir: str, gauge_gis: str, val_table: pd.DataFrame = None """ if val_table is None: val_table = pd.read_csv(os.path.join(workdir, 'validation_runs', 'val_table.csv')) - save_dir = os.path.join(workdir, 'gis_outputs') + save_dir = os.path.join(workdir, 'gis') # merge gauge table with the validation table gdf = gpd.read_file(gauge_gis) @@ -231,5 +231,5 @@ def histomaps(gdf: gpd.GeoDataFrame, metric: str, prct: str, workdir: str) -> No gdf[core_columns + [metric, ]].to_crs(epsg=3857).plot(metric) cx.add_basemap(ax=axm, zoom=9, source=cx.providers.Esri.WorldTopoMap, attribution='') - fig.savefig(os.path.join(workdir, 'gis_outputs', f'{metric}_{prct}.png')) + fig.savefig(os.path.join(workdir, 'gis', f'{metric}_{prct}.png')) return diff --git a/saber/prep.py b/saber/prep.py index c0af844..01e36f2 100755 --- a/saber/prep.py +++ b/saber/prep.py @@ -6,12 +6,11 @@ import pandas as pd from sklearn.preprocessing import StandardScaler as Scalar -from ._vocab import get_table_path -from ._vocab import guess_hindcast_path from ._vocab import mid_col -from ._vocab import read_drain_table +from ._vocab import read_table +from ._vocab import write_table -__all__ = ['gis_tables', 'hindcast', 'workdir'] +__all__ = ['gis_tables', 'hindcast', 'scaffold'] def gis_tables(workdir: str, gauge_gis: str = None, drain_gis: str = None) -> None: @@ -27,11 +26,12 @@ def gis_tables(workdir: str, gauge_gis: str = None, drain_gis: str = None) -> No None """ if gauge_gis is not None: - if drain_gis.endswith('.parquet'): + if gauge_gis.endswith('.parquet'): gdf = gpd.read_parquet(gauge_gis) else: gdf = gpd.read_file(gauge_gis) - pd.DataFrame(gdf.drop('geometry', axis=1)).to_parquet(os.path.join(workdir, 'tables', 'gauge_table.parquet')) + write_table(pd.DataFrame(gdf.drop('geometry', axis=1)), workdir, 'gauge_table') + if drain_gis is not None: if drain_gis.endswith('.parquet'): 
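            # gpd.read_parquet only handles GeoParquet files, while gpd.read_file goes through
            # the OGR drivers (shapefile, GeoPackage, GeoJSON, ...), hence the extension check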
gdf = gpd.read_parquet(drain_gis) @@ -40,11 +40,11 @@ def gis_tables(workdir: str, gauge_gis: str = None, drain_gis: str = None) -> No gdf['centroid_x'] = gdf.geometry.centroid.x gdf['centroid_y'] = gdf.geometry.centroid.y gdf = gdf.drop('geometry', axis=1) - pd.DataFrame(gdf).to_parquet(os.path.join(workdir, 'tables', 'drain_table.parquet')) + write_table(pd.DataFrame(gdf), workdir, 'drain_table') return -def hindcast(workdir: str, hind_nc_path: str = None, ) -> None: +def hindcast(workdir: str, hind_nc_path: str) -> None: """ Creates hindcast_series_table.parquet.gzip and hindcast_fdc_table.parquet.gzip in the workdir/tables directory for the GEOGloWS hindcast data @@ -56,11 +56,8 @@ def hindcast(workdir: str, hind_nc_path: str = None, ) -> None: Returns: None """ - if hind_nc_path is None: - hind_nc_path = guess_hindcast_path(workdir) - # read the assignments table - drain_table = read_drain_table(workdir) + drain_table = read_table(workdir, 'drain_table') model_ids = list(set(sorted(drain_table[mid_col].tolist()))) # read the hindcast netcdf, convert to dataframe, store as parquet @@ -70,7 +67,7 @@ def hindcast(workdir: str, hind_nc_path: str = None, ) -> None: ids = ids[ids_selector].astype(str).values.flatten() # save the model ids to table for reference - pd.DataFrame(ids, columns=['model_id', ]).to_parquet(os.path.join(workdir, 'tables', 'model_ids.parquet')) + write_table(pd.DataFrame(ids, columns=['model_id', ]), workdir, 'model_ids') # save the hindcast series to parquet df = pd.DataFrame( @@ -80,24 +77,24 @@ def hindcast(workdir: str, hind_nc_path: str = None, ) -> None: ) df = df[df.index.year >= 1980] df.index.name = 'datetime' - df.to_parquet(get_table_path(workdir, 'hindcast_series')) + write_table(df, workdir, 'hindcast_series') # calculate the FDC and save to parquet exceed_prob = np.linspace(0, 100, 101)[::-1] df = df.apply(lambda x: np.transpose(np.nanpercentile(x, exceed_prob))) df.index = exceed_prob df.index.name = 'exceed_prob' - df.to_parquet(get_table_path(workdir, 'hindcast_fdc')) + write_table(df, workdir, 'hindcast_fdc') # transform and prepare for clustering df = pd.DataFrame(np.transpose(Scalar().fit_transform(np.squeeze(df.values)))) df.index = ids df.columns = df.columns.astype(str) - df.to_parquet(os.path.join(workdir, 'tables', 'hindcast_fdc_transformed.parquet')) + write_table(df, workdir, 'hindcast_fdc_trans') return -def workdir(path: str, include_validation: bool = True) -> None: +def scaffold(path: str, include_validation: bool = True) -> None: """ Creates the correct directories for a Saber project within the specified directory @@ -108,9 +105,9 @@ def workdir(path: str, include_validation: bool = True) -> None: Returns: None """ - dir_list = ['tables', 'data_inputs', 'gis_outputs', 'kmeans_outputs'] + dir_list = ['tables', 'gis', 'clusters'] if include_validation: - dir_list.append('validation_runs') + dir_list.append('validation') for d in dir_list: p = os.path.join(path, d) if not os.path.exists(p): diff --git a/saber/table.py b/saber/table.py index 9edaa33..c8f97cb 100644 --- a/saber/table.py +++ b/saber/table.py @@ -72,8 +72,8 @@ def read(workdir: str) -> pd.DataFrame: def cache(workdir: str, assign_table: pd.DataFrame) -> None: """ - Saves the pandas dataframe to a parquet in the proper place in the project directory - A shortcut for pd.DataFrame.to_parquet so you don't have to code the paths every where + Saves the assignment table dataframe to parquet in the proper place in the project directory so that + you don't have to code the path 
every time Args: workdir: the project directory path @@ -101,7 +101,7 @@ def merge_clusters(workdir: str, assign_table: pd.DataFrame, n_clusters: int = N """ # create a dataframe with the optimal model's labels and the model_id's df = pd.DataFrame({ - 'cluster': joblib.load(os.path.join(workdir, 'kmeans_outputs', f'kmeans-{n_clusters}.pickle')).labels_, + 'cluster': joblib.load(os.path.join(workdir, 'clusters', f'kmeans-{n_clusters}.pickle')).labels_, mid_col: pd.read_parquet(os.path.join(workdir, 'tables', 'model_ids.parquet')).values.flatten() }, dtype=str) diff --git a/saber/validate.py b/saber/validate.py index 1de3c9b..24095b6 100644 --- a/saber/validate.py +++ b/saber/validate.py @@ -25,13 +25,13 @@ def sample_gauges(workdir: str, overwrite: bool = False) -> None: Returns: None """ - vr_path = os.path.join(workdir, 'validation_runs') + vr_path = os.path.join(workdir, 'validation') if overwrite: if os.path.exists(vr_path): shutil.rmtree(vr_path) os.mkdir(vr_path) - gt = pd.read_csv(os.path.join(workdir, 'gis_inputs', 'gauge_table.csv')) + gt = pd.read_csv(os.path.join(workdir, 'gis', 'gauge_table.csv')) initial_row_count = gt.shape[0] rows_to_drop = round(gt.shape[0] / 10) @@ -54,9 +54,9 @@ def sample_gauges(workdir: str, overwrite: bool = False) -> None: # sample the gauge table gt = gt.sample(n=n) - gt.to_csv(os.path.join(subpath, 'gis_inputs', 'gauge_table.csv'), index=False) - shutil.copyfile(os.path.join(workdir, 'gis_inputs', 'drain_table.csv'), - os.path.join(subpath, 'gis_inputs', 'drain_table.csv')) + gt.to_csv(os.path.join(subpath, 'gis', 'gauge_table.csv'), index=False) + shutil.copyfile(os.path.join(workdir, 'gis', 'drain_table.csv'), + os.path.join(subpath, 'gis', 'drain_table.csv')) # filter the copied processed data to only contain the gauges included in this filtered step processed_sim_data = glob.glob(os.path.join(subpath, 'data_processed', 'obs-*.csv')) @@ -79,7 +79,7 @@ def gen_val_table(workdir: str) -> pd.DataFrame: Returns: pandas.DataFrame """ - df = pd.read_csv(os.path.join(workdir, 'gis_inputs', 'gauge_table.csv')) + df = pd.read_csv(os.path.join(workdir, 'gis', 'gauge_table.csv')) df['100'] = 1 stats_df = {} @@ -92,11 +92,11 @@ def gen_val_table(workdir: str) -> pd.DataFrame: a.close() for d in sorted( - [a for a in glob.glob(os.path.join(workdir, 'validation_runs', '*')) if os.path.isdir(a)], + [a for a in glob.glob(os.path.join(workdir, 'validation', '*')) if os.path.isdir(a)], reverse=True ): val_percent = os.path.basename(d) - valset_gids = pd.read_csv(os.path.join(d, 'gis_inputs', 'gauge_table.csv'))[gid_col].values.tolist() + valset_gids = pd.read_csv(os.path.join(d, 'gis', 'gauge_table.csv'))[gid_col].values.tolist() # mark a column indicating the gauges included in the validation set df[val_percent] = 0 @@ -110,6 +110,6 @@ def gen_val_table(workdir: str) -> pd.DataFrame: # merge gauge_table with the stats, save and return df = df.merge(pd.DataFrame(stats_df), on=mid_col, how='inner') - df.to_csv(os.path.join(workdir, 'validation_runs', 'val_table.csv'), index=False) + df.to_csv(os.path.join(workdir, 'validation', 'val_table.csv'), index=False) return df From 0670441c55a563993c12bba8e2ed9d1d098831ad Mon Sep 17 00:00:00 2001 From: rileyhales Date: Sun, 28 Aug 2022 15:53:39 -0600 Subject: [PATCH 20/24] turn _vocab into io and merge scaffold functions --- saber/__init__.py | 13 ++++-- saber/_calibrate.py | 24 +++++------ saber/_propagation.py | 15 +++---- saber/{utils.py => _utils.py} | 0 saber/{table.py => assign.py} | 77 
+++++++++-------------------------- saber/cluster.py | 5 +-- saber/gis.py | 8 ++-- saber/{_vocab.py => io.py} | 39 ++++++++++++++---- saber/prep.py | 35 +++------------- saber/validate.py | 8 ++-- 10 files changed, 96 insertions(+), 128 deletions(-) rename saber/{utils.py => _utils.py} (100%) rename saber/{table.py => assign.py} (72%) rename saber/{_vocab.py => io.py} (70%) diff --git a/saber/__init__.py b/saber/__init__.py index 1b0ea72..d9eeb2c 100644 --- a/saber/__init__.py +++ b/saber/__init__.py @@ -1,13 +1,18 @@ +import saber.assign import saber.cluster import saber.gis +import saber.io import saber.prep -import saber.table -import saber.utils import saber.validate from saber._calibrate import calibrate, calibrate_region -__all__ = ['calibrate', 'calibrate_region', - 'table', 'prep', 'gis', 'cluster', 'utils', 'validate'] +__all__ = [ + # individual functions + 'calibrate', 'calibrate_region', + # modules + 'io', 'prep', 'cluster', 'assign', 'gis', 'validate', +] + __author__ = 'Riley Hales' __version__ = '0.4.0' __license__ = 'BSD 3 Clause Clear' diff --git a/saber/_calibrate.py b/saber/_calibrate.py index 037a642..11a788a 100644 --- a/saber/_calibrate.py +++ b/saber/_calibrate.py @@ -9,14 +9,14 @@ import pandas as pd from scipy import interpolate, stats -from .utils import solve_gumbel1, compute_fdc, compute_scalar_fdc -from ._vocab import mid_col -from ._vocab import asgn_mid_col -from ._vocab import asgn_gid_col -from ._vocab import metric_list -from ._vocab import metric_nc_name_list -from ._vocab import table_hindcast -from ._vocab import cal_nc_name +from ._utils import solve_gumbel1, compute_fdc, compute_scalar_fdc +from .io import asgn_gid_col +from .io import asgn_mid_col +from .io import cal_nc_name +from .io import metric_list +from .io import metric_nc_name_list +from .io import mid_col +from .io import table_hindcast def calibrate(sim_flow_a: pd.DataFrame, obs_flow_a: pd.DataFrame, sim_flow_b: pd.DataFrame = None, @@ -61,9 +61,9 @@ def calibrate(sim_flow_a: pd.DataFrame, obs_flow_a: pd.DataFrame, sim_flow_b: pd if fix_seasonally: # list of the unique months in the historical simulation. should always be 1->12 but just in case... monthly_results = [] - for month in sorted(set(sim_flow_a.index.strftime('%m'))): + for month in sorted(set(sim_flow_a.index.dt.strftime('%m'))): # filter data to current iteration's month - mon_obs_data = obs_flow_a[obs_flow_a.index.month == int(month)].dropna() + mon_obs_data = obs_flow_a[obs_flow_a.index.dt.month == int(month)].dropna() if mon_obs_data.empty: if empty_months == 'skip': @@ -71,8 +71,8 @@ def calibrate(sim_flow_a: pd.DataFrame, obs_flow_a: pd.DataFrame, sim_flow_b: pd else: raise ValueError(f'Invalid value for argument "empty_months". 
Given: {empty_months}.') - mon_sim_data = sim_flow_a[sim_flow_a.index.month == int(month)].dropna() - mon_cor_data = sim_flow_b[sim_flow_b.index.month == int(month)].dropna() + mon_sim_data = sim_flow_a[sim_flow_a.index.dt.month == int(month)].dropna() + mon_cor_data = sim_flow_b[sim_flow_b.index.dt.month == int(month)].dropna() monthly_results.append(calibrate( mon_sim_data, mon_obs_data, mon_cor_data, fix_seasonally=False, empty_months=empty_months, diff --git a/saber/_propagation.py b/saber/_propagation.py index 5c558f5..b738a95 100644 --- a/saber/_propagation.py +++ b/saber/_propagation.py @@ -1,11 +1,11 @@ import pandas as pd -from ._vocab import mid_col -from ._vocab import down_mid_col -from ._vocab import order_col -from ._vocab import asgn_mid_col -from ._vocab import asgn_gid_col -from ._vocab import reason_col +from .io import asgn_gid_col +from .io import asgn_mid_col +from .io import down_mid_col +from .io import mid_col +from .io import order_col +from .io import reason_col def walk_downstream(df: pd.DataFrame, start_id: int, same_order: bool = True, outlet_next_id: str or int = -1) -> tuple: @@ -71,7 +71,8 @@ def walk_upstream(df: pd.DataFrame, start_id: int, same_order: bool = True) -> t return tuple(set(upstream_ids)) -def propagate_in_table(df: pd.DataFrame, start_mid: int, start_gid: int, connected: tuple, max_prop: int, direction: str): +def propagate_in_table(df: pd.DataFrame, start_mid: int, start_gid: int, connected: tuple, max_prop: int, + direction: str): """ Args: diff --git a/saber/utils.py b/saber/_utils.py similarity index 100% rename from saber/utils.py rename to saber/_utils.py diff --git a/saber/table.py b/saber/assign.py similarity index 72% rename from saber/table.py rename to saber/assign.py index c8f97cb..5017d26 100644 --- a/saber/table.py +++ b/saber/assign.py @@ -7,85 +7,48 @@ from ._propagation import propagate_in_table from ._propagation import walk_downstream from ._propagation import walk_upstream -from ._vocab import asgn_gid_col -from ._vocab import asgn_mid_col -from ._vocab import gid_col -from ._vocab import mid_col -from ._vocab import order_col -from ._vocab import reason_col +from .io import asgn_gid_col +from .io import asgn_mid_col +from .io import gid_col +from .io import mid_col +from .io import order_col +from .io import read_table +from .io import reason_col +from .io import write_table -__all__ = ['get_path', 'gen', 'read', 'cache', 'merge_clusters', 'assign_gauged', 'assign_propagation', - 'assign_by_distance', ] +__all__ = ['gen', 'merge_clusters', 'assign_gauged', 'assign_propagation', 'assign_by_distance', ] -def get_path(workdir: str) -> str: - """ - Creates the path to the assign table based on the working directory provided - - Args: - workdir: path to the working directory - - Returns: - absolute path of the assign_table - """ - return os.path.join(workdir, 'assign_table.parquet.gzip') - - -def gen(workdir) -> pd.DataFrame: +def gen(workdir: str, cache: bool = True) -> pd.DataFrame: """ Joins the drain_table.csv and gauge_table.csv to create the assign_table.csv Args: workdir: path to the working directory + cache: whether to cache the assign table immediately Returns: None """ # read and merge the tables - drain_df = pd.read_parquet(os.path.join(workdir, 'tables', 'drain_table.parquet')) - gauge_df = pd.read_parquet(os.path.join(workdir, 'tables', 'gauge_table.parquet')) - assign_table = pd.merge(drain_df, gauge_df, on=mid_col, how='outer') + assign_table = pd.merge( + read_table(workdir, 'drain_table'), + 
read_table(workdir, 'gauge_table'), + on=mid_col, + how='outer' + ) # create the new columns assign_table[asgn_mid_col] = np.nan assign_table[asgn_gid_col] = np.nan assign_table[reason_col] = np.nan - # cache the table - cache(workdir, assign_table) + if cache: + write_table(assign_table, workdir, 'assign_table') return assign_table -def read(workdir: str) -> pd.DataFrame: - """ - Reads the assign_table located in the provided directory - - Args: - workdir: path to the working directory - - Returns: - assign_table pandas.DataFrame - """ - return pd.read_parquet(get_path(workdir)) - - -def cache(workdir: str, assign_table: pd.DataFrame) -> None: - """ - Saves the assignment table dataframe to parquet in the proper place in the project directory so that - you don't have to code the path every time - - Args: - workdir: the project directory path - assign_table: the assign_table dataframe - - Returns: - None - """ - assign_table.to_parquet(get_path(workdir)) - return - - def merge_clusters(workdir: str, assign_table: pd.DataFrame, n_clusters: int = None) -> pd.DataFrame: """ Creates a csv listing the streams assigned to each cluster in workdir/kmeans_models and also adds that information @@ -102,7 +65,7 @@ def merge_clusters(workdir: str, assign_table: pd.DataFrame, n_clusters: int = N # create a dataframe with the optimal model's labels and the model_id's df = pd.DataFrame({ 'cluster': joblib.load(os.path.join(workdir, 'clusters', f'kmeans-{n_clusters}.pickle')).labels_, - mid_col: pd.read_parquet(os.path.join(workdir, 'tables', 'model_ids.parquet')).values.flatten() + mid_col: read_table(workdir, 'model_ids').values.flatten() }, dtype=str) # merge the dataframes diff --git a/saber/cluster.py b/saber/cluster.py index b26318e..81f8109 100755 --- a/saber/cluster.py +++ b/saber/cluster.py @@ -14,8 +14,8 @@ from sklearn.cluster import KMeans from sklearn.metrics import silhouette_samples -from ._vocab import cluster_count_file -from ._vocab import read_table +from .io import cluster_count_file +from .io import read_table __all__ = ['generate', 'summarize', 'plot_clusters', 'plot_silhouette'] @@ -64,7 +64,6 @@ def summarize(workdir: str) -> None: for model_file in natsorted(glob.glob(os.path.join(workdir, 'clusters', 'kmeans-*.pickle'))): kmeans = joblib.load(model_file) n_clusters = int(kmeans.n_clusters) - print(n_clusters) # save the cluster centroids to table - columns are the cluster number, rows are the centroid FDC values pd.DataFrame( diff --git a/saber/gis.py b/saber/gis.py index e90fa85..f8f6de9 100644 --- a/saber/gis.py +++ b/saber/gis.py @@ -8,10 +8,10 @@ import numpy as np import pandas as pd -from ._vocab import gid_col -from ._vocab import metric_nc_name_list -from ._vocab import mid_col -from ._vocab import reason_col +from .io import gid_col +from .io import metric_nc_name_list +from .io import mid_col +from .io import reason_col __all__ = ['generate_all', 'clip_by_assignment', 'clip_by_cluster', 'clip_by_unassigned', 'clip_by_ids', 'validation_maps'] diff --git a/saber/_vocab.py b/saber/io.py similarity index 70% rename from saber/_vocab.py rename to saber/io.py index 4c293f4..cc4a32d 100644 --- a/saber/_vocab.py +++ b/saber/io.py @@ -2,7 +2,6 @@ import pandas as pd - # assign table and gis_input file required column names mid_col = 'model_id' gid_col = 'gauge_id' @@ -29,12 +28,34 @@ table_mids = 'mid_table.parquet' table_drain = 'drain_table.parquet' table_gauge = 'gauge_table.parquet' +table_assign = 'assign_table.csv' # metrics computed on validation sets metric_list = ['ME', 
'MAE', 'RMSE', 'MAPE', 'NSE', 'KGE (2012)'] metric_nc_name_list = ['ME', 'MAE', 'RMSE', 'MAPE', 'NSE', 'KGE2012'] +def scaffold_workdir(path: str, include_validation: bool = True) -> None: + """ + Creates the correct directories for a Saber project within the specified directory + + Args: + path: the path to a directory where you want to create workdir subdirectories + include_validation: boolean, indicates whether to create the validation folder + + Returns: + None + """ + dir_list = ['tables', 'gis', 'clusters'] + if include_validation: + dir_list.append('validation') + for d in dir_list: + p = os.path.join(path, d) + if not os.path.exists(p): + os.mkdir(p) + return + + def get_table_path(workdir: str, table_name: str) -> str: if table_name == 'hindcast': return os.path.join(workdir, tables, table_hindcast) @@ -42,12 +63,14 @@ def get_table_path(workdir: str, table_name: str) -> str: return os.path.join(workdir, tables, table_hindcast_fdc) elif table_name == 'hindcast_fdc_trans': return os.path.join(workdir, tables, table_hindcast_fdc) - elif table_name == 'mid_list': + elif table_name == 'model_ids': return os.path.join(workdir, tables, table_mids) elif table_name == 'drain_table': return os.path.join(workdir, tables, table_drain) elif table_name == 'gauge_table': return os.path.join(workdir, tables, table_gauge) + elif table_name == 'assign_table': + return os.path.join(workdir, tables, table_assign) else: raise ValueError(f'Unknown table name: {table_name}') @@ -55,11 +78,11 @@ def get_table_path(workdir: str, table_name: str) -> str: def read_table(workdir: str, table_name: str) -> pd.DataFrame: table_path = get_table_path(workdir, table_name) table_format = os.path.splitext(table_path)[-1] - if table_format == 'parquet': + if table_format == '.parquet': return pd.read_parquet(table_path) - elif table_format == 'feather': + elif table_format == '.feather': return pd.read_feather(table_path) - elif table_format == 'csv': + elif table_format == '.csv': return pd.read_csv(table_path) else: raise ValueError(f'Unknown table format: {table_format}') @@ -68,11 +91,11 @@ def read_table(workdir: str, table_name: str) -> pd.DataFrame: def write_table(table: pd.DataFrame, workdir: str, table_name: str) -> None: table_path = get_table_path(workdir, table_name) table_format = os.path.splitext(table_path)[-1] - if table_format == 'parquet': + if table_format == '.parquet': table.to_parquet(table_path) - elif table_format == 'feather': + elif table_format == '.feather': table.to_feather(table_path) - elif table_format == 'csv': + elif table_format == '.csv': table.to_csv(table_path) else: raise ValueError(f'Unknown table format: {table_format}') diff --git a/saber/prep.py b/saber/prep.py index 01e36f2..1cc1c2b 100755 --- a/saber/prep.py +++ b/saber/prep.py @@ -1,16 +1,14 @@ -import os - import geopandas as gpd import netCDF4 as nc import numpy as np import pandas as pd from sklearn.preprocessing import StandardScaler as Scalar -from ._vocab import mid_col -from ._vocab import read_table -from ._vocab import write_table +from .io import mid_col +from .io import read_table +from .io import write_table -__all__ = ['gis_tables', 'hindcast', 'scaffold'] +__all__ = ['gis_tables', 'hindcast'] def gis_tables(workdir: str, gauge_gis: str = None, drain_gis: str = None) -> None: @@ -75,9 +73,9 @@ def hindcast(workdir: str, hind_nc_path: str) -> None: columns=ids, index=pd.to_datetime(hnc.variables['time'][:], unit='s') ) - df = df[df.index.year >= 1980] + df = df[df.index.dt.year >= 1980] df.index.name = 
'datetime' - write_table(df, workdir, 'hindcast_series') + write_table(df, workdir, 'hindcast') # calculate the FDC and save to parquet exceed_prob = np.linspace(0, 100, 101)[::-1] @@ -92,24 +90,3 @@ def hindcast(workdir: str, hind_nc_path: str) -> None: df.columns = df.columns.astype(str) write_table(df, workdir, 'hindcast_fdc_trans') return - - -def scaffold(path: str, include_validation: bool = True) -> None: - """ - Creates the correct directories for a Saber project within the specified directory - - Args: - path: the path to a directory where you want to create workdir subdirectories - include_validation: boolean, indicates whether to create the validation folder - - Returns: - None - """ - dir_list = ['tables', 'gis', 'clusters'] - if include_validation: - dir_list.append('validation') - for d in dir_list: - p = os.path.join(path, d) - if not os.path.exists(p): - os.mkdir(p) - return diff --git a/saber/validate.py b/saber/validate.py index 24095b6..6690dd6 100644 --- a/saber/validate.py +++ b/saber/validate.py @@ -6,10 +6,10 @@ import numpy as np import pandas as pd -from ._vocab import cal_nc_name -from ._vocab import gid_col -from ._vocab import metric_nc_name_list -from ._vocab import mid_col +from .io import cal_nc_name +from .io import gid_col +from .io import metric_nc_name_list +from .io import mid_col def sample_gauges(workdir: str, overwrite: bool = False) -> None: From e20159d40042d973799fbbe9cd42193ac06ff1bf Mon Sep 17 00:00:00 2001 From: rileyhales Date: Mon, 29 Aug 2022 12:31:10 -0600 Subject: [PATCH 21/24] more compliance using the io module --- saber/_calibrate.py | 21 ++++++++++----------- saber/io.py | 39 +++++++++++++++++++++------------------ saber/prep.py | 18 +++++++++++------- saber/validate.py | 8 +++++--- 4 files changed, 47 insertions(+), 39 deletions(-) diff --git a/saber/_calibrate.py b/saber/_calibrate.py index 11a788a..597ce05 100644 --- a/saber/_calibrate.py +++ b/saber/_calibrate.py @@ -61,9 +61,9 @@ def calibrate(sim_flow_a: pd.DataFrame, obs_flow_a: pd.DataFrame, sim_flow_b: pd if fix_seasonally: # list of the unique months in the historical simulation. should always be 1->12 but just in case... monthly_results = [] - for month in sorted(set(sim_flow_a.index.dt.strftime('%m'))): + for month in sorted(set(sim_flow_a.index.strftime('%m'))): # filter data to current iteration's month - mon_obs_data = obs_flow_a[obs_flow_a.index.dt.month == int(month)].dropna() + mon_obs_data = obs_flow_a[obs_flow_a.index.month == int(month)].dropna() if mon_obs_data.empty: if empty_months == 'skip': @@ -71,8 +71,8 @@ def calibrate(sim_flow_a: pd.DataFrame, obs_flow_a: pd.DataFrame, sim_flow_b: pd else: raise ValueError(f'Invalid value for argument "empty_months". 
Given: {empty_months}.') - mon_sim_data = sim_flow_a[sim_flow_a.index.dt.month == int(month)].dropna() - mon_cor_data = sim_flow_b[sim_flow_b.index.dt.month == int(month)].dropna() + mon_sim_data = sim_flow_a[sim_flow_a.index.month == int(month)].dropna() + mon_cor_data = sim_flow_b[sim_flow_b.index.month == int(month)].dropna() monthly_results.append(calibrate( mon_sim_data, mon_obs_data, mon_cor_data, fix_seasonally=False, empty_months=empty_months, @@ -139,22 +139,21 @@ def _make_interpolator(x_input: np.array, y_output: np.array, extrap_method: str const: int or float = None) -> interpolate.interp1d: # make interpolator which converts the percentiles to scalars if extrap_method == 'nearest': - interpolator = interpolate.interp1d(x_input, y_output, fill_value='extrapolate', kind='nearest') + return interpolate.interp1d(x_input, y_output, fill_value='extrapolate', kind='nearest') elif extrap_method == 'const': if const is None: raise ValueError('Must provide the const kwarg when extrap_method="const"') - interpolator = interpolate.interp1d(x_input, y_output, fill_value=const, bounds_error=False) + return interpolate.interp1d(x_input, y_output, fill_value=const, bounds_error=False) elif extrap_method == 'linear': - interpolator = interpolate.interp1d(x_input, y_output, fill_value='extrapolate') + return interpolate.interp1d(x_input, y_output, fill_value='extrapolate') elif extrap_method == 'average': - interpolator = interpolate.interp1d(x_input, y_output, fill_value=np.mean(y_output), bounds_error=False) + return interpolate.interp1d(x_input, y_output, fill_value=np.mean(y_output), bounds_error=False) elif extrap_method == 'max' or extrap_method == 'maximum': - interpolator = interpolate.interp1d(x_input, y_output, fill_value=np.max(y_output), bounds_error=False) + return interpolate.interp1d(x_input, y_output, fill_value=np.max(y_output), bounds_error=False) elif extrap_method == 'min' or extrap_method == 'minimum': - interpolator = interpolate.interp1d(x_input, y_output, fill_value=np.min(y_output), bounds_error=False) + return interpolate.interp1d(x_input, y_output, fill_value=np.min(y_output), bounds_error=False) else: raise ValueError('Invalid extrapolation method provided') - return interpolator def calibrate_region(workdir: str, assign_table: pd.DataFrame, diff --git a/saber/io.py b/saber/io.py index cc4a32d..8436a2d 100644 --- a/saber/io.py +++ b/saber/io.py @@ -10,17 +10,18 @@ down_mid_col = 'downstream_model_id' reason_col = 'reason' area_col = 'drain_area' -order_col = 'stream_order' - -# scaffolded folders -tables = 'tables' -gis_out = 'gis' -kmeans_out = 'clusters' +order_col = 'strahler_order' # name of some files produced by the algorithm cluster_count_file = 'best-fit-cluster-count.json' cal_nc_name = 'calibrated_simulated_flow.nc' +# scaffolded folders +dir_tables = 'tables' +dir_gis = 'gis' +dir_clusters = 'clusters' +dir_valid = 'validation' + # name of the required input tables and the outputs table_hindcast = 'hindcast.parquet' table_hindcast_fdc = 'hindcast_fdc.parquet' @@ -46,9 +47,11 @@ def scaffold_workdir(path: str, include_validation: bool = True) -> None: Returns: None """ - dir_list = ['tables', 'gis', 'clusters'] + dir_list = [dir_tables, dir_gis, dir_clusters] + if not os.path.exists(path): + os.mkdir(path) if include_validation: - dir_list.append('validation') + dir_list.append(dir_valid) for d in dir_list: p = os.path.join(path, d) if not os.path.exists(p): @@ -58,19 +61,19 @@ def scaffold_workdir(path: str, include_validation: bool = True) -> None: def 
get_table_path(workdir: str, table_name: str) -> str: if table_name == 'hindcast': - return os.path.join(workdir, tables, table_hindcast) + return os.path.join(workdir, dir_tables, table_hindcast) elif table_name == 'hindcast_fdc': - return os.path.join(workdir, tables, table_hindcast_fdc) + return os.path.join(workdir, dir_tables, table_hindcast_fdc) elif table_name == 'hindcast_fdc_trans': - return os.path.join(workdir, tables, table_hindcast_fdc) + return os.path.join(workdir, dir_tables, table_hindcast_fdc) elif table_name == 'model_ids': - return os.path.join(workdir, tables, table_mids) + return os.path.join(workdir, dir_tables, table_mids) elif table_name == 'drain_table': - return os.path.join(workdir, tables, table_drain) + return os.path.join(workdir, dir_tables, table_drain) elif table_name == 'gauge_table': - return os.path.join(workdir, tables, table_gauge) + return os.path.join(workdir, dir_tables, table_gauge) elif table_name == 'assign_table': - return os.path.join(workdir, tables, table_assign) + return os.path.join(workdir, dir_tables, table_assign) else: raise ValueError(f'Unknown table name: {table_name}') @@ -92,10 +95,10 @@ def write_table(table: pd.DataFrame, workdir: str, table_name: str) -> None: table_path = get_table_path(workdir, table_name) table_format = os.path.splitext(table_path)[-1] if table_format == '.parquet': - table.to_parquet(table_path) + return table.to_parquet(table_path) elif table_format == '.feather': - table.to_feather(table_path) + return table.to_feather(table_path) elif table_format == '.csv': - table.to_csv(table_path) + return table.to_csv(table_path) else: raise ValueError(f'Unknown table format: {table_format}') diff --git a/saber/prep.py b/saber/prep.py index 1cc1c2b..753ba17 100755 --- a/saber/prep.py +++ b/saber/prep.py @@ -5,6 +5,7 @@ from sklearn.preprocessing import StandardScaler as Scalar from .io import mid_col +from .io import order_col from .io import read_table from .io import write_table @@ -42,21 +43,24 @@ def gis_tables(workdir: str, gauge_gis: str = None, drain_gis: str = None) -> No return -def hindcast(workdir: str, hind_nc_path: str) -> None: +def hindcast(workdir: str, hind_nc_path: str, drop_order_1: bool = False) -> None: """ - Creates hindcast_series_table.parquet.gzip and hindcast_fdc_table.parquet.gzip in the workdir/tables directory + Creates hindcast_series_table.parquet and hindcast_fdc_table.parquet in the workdir/tables directory for the GEOGloWS hindcast data Args: workdir: path to the working directory for the project - hind_nc_path: path to the hindcast or historical simulation netcdf if not located at workdir/data_simulated/*.nc + hind_nc_path: path to the hindcast or historical simulation netcdf + drop_order_1: whether to drop the order 1 streams from the hindcast Returns: None """ # read the assignments table - drain_table = read_table(workdir, 'drain_table') - model_ids = list(set(sorted(drain_table[mid_col].tolist()))) + model_ids = read_table(workdir, 'drain_table') + if drop_order_1: + model_ids = model_ids[model_ids[order_col] > 1] + model_ids = list(set(sorted(model_ids[mid_col].tolist()))) # read the hindcast netcdf, convert to dataframe, store as parquet hnc = nc.Dataset(hind_nc_path) @@ -65,7 +69,7 @@ def hindcast(workdir: str, hind_nc_path: str) -> None: ids = ids[ids_selector].astype(str).values.flatten() # save the model ids to table for reference - write_table(pd.DataFrame(ids, columns=['model_id', ]), workdir, 'model_ids') + write_table(pd.DataFrame(ids, columns=[mid_col, ]), workdir, 
'model_ids') # save the hindcast series to parquet df = pd.DataFrame( @@ -73,7 +77,7 @@ def hindcast(workdir: str, hind_nc_path: str) -> None: columns=ids, index=pd.to_datetime(hnc.variables['time'][:], unit='s') ) - df = df[df.index.dt.year >= 1980] + df = df[df.index.year >= 1980] df.index.name = 'datetime' write_table(df, workdir, 'hindcast') diff --git a/saber/validate.py b/saber/validate.py index 6690dd6..f40a140 100644 --- a/saber/validate.py +++ b/saber/validate.py @@ -10,6 +10,8 @@ from .io import gid_col from .io import metric_nc_name_list from .io import mid_col +from .io import read_table +from .io import scaffold_workdir def sample_gauges(workdir: str, overwrite: bool = False) -> None: @@ -31,8 +33,9 @@ def sample_gauges(workdir: str, overwrite: bool = False) -> None: shutil.rmtree(vr_path) os.mkdir(vr_path) - gt = pd.read_csv(os.path.join(workdir, 'gis', 'gauge_table.csv')) + gt = read_table(workdir, 'gauge_table') initial_row_count = gt.shape[0] + rows_to_drop = round(gt.shape[0] / 10) for i in range(5): @@ -42,8 +45,7 @@ def sample_gauges(workdir: str, overwrite: bool = False) -> None: subpath = os.path.join(vr_path, f'{pct_remain}') # create the new project working directory - os.mkdir(subpath) - workdir(subpath, include_validation=False) + scaffold_workdir(subpath, include_validation=False) # overwrite the processed data directory so we don't need to redo this each time shutil.copytree( From 610044d54dd53782164ff4cae0741555aa2ccc35 Mon Sep 17 00:00:00 2001 From: rileyhales Date: Mon, 29 Aug 2022 15:14:41 -0600 Subject: [PATCH 22/24] organize and debug calibration function --- conda-environment.yml | 18 ++-- requirements.txt | 18 ++-- saber/_calibrate.py | 246 +++++++++++++++++++++++++++++++----------- saber/_utils.py | 57 ---------- 4 files changed, 202 insertions(+), 137 deletions(-) delete mode 100644 saber/_utils.py diff --git a/conda-environment.yml b/conda-environment.yml index 725c053..f7879c0 100644 --- a/conda-environment.yml +++ b/conda-environment.yml @@ -4,17 +4,17 @@ channels: - defaults dependencies: - python=3 - - geopandas - - pandas - - numpy - - numba - contextily - - netCDF4 - - pyarrow + - geopandas + - hydrostats + - joblib - kneed - matplotlib + - natsort + - netCDF4 + - numba + - numpy + - pandas + - pyarrow - requests - scipy - - natsort - - scikit-learn - - joblib diff --git a/requirements.txt b/requirements.txt index 67c3de9..1418fb2 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,15 +1,15 @@ python=3 -geopandas -pandas -numpy -numba contextily -netCDF4 -pyarrow +geopandas +hydrostats +joblib kneed matplotlib +natsort +netCDF4 +numba +numpy +pandas +pyarrow requests scipy -natsort -scikit-learn -joblib \ No newline at end of file diff --git a/saber/_calibrate.py b/saber/_calibrate.py index 597ce05..9a1aa08 100644 --- a/saber/_calibrate.py +++ b/saber/_calibrate.py @@ -9,7 +9,6 @@ import pandas as pd from scipy import interpolate, stats -from ._utils import solve_gumbel1, compute_fdc, compute_scalar_fdc from .io import asgn_gid_col from .io import asgn_mid_col from .io import cal_nc_name @@ -23,8 +22,8 @@ def calibrate(sim_flow_a: pd.DataFrame, obs_flow_a: pd.DataFrame, sim_flow_b: pd fix_seasonally: bool = True, empty_months: str = 'skip', drop_outliers: bool = False, outlier_threshold: int or float = 2.5, filter_scalar_fdc: bool = False, filter_range: tuple = (0, 80), - extrapolate_method: str = 'nearest', fill_value: int or float = None, - fit_gumbel: bool = False, gumbel_range: tuple = (25, 75), ) -> pd.DataFrame: + extrapolate: str = 
'nearest', fill_value: int or float = None, + fit_gumbel: bool = False, fit_range: tuple = (10, 90), ) -> pd.DataFrame: """ Removes the bias from simulated discharge using the SABER method. @@ -39,17 +38,22 @@ def calibrate(sim_flow_a: pd.DataFrame, obs_flow_a: pd.DataFrame, sim_flow_b: pd sim_flow_b (pd.DataFrame): (optional) simulated hydrograph at point B to correct using scalar flow duration curve mapping and the bias relationship at point A. should contain a datetime index with daily values and a single column of discharge values. + fix_seasonally (bool): fix on a monthly (True) or annual (False) basis empty_months (str): how to handle months in the simulated data where no observed data are available. Options: "skip": ignore simulated data for months without + drop_outliers (bool): flag to exclude outliers outlier_threshold (int or float): number of std deviations from mean to exclude from flow duration curve - filter_scalar_fdc (bool): - filter_range (tuple): - extrapolate_method (str): - fill_value (int or float): - fit_gumbel (bool): - gumbel_range (tuple): + + filter_scalar_fdc (bool): flag to filter the scalar flow duration curve + filter_range (tuple): lower and upper bounds of the filter range + + extrapolate (str): method to use for extrapolation. Options: nearest, const, linear, average, max, min + fill_value (int or float): value to use for extrapolation when extrapolate_method='const' + + fit_gumbel (bool): flag to replace extremely low/high corrected flows with values from Gumbel type 1 + fit_range (tuple): lower and upper bounds of exceedance probabilities to replace with Gumbel values Returns: pd.DataFrame with a DateTime index and columns with corrected flow, uncorrected flow, the scalar adjustment @@ -78,84 +82,202 @@ def calibrate(sim_flow_a: pd.DataFrame, obs_flow_a: pd.DataFrame, sim_flow_b: pd fix_seasonally=False, empty_months=empty_months, drop_outliers=drop_outliers, outlier_threshold=outlier_threshold, filter_scalar_fdc=filter_scalar_fdc, filter_range=filter_range, - extrapolate_method=extrapolate_method, fill_value=fill_value, - fit_gumbel=fit_gumbel, gumbel_range=gumbel_range, ) + extrapolate=extrapolate, fill_value=fill_value, + fit_gumbel=fit_gumbel, fit_range=fit_range, ) ) # combine the results from each monthly into a single dataframe (sorted chronologically) and return it return pd.concat(monthly_results).sort_index() - # compute the fdc for paired sim/obs data and compute scalar fdc, either with or without outliers + # compute the flow duration curves if drop_outliers: - # drop outlier data before creating flow duration curves - # https://stackoverflow.com/questions/23199796/detect-and-exclude-outliers-in-pandas-data-frame - sim_fdc = compute_fdc( - sim_flow_a[(np.abs(stats.zscore(sim_flow_a)) < outlier_threshold).all(axis=1)]) - obs_fdc = compute_fdc( - obs_flow_a[(np.abs(stats.zscore(obs_flow_a)) < outlier_threshold).all(axis=1)]) + sim_fdc_a = calc_fdc(_drop_outliers_by_zscore(sim_flow_a, threshold=outlier_threshold), col_name='Q_sim') + sim_fdc_b = calc_fdc(_drop_outliers_by_zscore(sim_flow_b, threshold=outlier_threshold), col_name='Q_sim') + obs_fdc = calc_fdc(_drop_outliers_by_zscore(obs_flow_a, threshold=outlier_threshold), col_name='Q_obs') else: - sim_fdc = compute_fdc(sim_flow_a) - obs_fdc = compute_fdc(obs_flow_a) - - scalar_fdc = compute_scalar_fdc(sim_fdc['flow'].values.flatten(), obs_fdc['flow'].values.flatten()) + sim_fdc_a = calc_fdc(sim_flow_a, col_name='Q_sim') + sim_fdc_b = calc_fdc(sim_flow_b, col_name='Q_sim') + obs_fdc = 
calc_fdc(obs_flow_a, col_name='Q_obs') + # calculate the scalar flow duration curve (at point A with simulated and observed data) + scalar_fdc = calc_sfdc(sim_fdc_a['flow'].values.flatten(), obs_fdc['flow'].values.flatten()) if filter_scalar_fdc: scalar_fdc = scalar_fdc[scalar_fdc['p_exceed'].between(filter_range[0], filter_range[1])] - # determine the percentile of each uncorrected flow using the monthly fdc - values = sim_flow_b.values.flatten() - percentiles = [stats.percentileofscore(values, a) for a in values] - scalars = to_scalar(percentiles) - values = values / scalars + # make interpolators: Q_b -> p_exceed, p_exceed -> scalars_a + # flow at B converted to exceedance probabilities, then matched with the scalar computed at point A + flow_to_percent = _make_interpolator(sim_fdc_b.values, + sim_fdc_b.index, + extrap=extrapolate, + fill_value=fill_value) + + percent_to_scalar = _make_interpolator(scalar_fdc.index, + scalar_fdc.values, + extrap=extrapolate, + fill_value=fill_value) + + # apply interpolators to correct flows at B with data from A + qb_original = sim_flow_b.values.flatten() + p_exceed = flow_to_percent(qb_original) + scalars = percent_to_scalar(p_exceed) + qb_adjusted = qb_original / scalars if fit_gumbel: - tmp = pd.DataFrame(np.transpose([values, percentiles]), columns=('q', 'p')) + qb_adjusted = _fit_extreme_values_to_gumbel(qb_adjusted, p_exceed, fit_range) - # compute the average and standard deviation except for scaled data outside the percentile range specified - mid = tmp[tmp['p'] > gumbel_range[0]] - mid = mid[mid['p'] < gumbel_range[1]] - xbar = statistics.mean(mid['q'].tolist()) - std = statistics.stdev(mid['q'].tolist(), xbar) + return pd.DataFrame(data=np.transpose([qb_adjusted, qb_original, scalars, p_exceed]), + index=sim_flow_b.index.to_list(), + columns=('Q_adjusted', 'Q_original', 'scalars', 'p_exceed')) - q = [] - for p in tmp[tmp['p'] <= gumbel_range[0]]['p'].tolist(): - q.append(solve_gumbel1(std, xbar, 1 / (1 - (p / 100)))) - tmp.loc[tmp['p'] <= gumbel_range[0], 'q'] = q - q = [] - for p in tmp[tmp['p'] >= gumbel_range[1]]['p'].tolist(): - if p >= 100: - p = 99.999 - q.append(solve_gumbel1(std, xbar, 1 / (1 - (p / 100)))) - tmp.loc[tmp['p'] >= gumbel_range[1], 'q'] = q +def calc_fdc(flows: np.array, steps: int = 201, col_name: str = 'Q') -> pd.DataFrame: + """ + Compute flow duration curve (exceedance probabilities) from a list of flows - values = tmp['q'].values + Args: + flows: array of flows + steps: number of steps (exceedance probabilities) to use in the FDC + col_name: name of the column in the returned dataframe + + Returns: + pd.DataFrame with index 'p_exceed' and columns 'Q' (or col_name) + """ + # calculate the FDC and save to parquet + exceed_prob = np.linspace(100, 0, steps) + fdc_flows = np.nanpercentile(flows, exceed_prob) + df = pd.DataFrame(fdc_flows, columns=[col_name, ], index=exceed_prob) + df.index.name = 'p_exceed' + return df - return pd.DataFrame(data=np.transpose([values, sim_flow_b.values.flatten(), scalars, percentiles]), - index=sim_flow_b.index.to_list(), - columns=('flow', 'uncorrected', 'scalars', 'percentile')) +def calc_sfdc(sim_fdc: pd.DataFrame, obs_fdc: pd.DataFrame) -> pd.DataFrame: + """ + Compute the scalar flow duration curve (exceedance probabilities) from two flow duration curves + + Args: + sim_fdc: simulated flow duration curve + obs_fdc: observed flow duration curve + + Returns: + pd.DataFrame with index 'p_exceed' and columns 'Q' + """ + scalars_df = pd.DataFrame(np.divide(sim_fdc.values, obs_fdc.values), 
columns=['scalars'], index=sim_fdc.index) + scalars_df.replace(np.inf, np.nan, inplace=True) + scalars_df.dropna(inplace=True) + return scalars_df -def _make_interpolator(x_input: np.array, y_output: np.array, extrap_method: str = 'nearest', - const: int or float = None) -> interpolate.interp1d: + +def _drop_outliers_by_zscore(df: pd.DataFrame, threshold: float = 3) -> pd.DataFrame: + """ + Drop outliers from a dataframe by their z-score and a threshold + Based on https://stackoverflow.com/questions/23199796/detect-and-exclude-outliers-in-pandas-data-frame + Args: + df: dataframe to drop outliers from + threshold: z-score threshold + + Returns: + pd.DataFrame with outliers removed + """ + return df[(np.abs(stats.zscore(df)) < threshold).all(axis=1)] + + +def _filter_sfdc(sfdc: pd.DataFrame, filter_range: list) -> pd.DataFrame: + """ + Filter the scalar flow duration curve by the specified range + + Args: + sfdc: scalar flow duration curve DataFrame + filter_range: list of [lower_bound: int or float, upper_bound: int or float] + + Returns: + pd.DataFrame: filtered scalar flow duration curve + """ + return sfdc[np.logical_and(sfdc.index > filter_range[0], sfdc.index < filter_range[1])] + + +def _make_interpolator(x: np.array, y: np.array, extrap: str = 'nearest', + fill_value: int or float = None) -> interpolate.interp1d: + """ + Make an interpolator from two arrays + + Args: + x: x values + y: y values + extrap: method for extrapolation: nearest, const, linear, average, max, min + fill_value: value to use when extrap='const' + + Returns: + interpolate.interp1d + """ # make interpolator which converts the percentiles to scalars - if extrap_method == 'nearest': - return interpolate.interp1d(x_input, y_output, fill_value='extrapolate', kind='nearest') - elif extrap_method == 'const': - if const is None: + if extrap == 'nearest': + return interpolate.interp1d(x, y, fill_value='extrapolate', kind='nearest') + elif extrap == 'const': + if fill_value is None: raise ValueError('Must provide the const kwarg when extrap_method="const"') - return interpolate.interp1d(x_input, y_output, fill_value=const, bounds_error=False) - elif extrap_method == 'linear': - return interpolate.interp1d(x_input, y_output, fill_value='extrapolate') - elif extrap_method == 'average': - return interpolate.interp1d(x_input, y_output, fill_value=np.mean(y_output), bounds_error=False) - elif extrap_method == 'max' or extrap_method == 'maximum': - return interpolate.interp1d(x_input, y_output, fill_value=np.max(y_output), bounds_error=False) - elif extrap_method == 'min' or extrap_method == 'minimum': - return interpolate.interp1d(x_input, y_output, fill_value=np.min(y_output), bounds_error=False) + return interpolate.interp1d(x, y, fill_value=fill_value, bounds_error=False) + elif extrap == 'linear': + return interpolate.interp1d(x, y, fill_value='extrapolate') + elif extrap == 'average': + return interpolate.interp1d(x, y, fill_value=np.mean(y), bounds_error=False) + elif extrap == 'max' or extrap == 'maximum': + return interpolate.interp1d(x, y, fill_value=np.max(y), bounds_error=False) + elif extrap == 'min' or extrap == 'minimum': + return interpolate.interp1d(x, y, fill_value=np.min(y), bounds_error=False) else: raise ValueError('Invalid extrapolation method provided') +def _solve_gumbel1(std, xbar, rp): + """ + Solves the Gumbel Type I pdf = exp(-exp(-b)) + + Args: + std: standard deviation of the dataset + xbar: mean of the dataset + rp: return period to calculate in years + + Returns: + float: discharge value + """ + 
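    # derivation, assuming the intended distribution is Gumbel Type I (EV1):
    # the quantile for non-exceedance probability F = 1 - 1/rp is x = mu - beta * ln(-ln(F)),
    # with method-of-moments estimates beta = std * sqrt(6) / pi ~= 0.7797 * std and
    # mu = xbar - 0.5772 * beta ~= xbar - 0.45 * std, which is where the constants
    # 0.7797 and 0.45 in the return statement come from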
return -np.log(-np.log(1 - (1 / rp))) * std * .7797 + xbar - (.45 * std) + + +def _fit_extreme_values_to_gumbel(q_adjust: np.array, p_exceed: np.array, fit_range: tuple = None) -> np.array: + """ + Replace the extreme values from the corrected data with values based on the gumbel distribution + + Args: + q_adjust: adjusted flows to be refined + p_exceed: exceedance probabilities of the adjusted flows + fit_range: range of exceedance probabilities to fit to the Gumbel distribution + + Returns: + np.array of the flows with the extreme values replaced + """ + all_values = pd.DataFrame(np.transpose([q_adjust, p_exceed]), columns=('q', 'p')) + # compute the average and standard deviation for the values within the user specified fit_range + mid_vals = all_values.copy() + mid_vals = mid_vals[np.logical_and(mid_vals['p'] > fit_range[0], mid_vals['p'] < fit_range[1])] + xbar = statistics.mean(mid_vals['q'].values) + std = statistics.stdev(mid_vals['q'].values, xbar) + + # todo check that this is correct + q = [] + for p in mid_vals[mid_vals['p'] <= fit_range[0]]['p'].tolist(): + q.append(_solve_gumbel1(std, xbar, 1 / (1 - (p / 100)))) + mid_vals.loc[mid_vals['p'] <= fit_range[0], 'q'] = q + + q = [] + for p in mid_vals[mid_vals['p'] >= fit_range[1]]['p'].tolist(): + if p >= 100: + p = 99.999 + q.append(_solve_gumbel1(std, xbar, 1 / (1 - (p / 100)))) + mid_vals.loc[mid_vals['p'] >= fit_range[1], 'q'] = q + + qb_adjusted = mid_vals['q'].values + return qb_adjusted + + def calibrate_region(workdir: str, assign_table: pd.DataFrame, gauge_table: pd.DataFrame = None, obs_data_dir: str = None) -> None: """ diff --git a/saber/_utils.py b/saber/_utils.py deleted file mode 100644 index 3732dc1..0000000 --- a/saber/_utils.py +++ /dev/null @@ -1,57 +0,0 @@ -import math - -import hydrostats as hs -import hydrostats.data as hd -import numpy as np -import pandas as pd - - -def solve_gumbel1(std, xbar, rp): - """ - Solves the Gumbel Type I pdf = exp(-exp(-b)) - where b is the covariate - """ - # xbar = statistics.mean(year_max_flow_list) - # std = statistics.stdev(year_max_flow_list, xbar=xbar) - return -math.log(-math.log(1 - (1 / rp))) * std * .7797 + xbar - (.45 * std) - - -def statistics_tables(corrected: pd.DataFrame, simulated: pd.DataFrame, observed: pd.DataFrame) -> pd.DataFrame: - # merge the datasets together - merged_sim_obs = hd.merge_data(sim_df=simulated, obs_df=observed) - merged_cor_obs = hd.merge_data(sim_df=corrected, obs_df=observed) - - metrics = ['ME', 'RMSE', 'NRMSE (Mean)', 'MAPE', 'NSE', 'KGE (2009)', 'KGE (2012)'] - # Merge Data - table1 = hs.make_table(merged_dataframe=merged_sim_obs, metrics=metrics) - table2 = hs.make_table(merged_dataframe=merged_cor_obs, metrics=metrics) - - table2 = table2.rename(index={'Full Time Series': 'Corrected Full Time Series'}) - table1 = table1.rename(index={'Full Time Series': 'Original Full Time Series'}) - table1 = table1.transpose() - table2 = table2.transpose() - - return pd.merge(table1, table2, right_index=True, left_index=True) - - -def compute_fdc(flows: np.array, steps: int = 500, exceed: bool = True, col_name: str = 'flow'): - percentiles = [round((1 / steps) * i * 100, 5) for i in range(steps + 1)] - flows = np.nanpercentile(flows, percentiles) - if exceed: - percentiles.reverse() - return pd.DataFrame(flows, columns=[col_name, ], index=percentiles) - - -def compute_scalar_fdc(sim_fdc: pd.DataFrame, obs_fdc: pd.DataFrame): - sim_fdc = compute_fdc(sim_fdc) - obs_fdc = compute_fdc(obs_fdc) - ratios = np.divide(sim_fdc['flow'].values.flatten(), 
obs_fdc['flow'].values.flatten()) - columns = (sim_fdc.columns[0], 'p_exceed', 'scalars') - scalars_df = pd.DataFrame( - np.transpose([sim_fdc.values[:, 0], sim_fdc['p_exceed'].values.flatten(), ratios]), - columns=columns - ) - scalars_df.replace(np.inf, np.nan, inplace=True) - scalars_df.dropna(inplace=True) - - return scalars_df From d43f38d80f657a2615ef4129ac433b4daf389fd6 Mon Sep 17 00:00:00 2001 From: rileyhales Date: Mon, 29 Aug 2022 15:24:21 -0600 Subject: [PATCH 23/24] rename for pypi --- setup.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/setup.py b/setup.py index 6ef3807..0453132 100644 --- a/setup.py +++ b/setup.py @@ -6,12 +6,12 @@ with open('requirements.txt', 'r') as req: install_requires = req.read().splitlines() -description = 'tools for hydrological bias correction on large models' +description = 'tools for bias correction on large hydrological models' version = '0.4.0' setup( - name='saber', - packages=['saber'], + name='hydrosaber', + packages=['saber', ], version=version, description=description, long_description=long_description, From 79e9df99b2cd37ab06b02f492f37589139f3a40c Mon Sep 17 00:00:00 2001 From: rileyhales Date: Mon, 29 Aug 2022 15:26:51 -0600 Subject: [PATCH 24/24] remove python from requirements.txt --- requirements.txt | 1 - 1 file changed, 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index 1418fb2..16193a7 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,4 +1,3 @@ -python=3 contextily geopandas hydrostats