From 6f776eb3a50009ef63c950848c3c5543cb661c86 Mon Sep 17 00:00:00 2001 From: grantbuster Date: Thu, 15 Feb 2024 10:36:53 -0700 Subject: [PATCH 1/4] added zero rate patch for precip data skill assessment --- sup3r/bias/bias_calc.py | 79 +++++++++++++++++++++++++++--- tests/bias/test_bias_correction.py | 22 ++++++++- 2 files changed, 92 insertions(+), 9 deletions(-) diff --git a/sup3r/bias/bias_calc.py b/sup3r/bias/bias_calc.py index 2ea899ac4..5ff8ad8c3 100644 --- a/sup3r/bias/bias_calc.py +++ b/sup3r/bias/bias_calc.py @@ -44,7 +44,8 @@ def __init__(self, bias_handler='DataHandlerNCforCC', base_handler_kwargs=None, bias_handler_kwargs=None, - decimals=None): + decimals=None, + match_zero_rate=False): """ Parameters ---------- @@ -94,6 +95,13 @@ class to be retrieved from the rex/sup3r library. If a decimals, this gets passed to np.around(). If decimals is negative, it specifies the number of positions to the left of the decimal point. + match_zero_rate : bool + Option to fix the frequency of zero values in the biased data. The + lowest percentile of values in the biased data will be set to zero + to match the percentile of zeros in the base data. This helps + resolve the issue where global climate models produce too many days + with small precipitation totals e.g., the "drizzle problem". + Ref: Polade et al., 2014 https://doi.org/10.1038/srep04364 """ logger.info('Initializing DataRetrievalBase for base dset "{}" ' @@ -110,6 +118,7 @@ class to be retrieved from the rex/sup3r library. If a self.bias_handler_kwargs = bias_handler_kwargs or {} self.bad_bias_gids = [] self._distance_upper_bound = distance_upper_bound + self.match_zero_rate = match_zero_rate if isinstance(self.base_fps, str): self.base_fps = sorted(glob(self.base_fps)) @@ -367,6 +376,7 @@ def get_bias_data(self, bias_gid): bias_data : np.ndarray 1D array of temporal data at the requested gid. """ + idx = np.where(self.bias_gid_raster == bias_gid) if self.bias_dh.data is None: self.bias_dh.load_cached_data() @@ -486,6 +496,46 @@ def get_base_data(cls, return out_data, out_ti + @staticmethod + def _match_zero_rate(bias_data, base_data): + """The lowest percentile of values in the biased data will be set to + zero to match the percentile of zeros in the base data. This helps + resolve the issue where global climate models produce too many days + with small precipitation totals e.g., the "drizzle problem". + Ref: Polade et al., 2014 https://doi.org/10.1038/srep04364 + + Parameters + ---------- + bias_data : np.ndarray + 1D array of biased data observations. + base_data : np.ndarray + 1D array of base data observations. + + Returns + ------- + bias_data : np.ndarray + 1D array of biased data observations. Values below the quantile + associated with zeros in base_data will be set to zero + """ + + q_zero_base_in = (base_data == 0).mean() + q_zero_bias_in = (bias_data == 0).mean() + + q_bias = np.linspace(0, 1, len(bias_data)) + min_value_bias = np.interp(q_zero_base_in, q_bias, sorted(bias_data)) + + bias_data[bias_data < min_value_bias] = 0 + + q_zero_base_out = (base_data == 0).mean() + q_zero_bias_out = (bias_data == 0).mean() + + logger.debug('Input bias/base zero rate is {:.3e}/{:.3e}, ' + 'output is {:.3e}/{:.3e}' + .format(q_zero_bias_in, q_zero_base_in, + q_zero_bias_out, q_zero_base_out)) + + return bias_data + @staticmethod def _read_base_sup3r_data(dh, base_dset, base_gid): """Read baseline data from a sup3r DataHandler @@ -727,7 +777,8 @@ def _run_single(cls, daily_reduction, bias_ti, decimals, - base_dh_inst=None): + base_dh_inst=None, + match_zero_rate=False): """Find the nominal scalar + adder combination to bias correct data at a single site""" @@ -739,6 +790,9 @@ def _run_single(cls, decimals=decimals, base_dh_inst=base_dh_inst) + if match_zero_rate: + cls._match_zero_rate(bias_data, base_data) + out = cls.get_linear_correction(bias_data, base_data, bias_feature, base_dset) return out @@ -931,6 +985,7 @@ def run(self, self.bias_ti, self.decimals, base_dh_inst=self.base_dh, + match_zero_rate=self.match_zero_rate, ) for key, arr in single_out.items(): self.out[key][raster_loc] = arr @@ -963,6 +1018,7 @@ def run(self, daily_reduction, self.bias_ti, self.decimals, + match_zero_rate=self.match_zero_rate, ) futures[future] = raster_loc @@ -1006,7 +1062,8 @@ def _run_single(cls, daily_reduction, bias_ti, decimals, - base_dh_inst=None): + base_dh_inst=None, + match_zero_rate=False): """Find the nominal scalar + adder combination to bias correct data at a single site""" @@ -1018,6 +1075,9 @@ def _run_single(cls, decimals=decimals, base_dh_inst=base_dh_inst) + if match_zero_rate: + cls._match_zero_rate(bias_data, base_data) + base_arr = np.full(cls.NT, np.nan, dtype=np.float32) out = {} @@ -1117,10 +1177,12 @@ def _init_out(self): f'bias_{self.bias_feature}_std', f'bias_{self.bias_feature}_skew', f'bias_{self.bias_feature}_kurtosis', + f'bias_{self.bias_feature}_zero_rate', f'base_{self.base_dset}_mean', f'base_{self.base_dset}_std', f'base_{self.base_dset}_skew', f'base_{self.base_dset}_kurtosis', + f'base_{self.base_dset}_zero_rate', ] self.out = {k: np.full((*self.bias_gid_raster.shape, self.NT), @@ -1154,11 +1216,13 @@ def _run_skill_eval(cls, bias_data, base_data, bias_feature, base_dset): out[f'bias_{bias_feature}_std'] = np.nanstd(bias_data) out[f'bias_{bias_feature}_skew'] = stats.skew(bias_data) out[f'bias_{bias_feature}_kurtosis'] = stats.kurtosis(bias_data) + out[f'bias_{bias_feature}_zero_rate'] = (bias_data == 0).mean() out[f'base_{base_dset}_mean'] = base_mean out[f'base_{base_dset}_std'] = np.nanstd(base_data) out[f'base_{base_dset}_skew'] = stats.skew(base_data) out[f'base_{base_dset}_kurtosis'] = stats.kurtosis(base_data) + out[f'base_{base_dset}_zero_rate'] = (base_data == 0).mean() ks_out = stats.ks_2samp(base_data - base_mean, bias_data - bias_mean) out[f'{bias_feature}_ks_stat'] = ks_out.statistic @@ -1175,7 +1239,7 @@ def _run_skill_eval(cls, bias_data, base_data, bias_feature, base_dset): @classmethod def _run_single(cls, bias_data, base_fps, bias_feature, base_dset, base_gid, base_handler, daily_reduction, bias_ti, - decimals, base_dh_inst=None): + decimals, base_dh_inst=None, match_zero_rate=False): """Do a skill assessment at a single site""" base_data, base_ti = cls.get_base_data(base_fps, base_dset, @@ -1184,13 +1248,14 @@ def _run_single(cls, bias_data, base_fps, bias_feature, base_dset, decimals=decimals, base_dh_inst=base_dh_inst) + if match_zero_rate: + cls._match_zero_rate(bias_data, base_data) + arr = np.full(cls.NT, np.nan, dtype=np.float32) out = {f'bias_{bias_feature}_mean_monthly': arr.copy(), f'bias_{bias_feature}_std_monthly': arr.copy(), f'base_{base_dset}_mean_monthly': arr.copy(), f'base_{base_dset}_std_monthly': arr.copy(), - f'{bias_feature}_scalar': arr.copy(), - f'{bias_feature}_adder': arr.copy(), } out.update(cls._run_skill_eval(bias_data, base_data, @@ -1208,6 +1273,6 @@ def _run_single(cls, bias_data, base_fps, bias_feature, base_dset, for k, v in mout.items(): if not k.endswith(('_scalar', '_adder')): k += '_monthly' - out[k][month - 1] = v + out[k][month - 1] = v return out diff --git a/tests/bias/test_bias_correction.py b/tests/bias/test_bias_correction.py index 997f955e9..bb4f46b8a 100644 --- a/tests/bias/test_bias_correction.py +++ b/tests/bias/test_bias_correction.py @@ -526,7 +526,25 @@ def test_nc_base_file(): out = calc.run(fill_extend=True, max_workers=1) - assert (out['rsds_scalar'] == 1).all() - assert (out['rsds_adder'] == 0).all() + assert np.allclose(out['base_rsds_mean_monthly'], + out['bias_rsds_mean_monthly']) assert np.allclose(out['base_rsds_mean'], out['bias_rsds_mean']) assert np.allclose(out['base_rsds_std'], out['bias_rsds_std']) + + +def test_match_zero_rate(): + """Test feature to match the rate of zeros in the bias data based on the + zero rate in the base data. Useful for precip where GCMs have a low-precip + "drizzle" problem.""" + bias_data = np.random.uniform(0, 1, 1000) + base_data = np.random.uniform(0, 1, 1000) + base_data[base_data < 0.1] = 0 + + skill = SkillAssessment._run_skill_eval(bias_data, base_data, 'f1', 'f1') + assert skill['bias_f1_zero_rate'] != skill['base_f1_zero_rate'] + assert (bias_data == 0).mean() != (base_data == 0).mean() + + bias_data = SkillAssessment._match_zero_rate(bias_data, base_data) + skill = SkillAssessment._run_skill_eval(bias_data, base_data, 'f1', 'f1') + assert (bias_data == 0).mean() == (base_data == 0).mean() + assert skill['bias_f1_zero_rate'] == skill['base_f1_zero_rate'] From a6813d6f147263bed6ac0b4f8928d740d3d78369 Mon Sep 17 00:00:00 2001 From: grantbuster Date: Thu, 15 Feb 2024 15:44:49 -0700 Subject: [PATCH 2/4] better test for precip zero rate --- sup3r/bias/bias_calc.py | 6 +++--- tests/bias/test_bias_correction.py | 19 +++++++++++++++++++ 2 files changed, 22 insertions(+), 3 deletions(-) diff --git a/sup3r/bias/bias_calc.py b/sup3r/bias/bias_calc.py index 5ff8ad8c3..446d6672c 100644 --- a/sup3r/bias/bias_calc.py +++ b/sup3r/bias/bias_calc.py @@ -791,7 +791,7 @@ def _run_single(cls, base_dh_inst=base_dh_inst) if match_zero_rate: - cls._match_zero_rate(bias_data, base_data) + bias_data = cls._match_zero_rate(bias_data, base_data) out = cls.get_linear_correction(bias_data, base_data, bias_feature, base_dset) @@ -1076,7 +1076,7 @@ def _run_single(cls, base_dh_inst=base_dh_inst) if match_zero_rate: - cls._match_zero_rate(bias_data, base_data) + bias_data = cls._match_zero_rate(bias_data, base_data) base_arr = np.full(cls.NT, np.nan, dtype=np.float32) out = {} @@ -1249,7 +1249,7 @@ def _run_single(cls, bias_data, base_fps, bias_feature, base_dset, base_dh_inst=base_dh_inst) if match_zero_rate: - cls._match_zero_rate(bias_data, base_data) + bias_data = cls._match_zero_rate(bias_data, base_data) arr = np.full(cls.NT, np.nan, dtype=np.float32) out = {f'bias_{bias_feature}_mean_monthly': arr.copy(), diff --git a/tests/bias/test_bias_correction.py b/tests/bias/test_bias_correction.py index bb4f46b8a..ea3d66854 100644 --- a/tests/bias/test_bias_correction.py +++ b/tests/bias/test_bias_correction.py @@ -1,6 +1,7 @@ # -*- coding: utf-8 -*- """pytests bias correction calculations""" import os +import shutil import tempfile import h5py @@ -548,3 +549,21 @@ def test_match_zero_rate(): skill = SkillAssessment._run_skill_eval(bias_data, base_data, 'f1', 'f1') assert (bias_data == 0).mean() == (base_data == 0).mean() assert skill['bias_f1_zero_rate'] == skill['base_f1_zero_rate'] + + with tempfile.TemporaryDirectory() as td: + fp_nsrdb_temp = os.path.join(td, os.path.basename(FP_NSRDB)) + shutil.copy(FP_NSRDB, fp_nsrdb_temp) + with h5py.File(fp_nsrdb_temp, 'a') as nsrdb_temp: + ghi = nsrdb_temp['ghi'][...] + ghi[:1000, :] = 0 + nsrdb_temp['ghi'][...] = ghi + calc = SkillAssessment(fp_nsrdb_temp, FP_CC, 'ghi', 'rsds', + target=TARGET, shape=SHAPE, + distance_upper_bound=0.7, + bias_handler='DataHandlerNCforCC', + match_zero_rate=True) + out = calc.run(fill_extend=True, max_workers=1) + + bias_rate = out['bias_rsds_zero_rate'] + base_rate = out['base_ghi_zero_rate'] + assert np.allclose(bias_rate, base_rate, rtol=0.005) From 03ef3b25e1db7cd9952a945438903cb9babebd53 Mon Sep 17 00:00:00 2001 From: grantbuster Date: Thu, 15 Feb 2024 16:41:09 -0700 Subject: [PATCH 3/4] fixed precip skill assessment - should not be mean centered when matching zero rate --- sup3r/bias/bias_calc.py | 39 ++++++++++++++++++++++++--------------- 1 file changed, 24 insertions(+), 15 deletions(-) diff --git a/sup3r/bias/bias_calc.py b/sup3r/bias/bias_calc.py index 446d6672c..144fefa33 100644 --- a/sup3r/bias/bias_calc.py +++ b/sup3r/bias/bias_calc.py @@ -98,9 +98,11 @@ class to be retrieved from the rex/sup3r library. If a match_zero_rate : bool Option to fix the frequency of zero values in the biased data. The lowest percentile of values in the biased data will be set to zero - to match the percentile of zeros in the base data. This helps - resolve the issue where global climate models produce too many days - with small precipitation totals e.g., the "drizzle problem". + to match the percentile of zeros in the base data. If + SkillAssessment is being run and this is True, the distributions + will not be mean-centered. This helps resolve the issue where + global climate models produce too many days with small + precipitation totals e.g., the "drizzle problem". Ref: Polade et al., 2014 https://doi.org/10.1038/srep04364 """ @@ -518,16 +520,16 @@ def _match_zero_rate(bias_data, base_data): associated with zeros in base_data will be set to zero """ - q_zero_base_in = (base_data == 0).mean() - q_zero_bias_in = (bias_data == 0).mean() + q_zero_base_in = np.nanmean(base_data == 0) + q_zero_bias_in = np.nanmean(bias_data == 0) q_bias = np.linspace(0, 1, len(bias_data)) min_value_bias = np.interp(q_zero_base_in, q_bias, sorted(bias_data)) bias_data[bias_data < min_value_bias] = 0 - q_zero_base_out = (base_data == 0).mean() - q_zero_bias_out = (bias_data == 0).mean() + q_zero_base_out = np.nanmean(base_data == 0) + q_zero_bias_out = np.nanmean(bias_data == 0) logger.debug('Input bias/base zero rate is {:.3e}/{:.3e}, ' 'output is {:.3e}/{:.3e}' @@ -1200,13 +1202,17 @@ def _init_out(self): self.out[bias_k] = arr.copy() @classmethod - def _run_skill_eval(cls, bias_data, base_data, bias_feature, base_dset): + def _run_skill_eval(cls, bias_data, base_data, bias_feature, base_dset, + match_zero_rate=False): """Run skill assessment metrics on 1D datasets at a single site. Note we run the KS test on the mean=0 distributions as per: S. Brands et al. 2013 https://doi.org/10.1007/s00382-013-1742-8 """ + if match_zero_rate: + bias_data = cls._match_zero_rate(bias_data, base_data) + out = {} bias_mean = np.nanmean(bias_data) base_mean = np.nanmean(base_data) @@ -1216,15 +1222,20 @@ def _run_skill_eval(cls, bias_data, base_data, bias_feature, base_dset): out[f'bias_{bias_feature}_std'] = np.nanstd(bias_data) out[f'bias_{bias_feature}_skew'] = stats.skew(bias_data) out[f'bias_{bias_feature}_kurtosis'] = stats.kurtosis(bias_data) - out[f'bias_{bias_feature}_zero_rate'] = (bias_data == 0).mean() + out[f'bias_{bias_feature}_zero_rate'] = np.nanmean(bias_data == 0) out[f'base_{base_dset}_mean'] = base_mean out[f'base_{base_dset}_std'] = np.nanstd(base_data) out[f'base_{base_dset}_skew'] = stats.skew(base_data) out[f'base_{base_dset}_kurtosis'] = stats.kurtosis(base_data) - out[f'base_{base_dset}_zero_rate'] = (base_data == 0).mean() + out[f'base_{base_dset}_zero_rate'] = np.nanmean(base_data == 0) + + if match_zero_rate: + ks_out = stats.ks_2samp(base_data, bias_data) + else: + ks_out = stats.ks_2samp(base_data - base_mean, + bias_data - bias_mean) - ks_out = stats.ks_2samp(base_data - base_mean, bias_data - bias_mean) out[f'{bias_feature}_ks_stat'] = ks_out.statistic out[f'{bias_feature}_ks_p'] = ks_out.pvalue @@ -1248,9 +1259,6 @@ def _run_single(cls, bias_data, base_fps, bias_feature, base_dset, decimals=decimals, base_dh_inst=base_dh_inst) - if match_zero_rate: - bias_data = cls._match_zero_rate(bias_data, base_data) - arr = np.full(cls.NT, np.nan, dtype=np.float32) out = {f'bias_{bias_feature}_mean_monthly': arr.copy(), f'bias_{bias_feature}_std_monthly': arr.copy(), @@ -1259,7 +1267,8 @@ def _run_single(cls, bias_data, base_fps, bias_feature, base_dset, } out.update(cls._run_skill_eval(bias_data, base_data, - bias_feature, base_dset)) + bias_feature, base_dset, + match_zero_rate=match_zero_rate)) for month in range(1, 13): bias_mask = bias_ti.month == month From 226b67fbbdfe276e96189fd54163c6956377872b Mon Sep 17 00:00:00 2001 From: grantbuster Date: Mon, 11 Mar 2024 15:35:19 -0600 Subject: [PATCH 4/4] one more sanity check --- tests/bias/test_bias_correction.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/tests/bias/test_bias_correction.py b/tests/bias/test_bias_correction.py index ea3d66854..76eae6642 100644 --- a/tests/bias/test_bias_correction.py +++ b/tests/bias/test_bias_correction.py @@ -545,10 +545,13 @@ def test_match_zero_rate(): assert skill['bias_f1_zero_rate'] != skill['base_f1_zero_rate'] assert (bias_data == 0).mean() != (base_data == 0).mean() - bias_data = SkillAssessment._match_zero_rate(bias_data, base_data) - skill = SkillAssessment._run_skill_eval(bias_data, base_data, 'f1', 'f1') + skill = SkillAssessment._run_skill_eval(bias_data, base_data, 'f1', 'f1', + match_zero_rate=True) assert (bias_data == 0).mean() == (base_data == 0).mean() assert skill['bias_f1_zero_rate'] == skill['base_f1_zero_rate'] + for p in (1, 5, 25, 50, 75, 95, 99): + assert np.allclose(skill[f'base_f1_percentile_{p}'], + np.percentile(base_data, p)) with tempfile.TemporaryDirectory() as td: fp_nsrdb_temp = os.path.join(td, os.path.basename(FP_NSRDB))