diff --git a/pyleoclim/core/coherence.py b/pyleoclim/core/coherence.py index 71bd079d..fa41041b 100644 --- a/pyleoclim/core/coherence.py +++ b/pyleoclim/core/coherence.py @@ -692,17 +692,17 @@ def signif_test(self, number=200, method='ar1sim', seed=None, qs=[0.95], setting coh.signif_test(method='phaseran').plot() ''' - + from ..core.surrogateseries import SurrogateSeries + if number == 0: return self new = self.copy() - surr1 = self.timeseries1.surrogates( - number=number, seed=seed, method=method, settings=settings - ) - surr2 = self.timeseries2.surrogates( - number=number, seed=seed, method=method, settings=settings - ) + + surr1 = SurrogateSeries(method=method,number=number, seed=seed) + surr1.from_series(self.timeseries1) + surr2 = SurrogateSeries(method=method,number=number, seed=seed) + surr2.from_series(self.timeseries2) # adjust time axis @@ -730,11 +730,6 @@ def signif_test(self, number=200, method='ar1sim', seed=None, qs=[0.95], setting wtc_qs = np.ndarray(shape=(nq, nf, nt)) xwt_qs = np.empty_like(wtc_qs) - # for i in range(nf): - # for j in range(nt): - # wtc_qs[:,i,j] = mquantiles(wtcs[:,i,j], qs) - # xwt_qs[:,i,j] = mquantiles(xwts[:,i,j], qs) - # extract quantiles and reshape wtc_qs = mquantiles(wtcs_r, qs, axis=0) wtc_qs = np.reshape(wtc_qs, (nq, nf, nt)) diff --git a/pyleoclim/core/ensembleseries.py b/pyleoclim/core/ensembleseries.py index 107a2181..062e8ceb 100644 --- a/pyleoclim/core/ensembleseries.py +++ b/pyleoclim/core/ensembleseries.py @@ -22,8 +22,6 @@ import matplotlib.transforms as transforms import matplotlib as mpl from tqdm import tqdm -#import warnings -#from scipy.stats.mstats import mquantiles class EnsembleSeries(MultipleSeries): ''' EnsembleSeries object @@ -524,6 +522,8 @@ def plot_traces(self, figsize=[10, 4], xlabel=None, ylabel=None, title=None, num ''' savefig_settings = {} if savefig_settings is None else savefig_settings.copy() lgd_kwargs = {} if lgd_kwargs is None else lgd_kwargs.copy() + + num_traces = min(num_traces, len(self.series_list)) # restrict to the smaller of the two # generate default axis labels time_label, value_label = self.make_labels() @@ -972,7 +972,7 @@ def histplot(self, figsize=[10, 4], title=None, savefig_settings=None, ax=None, ylabel='KDE', vertical=False, edgecolor='w', **plot_kwargs): """ Plots the distribution of the timeseries across ensembles - Reuses seaborn [histplot](https://seaborn.pydata.org/generated/seaborn.histplot.html) function. + Reuses the seaborn [histplot](https://seaborn.pydata.org/generated/seaborn.histplot.html) function. Parameters ---------- diff --git a/pyleoclim/core/psds.py b/pyleoclim/core/psds.py index c572bedd..2dcf217b 100644 --- a/pyleoclim/core/psds.py +++ b/pyleoclim/core/psds.py @@ -1,10 +1,6 @@ from ..utils import plotting from ..utils import wavelet as waveutils from ..utils import spectral as specutils -#from ..utils import tsbase - -#from ..core.multiplepsd import * -#import ..core.multiplepsd import matplotlib.pyplot as plt @@ -16,28 +12,6 @@ from matplotlib.ticker import ScalarFormatter, FormatStrFormatter - -# def infer_period_unit_from_time_unit(time_unit): -# ''' infer a period unit based on the given time unit - -# ''' -# if time_unit is None: -# period_unit = None -# else: -# unit_group = lipdutils.timeUnitsCheck(time_unit) -# if unit_group != 'unknown': -# if unit_group == 'kage_units': -# period_unit = 'kyrs' -# else: -# period_unit = 'yrs' -# else: -# if time_unit[-1] == 's': -# period_unit = time_unit -# else: -# period_unit = f'{time_unit}s' - -# return period_unit - def infer_period_unit_from_time_unit(time_unit): ''' infer a period unit based on the given time unit @@ -273,6 +247,7 @@ def signif_test(self, method='ar1sim', number=None, seed=None, qs=[0.95], pyleoclim.core.series.Series.wavelet : Performs wavelet analysis on Pyleoclim Series ''' + from ..core.surrogateseries import SurrogateSeries if self.spec_method == 'wwz' and method == 'ar1asym': raise ValueError('Asymptotic solution is not supported for the wwz method') @@ -300,10 +275,9 @@ def signif_test(self, method='ar1sim', number=None, seed=None, qs=[0.95], return self new = self.copy() - surr = self.timeseries.surrogates( - number=number, seed=seed, method=method, settings=settings - ) - + surr = SurrogateSeries(method=method,number=number, seed=seed) + surr.from_series(self.timeseries) + if signif_scals: surr_psd = surr.spectral( method=self.spec_method, settings=self.spec_args, scalogram_list=signif_scals diff --git a/pyleoclim/core/scalograms.py b/pyleoclim/core/scalograms.py index 0661ad34..2ccd1305 100644 --- a/pyleoclim/core/scalograms.py +++ b/pyleoclim/core/scalograms.py @@ -482,6 +482,7 @@ def signif_test(self, method='ar1sim', number=None, seed=None, qs=[0.95], scalogram = ts.wavelet().signif_test(number=2, export_scal=True) ''' + if self.wave_method == 'wwz' and method == 'ar1asym': raise ValueError('Asymptotic solution is not supported for the wwz method') @@ -490,6 +491,7 @@ def signif_test(self, method='ar1sim', number=None, seed=None, qs=[0.95], raise ValueError(f"The available methods are: {acceptable_methods}") if method in ['ar1sim','uar1'] : + from ..core.surrogateseries import SurrogateSeries if hasattr(self,'signif_scals'): signif_scals = self.signif_scals @@ -517,14 +519,14 @@ def signif_test(self, method='ar1sim', number=None, seed=None, qs=[0.95], number -= len(scalogram_list) surr_scal_tmp = [] surr_scal_tmp.extend(scalogram_list) - surr = self.timeseries.surrogates(number=number, seed=seed, - method=method, settings=settings) + surr = SurrogateSeries(method=method,number=number, seed=seed) + surr.from_series(self.timeseries) surr_scal_tmp.extend(surr.wavelet(method=self.wave_method, settings=self.wave_args).scalogram_list) surr_scal = MultipleScalogram(scalogram_list=surr_scal_tmp) else: - surr = self.timeseries.surrogates(number=number, seed=seed, - method=method, settings=settings) + surr = SurrogateSeries(method=method,number=number, seed=seed) + surr.from_series(self.timeseries) surr_scal = surr.wavelet(method=self.wave_method, settings=self.wave_args) if type(qs) is not list: diff --git a/pyleoclim/core/series.py b/pyleoclim/core/series.py index 3ddd3953..b5cb9d5f 100644 --- a/pyleoclim/core/series.py +++ b/pyleoclim/core/series.py @@ -24,9 +24,7 @@ from ..core.scalograms import Scalogram from ..core.coherence import Coherence from ..core.corr import Corr -from ..core.surrogateseries import SurrogateSeries from ..core.resolutions import Resolution -from ..core.surrogateseries import supported_surrogates import seaborn as sns import matplotlib.pyplot as plt @@ -3351,9 +3349,10 @@ def wavelet_coherence(self, target_series, method='cwt', settings=None, return coh - def correlation(self, target_series, alpha=0.05, statistic='pearsonr', method = 'phaseran', timespan=None, - settings=None, common_time_kwargs=None, seed=None, mute_pbar=False): - ''' Estimates the correlation and its associated significance between two time series (not ncessarily IID). + def correlation(self, target_series, alpha=0.05, statistic='pearsonr', method = 'phaseran', + number=1000, timespan=None, settings=None, seed=None, + common_time_kwargs=None, mute_pbar=False): + ''' Estimates the correlation and its associated significance between two time series (not ncessarily IID) as per [1] The significance of the correlation is assessed using one of the following methods: @@ -3386,8 +3385,10 @@ def correlation(self, target_series, alpha=0.05, statistic='pearsonr', method = method : str, {'ttest','built-in','ar1sim','phaseran'} method for significance testing. Default is 'phaseran' - 'ttest' implements the T-test with degrees of freedom adjusted for autocorrelation, as done in [1] - 'built-in' uses the p-value that ships with the SciPy function. + * 'ttest' implements the T-test with degrees of freedom adjusted for autocorrelation, as done in [1] + * 'built-in' uses the p-value that ships with the SciPy function. + * 'ar1sim' (formerly 'isopersistent') tests against an ensemble of AR(1) seires fitted to the originals + * 'phaseran' (formerly 'isospectral') tests against phase-randomized surrogates (aka the method of Ebisuzaki [2]) The old options 'isopersistent' and 'isospectral' still work, but trigger a deprecation warning. Note that 'weightedtau' does not have a known distribution, so the 'built-in' method returns an error in that case. @@ -3395,15 +3396,8 @@ def correlation(self, target_series, alpha=0.05, statistic='pearsonr', method = The time interval over which to perform the calculation settings : dict - Parameters for the correlation function, including: - - nsim : int - the number of simulations (default: 1000) - method : str, {'ttest','ar1sim','phaseran' (default)} - method for significance testing - surr_settings : dict - Parameters for surrogate generator. See individual methods for details. - + optional parameters for the measures of association. See https://docs.scipy.org/doc/scipy/reference/stats.html#association-correlation-tests + common_time_kwargs : dict Parameters for the method `MultipleSeries.common_time()`. Will use interpolation by default. @@ -3443,11 +3437,15 @@ def correlation(self, target_series, alpha=0.05, statistic='pearsonr', method = References ---------- [1] Hu, J., J. Emile-Geay, and J. Partin (2017), Correlation-based interpretations of paleoclimate data – where statistics meet past climates, Earth and Planetary Science Letters, 459, 362–371, doi:10.1016/j.epsl.2016.11.048. - + + [2] Ebisuzaki, W. (1997), A method to estimate the statistical significance of a correla- tion when the data are serially correlated, Journal of Climate, 10(9), 2147–2153, doi:10.1175/1520- 0442(1997)010¡2147:AMTETS¿2.0.CO;2. + Examples -------- - Correlation between the Nino3.4 index and the Deasonalized All Indian Rainfall Index + Let us compute the correlation between the Nino3.4 index and the Deasonalized All Indian Rainfall Index. + For expendiency, we limit the Monte Carlo tests to 20 surrogates, which is not sufficient to obtain accurate p-values. + The default number, 1000, is more respectable, though to avoid p-hacking, we recommend bumping it even higher if at all possible. .. jupyter-execute:: @@ -3455,38 +3453,41 @@ def correlation(self, target_series, alpha=0.05, statistic='pearsonr', method = ts_air = pyleo.utils.load_dataset('AIR') ts_nino = pyleo.utils.load_dataset('NINO3') - # with `nsim=20` and default `method='phaseran'` - # set an arbitrary random seed to fix the result - corr_res = ts_nino.correlation(ts_air, settings={'nsim': 20}, seed=2333) - print(corr_res) - - # changing the statistic - corr_res = ts_nino.correlation(ts_air, statistic='kendalltau') + # with 20 surrogates and the default significance method, 'phaseran'` + corr_res = ts_nino.correlation(ts_air, number=20) print(corr_res) - # using a simple t-test with DOFs adjusted for autocorrelation - # set an arbitrary random seed to fix the result + # using a simple t-test with DOFs adjusted for autocorrelation (only with Person's r') corr_res = ts_nino.correlation(ts_air, method='ttest') print(corr_res) # using "isopersistent" surrogates (AR(1) simulation) - # set an arbitrary random seed to fix the result - corr_res = ts_nino.correlation(ts_air, method = 'ar1sim', settings={'nsim': 20}, seed=2333) + # and setting an arbitrary random seed to fix the result + corr_res = ts_nino.correlation(ts_air, method = 'ar1sim', number=20, seed=2333) print(corr_res) + + # changing the statistic works with all surrogate-based significance methods (but not the 'ttest' method, which only applies to a 'pearsonr' statistic) + corr_res = ts_nino.correlation(ts_air, statistic='spearmanr', method = 'phaseran', number=20) + print(corr_res) + + # To use the built-in signficance test: + corr_res2s = ts_nino.correlation(ts_air, statistic='spearmanr', method = 'built-in') + # To modify the method, use `settings`. For instance, to specify a 1-sided test instead of a 2-sided one for Spearman's R: + corr_res1s = ts_nino.correlation(ts_air, statistic='spearmanr', method = 'built-in', settings={'alternative':'less'}) + print(corr_res1s.p-corr_res2s.p) # shows the difference in p-values ''' + from ..core.surrogateseries import SurrogateSeries, supported_surrogates if method == 'isospectral': - warnings.warn("isospectral is deprecated and was replaced by 'phaseran'", + warnings.warn("isospectral is now 'phaseran'", DeprecationWarning, stacklevel=2) method = 'phaseran' elif method == 'isopersistent': - warnings.warn("isopersistent is deprecated and was replaced by 'ar1sim'", + warnings.warn("isopersistent is now 'ar1sim'", DeprecationWarning, stacklevel=2) method = 'ar1sim' settings = {} if settings is None else settings.copy() - corr_args = {'alpha': alpha, 'method': method} - corr_args.update(settings) ms = MultipleSeries([self, target_series]) if list(self.time) != list(target_series.time): @@ -3506,33 +3507,40 @@ def correlation(self, target_series, alpha=0.05, statistic='pearsonr', method = np.random.seed(seed) if method == 'ttest': - stat, signf, pval = corrutils.corr_ttest(ts0.value, ts1.value, alpha=alpha) - signif = bool(signf) + if statistic == 'pearsonr': + stat, signf, pval = corrutils.corr_ttest(ts0.value, ts1.value, alpha=alpha) + signif = bool(signf) + else: + raise ValueError(f"The adjusted T-test only applies to Pearson's r; got {statistic}") + elif method == 'built-in': if statistic == 'weightedtau': raise ValueError('The null distribution of this statistic is unknown; please use a non-parametric method to obtain the p-value') else: - res = corrutils.association(ts0.value,ts1.value,statistic) + res = corrutils.association(ts0.value,ts1.value,statistic, settings) stat = res[0] pval = res.pvalue if len(res) > 1 else np.nan signif = pval <= alpha - elif method in supported_surrogates: - number = corr_args['nsim'] if 'nsim' in corr_args.keys() else 1000 - seed = corr_args['seed'] if 'seed' in corr_args.keys() else None - #method = corr_args['method'] if 'method' in corr_args.keys() else None - surr_settings = corr_args['surr_settings'] if 'surr_settings' in corr_args.keys() else None - + elif method in supported_surrogates: + if 'nsim' in settings.keys(): # for legacy reasons + raise DeprecationWarning("The number of simulations is now governed by the parameter `number`. nsim will be removed in an upcoming release") + number = settings['nsim'] + settings.pop('nsim') + else: + number = number + # compute correlation statistic - stat = corrutils.association(ts0.value,ts1.value,statistic)[0] - - ts0_surr = ts0.surrogates(number=number, seed=seed, - method=method, settings=surr_settings) - ts1_surr = ts1.surrogates(number=number, seed=seed, - method=method, settings=surr_settings) + stat = corrutils.association(ts0.value,ts1.value,statistic,settings)[0] + + # establish significance against surrogates stat_surr = np.empty((number)) + ts0_surr = SurrogateSeries(method=method,number=number) + ts0_surr.from_series(ts0) + ts1_surr = SurrogateSeries(method=method,number=number) + ts1_surr.from_series(ts1) for i in tqdm(range(number), desc='Evaluating association on surrogate pairs', total=number, disable=mute_pbar): stat_surr[i] = corrutils.association(ts0_surr.series_list[i].value, - ts1_surr.series_list[i].value,statistic)[0] + ts1_surr.series_list[i].value,statistic, settings)[0] # obtain p-value pval = np.sum(np.abs(stat_surr) >= np.abs(stat)) / number # establish significance @@ -3642,135 +3650,119 @@ def causality(self, target_series, method='liang', timespan=None, settings=None, causal_res = spec_func[method](value1, value2, **args[method]) return causal_res - def surrogates(self, method='ar1sim', number=1, time_pattern='match', - length=None, seed=None, settings=None): - ''' Generate surrogates of the Series object according to "method" + # def surrogates(self, method='ar1sim', number=1, time_pattern='match', + # length=None, seed=None, settings=None): + # ''' Generate surrogates of the Series object according to "method" - For now, assumes uniform spacing and increasing time axis + # For now, assumes uniform spacing and increasing time axis - Parameters - ---------- - - method : {ar1sim, phaseran, uar1} - The method used to generate surrogates of the timeseries - - Note that phaseran assumes an odd number of samples. If the series - has even length, the last point is dropped to satisfy this requirement - - number : int - The number of surrogates to generate - - time_pattern : str {match, even, random} - The pattern used to generate the surrogate time axes - 'match' uses the same pattern as the original Series - 'even' uses an evenly-spaced time with spacing delta_t specified in settings (will return error if not specified) - 'random' uses random_time_index() with specified distribution and parameters (default: 'exponential' with parameter 1) - - length : int - Length of the series - - seed : int - Control seed option for reproducibility + # Parameters + # ---------- - settings : dict - Parameters for surrogate generator. See individual methods for details. + - Returns - ------- - surr : SurrogateSeries + # time_pattern : str {match, even, random} + # The pattern used to generate the surrogate time axes + # 'match' uses the same pattern as the original Series + # 'even' uses an evenly-spaced time with spacing delta_t specified in settings (will return error if not specified) + # 'random' uses random_time_index() with specified distribution and parameters (default: 'exponential' with parameter 1) - See also - -------- + # length : int + # Length of the series - pyleoclim.utils.tsmodel.ar1_sim : AR(1) simulator - pyleoclim.utils.tsmodel.uar1_sim : maximum likelihood AR(1) simulator - pyleoclim.utils.tsutils.phaseran2 : phase randomization - pyleoclim.utils.tsutils.random_time_index : random time index vector according to a specific probability model + # seed : int + # Control seed option for reproducibility - ''' - settings = {} if settings is None else settings.copy() + # settings : dict + # Parameters for surrogate generator. See individual methods for details. - if seed is not None: - np.random.seed(seed) + # Returns + # ------- + # surr : SurrogateSeries - if length is None: - n = len(self.value) - else: - n = length - - # generate time axes according to provided pattern - if time_pattern == "match": - times = np.tile(self.time, (number, 1)).T - elif time_pattern == "even": - if "time_increment" not in settings: - warnings.warn("'time_increment' not found in the dictionary, default set to 1.",stacklevel=2) - time_increment = 1 - else: - time_increment = settings["time_increment"] - - t = np.cumsum([time_increment]*n) - times = np.tile(t, (number, 1)).T - elif time_pattern == "random": - times = np.zeros((n, number)) - for i in range(number): - times[:, i] = tsmodel.random_time_index(n = n, **settings) # TODO: check that this does not break when unexpected keywords are passed in `settings` - else: - raise ValueError(f"Unknown time pattern: {time_pattern}") + # See also + # -------- - # apply surrogate method - if method == 'ar1sim': - if time_pattern != 'match': - raise ValueError('Only a matching time pattern is supported with this method') - else: - y_surr = tsmodel.ar1_sim(self.value, number, self.time) # CHECK: how does this handle the new time? + # pyleoclim.utils.tsmodel.ar1_sim : AR(1) simulator + # pyleoclim.utils.tsmodel.uar1_sim : maximum likelihood AR(1) simulator + # pyleoclim.utils.tsutils.phaseran2 : phase randomization + # pyleoclim.utils.tsutils.random_time_index : random time index vector according to a specific probability model - elif method == 'phaseran': - if self.is_evenly_spaced() and time_pattern != "random": - y_surr = tsutils.phaseran2(self.value, number) - else: - raise ValueError("Phase-randomization presently requires evenly-spaced series.") - - elif method == 'uar1': - # estimate theta with MLE - theta_hat = tsmodel.uar1_fit(self.value, self.time) - # generate surrogates - y_surr = np.empty_like(times) - for j in range(number): - y_surr[:,j] = tsmodel.uar1_sim(t = times[:,j], - tau_0=theta_hat[0],sigma_2_0=theta_hat[1]) - - # elif method == 'power-law': - # # TODO : implement Stochastic - # elif method == 'fBm': - # # TODO : implement Stochastic - - - # THIS SHOULD BE UNNECESSARY - # # reshape - # if len(np.shape(y_surr)) == 1: - # y_surr = y_surr[:, np.newaxis] - - # if len(np.shape(times)) == 1: - # times = times[:, np.newaxis] - - # wrap it all up with a bow - s_list = [] - for i, (t, y) in enumerate(zip(times.T,y_surr.T)): - ts = Series(time=t, value=y, - time_name=self.time_name, - time_unit=self.time_unit, - value_name=self.value_name, - value_unit=self.value_unit, - label = str(self.label or '') + " surr #" + str(i+1), - verbose=False, auto_time_params=True) - s_list.append(ts) - - surr = SurrogateSeries(series_list=s_list, - label = self.label, - surrogate_method=method, - surrogate_args=settings) - - return surr + # ''' + # settings = {} if settings is None else settings.copy() + + # if seed is not None: + # np.random.seed(seed) + + # if length is None: + # n = len(self.value) + # else: + # n = length + + # # generate time axes according to provided pattern + # if time_pattern == "match": + # times = np.tile(self.time, (number, 1)).T + # elif time_pattern == "even": + # if "time_increment" not in settings: + # warnings.warn("'time_increment' not found in the dictionary, default set to 1.",stacklevel=2) + # time_increment = np.median(np.diff(self.time)) + # else: + # time_increment = settings["time_increment"] + + # t = np.cumsum([time_increment]*n) + # times = np.tile(t, (number, 1)).T + # elif time_pattern == "random": + # times = np.zeros((n, number)) + # for i in range(number): + # times[:, i] = tsmodel.random_time_index(n = n, **settings) # TODO: check that this does not break when unexpected keywords are passed in `settings` + # else: + # raise ValueError(f"Unknown time pattern: {time_pattern}") + + # # apply surrogate method + # if method == 'ar1sim': + # if time_pattern != 'match': + # raise ValueError('Only a matching time pattern is supported with this method') + # else: + # y_surr = tsmodel.ar1_sim(self.value, number, self.time) # CHECK: how does this handle the new time? + + # elif method == 'phaseran': + # if self.is_evenly_spaced() and time_pattern != "random": + # y_surr = tsutils.phaseran2(self.value, number) + # else: + # raise ValueError("Phase-randomization presently requires evenly-spaced series.") + + # elif method == 'uar1': + # # estimate theta with MLE + # theta_hat = tsmodel.uar1_fit(self.value, self.time) + # # generate surrogates + # y_surr = np.empty_like(times) + # for j in range(number): + # y_surr[:,j] = tsmodel.uar1_sim(t = times[:,j], + # tau_0=theta_hat[0],sigma_2_0=theta_hat[1]) + + # # elif method == 'power-law': + # # # TODO : implement Stochastic + # # elif method == 'fBm': + # # # TODO : implement Stochastic + + # # wrap it all up with a bow + # s_list = [] + # for i, (t, y) in enumerate(zip(times.T,y_surr.T)): + # ts = Series(time=t, value=y, + # time_name=self.time_name, + # time_unit=self.time_unit, + # value_name=self.value_name, + # value_unit=self.value_unit, + # label = str(self.label or '') + " surr #" + str(i+1), + # verbose=False, auto_time_params=True) + # s_list.append(ts) + + # surr = SurrogateSeries(series_list=s_list, + # label = self.label, + # surrogate_method=method, + # surrogate_args=settings) + + # return surr def outliers(self,method='kmeans',remove=True, settings=None, fig_outliers=True, figsize_outliers=[10,4], plotoutliers_kwargs=None, savefigoutliers_settings=None, diff --git a/pyleoclim/core/surrogateseries.py b/pyleoclim/core/surrogateseries.py index 1fe4c32c..abc66ede 100644 --- a/pyleoclim/core/surrogateseries.py +++ b/pyleoclim/core/surrogateseries.py @@ -1,34 +1,246 @@ #!/usr/bin/env python3 # -*- coding: utf-8 -*- """ -SurrogateSeries is a child of MultipleSeries, designed for Monte Carlo tests +SurrogateSeries is a child of EnsembleSeries, designed for parametric and non-parametric Monte Carlo tests """ -from ..core.multipleseries import MultipleSeries +from ..core.ensembleseries import EnsembleSeries +from ..core.series import Series +from ..utils import tsutils, tsmodel + +import numpy as np +import warnings supported_surrogates = frozenset(['ar1sim','phaseran', 'uar1']) # broadcast all supported surrogates as global variable, for exception handling -class SurrogateSeries(MultipleSeries): - ''' Object containing surrogate timeseries, usually obtained through recursive modeling (e.g., AR(1)) +class SurrogateSeries(EnsembleSeries): + ''' Object containing surrogate timeseries, obtained either by emulating an + existing series (see `from_series()`) or from a parametric model (see `from_param()`) - Surrogate Series is a child of MultipleSeries. All methods available for MultipleSeries are available for surrogate series. - EnsembleSeries would be a more logical choice, but it creates circular imports that break the package. + Surrogate Series is a child of EnsembleSeries. All methods available for EnsembleSeries are available for surrogate series. + ''' - def __init__(self, series_list, label, surrogate_method=None, surrogate_args=None): + def __init__(self, series_list=[], number=1, method=None, label=None, param=None, seed=None): + ''' + Initiatlize a SurrogateSeries object. The parameters below are then used for series generation and labeling. + + Parameters + ---------- + series_list : list + a list of pyleoclim.Series objects. The default is []. + + number : int + The number of surrogates to generate. The default is 1. + + method : str {ar1sim, phaseran, uar1} + The name of the method used to generate surrogates of the timeseries (no default) + + param : list + a list of parameters for the method in question. Ignored if the method does not require parameters. + + label : str + label of the collection of timeseries (e.g. 'SOI surrogates [AR(1) MLE]') + If not provided, defaults to "series surrogates [{method}]" + + Raises + ------ + ValueError + errors out if an unknown method is specified + + ''' self.series_list = series_list - self.surrogate_method = surrogate_method - self.surrogate_args = surrogate_args + self.method = method + self.param = param + self.number = number + self.seed = seed + if param is not None: + nparam = len(param) + if 'ar1' in method and nparam !=2: + raise ValueError("2 parameters are needed for this model") + elif method == 'phaseran': + warnings.warn("Phase-randomization is a non parametric method; the provided parameters will be ignored") + elif method in ['fBm','CN'] and nparam >1: + raise ValueError(f"1 parameter is needed for this model; only the first of the provided {nparam} will be used") + # refine the display name - if surrogate_method == 'ar1sim': - self.label = str(label or "series") + " surrogates [AR(1)]" - elif surrogate_method == 'phaseran': - self.label = str(label or "series") + " surrogates [phase-randomized]" - elif surrogate_method == 'uar1': - self.label = str(label or "series") + " surrogates [uar1]" + if label is None: + if method == 'ar1sim': + self.label = "AR(1) surrogates (MoM)" + elif method == 'phaseran': + self.label = "phase-randomized surrogates" + elif method == 'uar1': + self.label = "AR(1) surrogates (MLE)" + else: + raise ValueError(f"Unknown method: {self.method}. Please use one of {supported_surrogates}") + + def from_series(self, target_series): + ''' + Fashion the SurrogateSeries object after a target series + + Parameters + ---------- + target_series : Series object + target Series used to infer surrogate properties + + seed : int + Control random seed option for reproducibility + + Returns + ------- + surr : SurrogateSeries + + See also + -------- + + pyleoclim.utils.tsmodel.ar1_sim : AR(1) simulator + pyleoclim.utils.tsmodel.uar1_sim : maximum likelihood AR(1) simulator + pyleoclim.utils.tsutils.phaseran2 : phase randomization + + Examples + -------- + + SOI = pyleo.utils.load_dataset('SOI') + SOI_surr = pyleo.SurrogateSeries(method='phaseran', number=4) + SOI_surr.from_series(SOI) + fig, ax = SOI_surr.plot_traces() + SOI.plot(ax=ax,color='black',linewidth=1, ylim=[-6,3]) + ax.legend(loc='lower center', bbox_to_anchor=(0.5, 0), ncol=2) + ax.set_title(SOI_surr.label) + + ''' + #settings = {} if settings is None else settings.copy() + + if not isinstance(target_series, Series): + raise TypeError(f"Expected pyleo.Series, got: {type(target_series)}") + + if self.seed is not None: + np.random.default_rng(self.seed) + + # apply surrogate method + if self.method == 'ar1sim': + y_surr = tsmodel.ar1_sim(target_series.value, self.number, target_series.time) + + elif self.method == 'phaseran': + if target_series.is_evenly_spaced(): + y_surr = tsutils.phaseran2(target_series.value, self.number) + else: + raise ValueError("Phase-randomization presently requires evenly-spaced series.") + + elif self.method == 'uar1': + # estimate theta with MLE + tau, sigma_2 = tsmodel.uar1_fit(target_series.value, target_series.time) + self.param = [tau, sigma_2] # assign parameters for future use + # generate time axes according to provided pattern + times = np.squeeze(np.tile(target_series.time, (self.number, 1)).T) + # generate matrix + y_surr = tsmodel.uar1_sim(t = times, tau=tau, sigma_2=sigma_2) + + if self.number > 1: + s_list = [] + for i, y in enumerate(y_surr.T): + ts = target_series.copy() # copy Series + ts.value = y # replace value with y_surr column + ts.label = str(target_series.label or '') + " surr #" + str(i+1) + s_list.append(ts) else: - raise ValueError('Surrogate method should either be "ar1sim", "phaseran" or "uar1"') + ts = target_series.copy() # copy Series + ts.value = y_surr # replace value with y_surr column + ts.label = str(target_series.label or '') + " surr" + s_list = [ts] + + self.series_list = s_list + + def from_param(self, param, length=50, time_pattern = 'even', settings=None): + ''' + Simulate the SurrogateSeries from a parametric model + + Parameters + ---------- + param : list + model parameters (e.g. [tau, sigma0] for an AR(1) model) + length : int + Length of the series. Default: 50 + + time_pattern : str {even, random, specified} + The pattern used to generate the surrogate time axes + * 'even' uses an evenly-spaced time with spacing `delta_t` specified in settings (if not specified, defaults to 1.0) + * 'random' uses random_time_index() with specified distribution and parameters + Relevant settings are `delta_t_dist` and `param`. (default: 'exponential' with parameter = 1.0) + * 'specified': uses time axis `time` from `settings`. + + seed : int + Control random seed option for reproducibility + + settings : dict + Parameters for surrogate generator. See individual methods for details. + + Returns + ------- + surr : SurrogateSeries + + See also + -------- + + pyleoclim.utils.tsmodel.ar1_sim : AR(1) simulator + pyleoclim.utils.tsmodel.uar1_sim : maximum likelihood AR(1) simulator + pyleoclim.utils.tsmodel.random_time_index : Generate time increment vector according to a specific probability model + + Examples + -------- + ar1 = pyleo.SurrogateSeries(method='ar1sim', number=10) + ar1.from_param(length=100, param = [2,2]) + ar1.plot_envelope(title=rf'AR(1) synthetic series ($\tau={2},\sigma^2={2}$)') + + ''' + param = list(param) if param is not None else [] # coerce param into a list, no matter the original format + nparam = len(param) + + settings = {} if settings is None else settings.copy() + + if self.seed is not None: + np.random.seed(self.seed) + + # generate time axes according to provided pattern + if time_pattern == "even": + delta_t = settings["delta_t"] if "delta_t" in settings else 1.0 + t = np.cumsum([float(delta_t)]*length) + times = np.tile(t, (self.number, 1)).T + elif time_pattern == "random": + times = np.zeros((length, self.number)) + for i in range(self.number): + dist_name = settings['delta_t_dist'] if "delta_t_dist" in settings else "exponential" + dist_param = settings['param'] if "param" in settings else [1] + times[:, i] = tsmodel.random_time_index(length, dist_name,dist_param) + elif time_pattern == 'specified': + if "time" not in settings: + raise ValueError("'time' not found in settings") + else: + times = np.tile(settings["time"], (self.number, 1)).T + else: + raise ValueError(f"Unknown time pattern: {time_pattern}") - + # apply surrogate method + y_surr = np.empty_like(times) + + if self.method in ['uar1','ar1sim']: # + if nparam<2: + warnings.warn(f'The AR(1) model needs 2 parameters, tau and sigma2 (in that order); {nparam} provided. default values used, tau=5, sigma=0.5',UserWarning, stacklevel=2) + param = [5,0.5] + y_surr = tsmodel.uar1_sim(t = times, tau=param[0], sigma_2=param[1]) + + elif self.method == 'phaseran': + raise ValueError("Phase-randomization is only available in from_series().") + + # create the series_list + s_list = [] + for i, (t, y) in enumerate(zip(times.T,y_surr.T)): + ts = Series(time=t, value=y, + label = str(self.label or '') + " surr #" + str(i+1), + verbose=False, auto_time_params=True) + s_list.append(ts) + + self.series_list = s_list + diff --git a/pyleoclim/tests/test_core_PSD.py b/pyleoclim/tests/test_core_PSD.py index 8cc4cd64..3f2965d2 100644 --- a/pyleoclim/tests/test_core_PSD.py +++ b/pyleoclim/tests/test_core_PSD.py @@ -15,23 +15,15 @@ import pytest import pyleoclim as pyleo -def gen_ts(model='colored_noise',alpha=1, nt=100, f0=None, m=None, seed=None): - 'wrapper for gen_ts in pyleoclim' - - t,v = pyleo.utils.gen_ts(model=model,alpha=alpha, nt=nt, f0=f0, m=m, seed=seed) - ts=pyleo.Series(t,v, auto_time_params=True,verbose=False) - return ts - - # Tests below class TestUiPsdPlot: ''' Tests for PSD.plot() ''' - def test_plot_t0(self): + def test_plot_t0(self, gen_ts): ''' Test PSD.plot() with default parameters ''' - ts = gen_ts(nt=500) + ts = gen_ts psd = ts.spectral(method='mtm') fig, ax = psd.plot() pyleo.closefig(fig) @@ -40,10 +32,10 @@ class TestUiPsdSignifTest: ''' Tests for PSD.signif_test() ''' @pytest.mark.parametrize('method',['ar1sim','uar1','ar1asym']) - def test_signif_test_t0(self,method): + def test_signif_test_t0(self,method,gen_ts): ''' Test PSD.signif_test() with default parameters ''' - ts = gen_ts(nt=500) + ts = gen_ts psd = ts.spectral(method='mtm') psd_signif = psd.signif_test(number=10,method=method) fig, ax = psd_signif.plot() diff --git a/pyleoclim/tests/test_core_Series.py b/pyleoclim/tests/test_core_Series.py index a3c858b7..32a32ff5 100644 --- a/pyleoclim/tests/test_core_Series.py +++ b/pyleoclim/tests/test_core_Series.py @@ -25,14 +25,10 @@ import pathlib test_dirpath = pathlib.Path(__file__).parent.absolute() -#from urllib.request import urlopen import pyleoclim as pyleo -import pyleoclim.utils.tsmodel as tsmodel import pyleoclim.utils.tsbase as tsbase -from statsmodels.tsa.arima_process import arma_generate_sample -from scipy.stats import expon # a collection of useful functions @@ -537,66 +533,7 @@ def test_invalid(self): ts.sel(time=1, value=1) -class TestUISeriesSurrogates: - ''' Test Series.surrogates() - ''' - def test_surrogates_t0(self, p=2, eps=0.2): - ''' Generate AR(1) surrogates based on a AR(1) series with certain parameters, - and then evaluate and assert the parameters of the surrogates are correct - ''' - g = 0.5 # persistence - ar = [1, -g] - ma = [1, 0] - n = 1000 - ar1 = arma_generate_sample(ar, ma, nsample=n, scale=1) - ts = pyleo.Series(time=np.arange(1000), value=ar1) - - ts_surrs = ts.surrogates(number=p) - for ts_surr in ts_surrs.series_list: - g_surr = tsmodel.ar1_fit(ts_surr.value) - assert np.abs(g_surr-g) < eps - - @pytest.mark.parametrize('number',[1,5]) - def test_surrogates_uar1_match(self, number): - ts = gen_ts(nt=550, alpha=1.0) - # generate surrogates - surr = ts.surrogates(method = 'uar1', number = number, time_pattern ="match") - for i in range(number): - assert(np.allclose(surr.series_list[i].time, ts.time)) - - def test_surrogates_uar1_even(self, p=5): - ts = gen_ts(nt=550, alpha=1.0) - time_incr = np.median(np.diff(ts.time)) - # generate surrogates - surr = ts.surrogates(method = 'uar1', number = p, time_pattern ="even", settings={"time_increment" :time_incr}) - for i in range(p): - assert(np.allclose(tsmodel.inverse_cumsum(surr.series_list[i].time),time_incr)) - - def test_surrogates_uar1_random(self, p=5, tol = 0.5): - tau = 2 - sigma_2 = 1 - n = 500 - # generate time index - t = np.arange(1,(n+1)) - # create time series - ys = tsmodel.uar1_sim(t, tau_0=tau, sigma_2_0=sigma_2) - ts = pyleo.Series(time = t, value=ys, auto_time_params=True,verbose=False) - # generate surrogates default is exponential with parameter value 1 - surr = ts.surrogates(method = 'uar1', number = p, time_pattern ="random") - #surr = ts.surrogates(method = 'uar1', number = p, time_pattern ="uneven",settings={"delta_t_dist" :"poisson","param":[1]} ) - - for i in range(p): - delta_t = tsmodel.inverse_cumsum(surr.series_list[i].time) - # Compute the empirical cumulative distribution function (CDF) of the generated data - empirical_cdf, bins = np.histogram(delta_t, bins=100, density=True) - empirical_cdf = np.cumsum(empirical_cdf) * np.diff(bins) - # Compute the theoretical CDF of the Exponential distribution - theoretical_cdf = expon.cdf(bins[1:], scale=1) - # Trim theoretical_cdf to match the size of empirical_cdf - theoretical_cdf = theoretical_cdf[:len(empirical_cdf)] - # Compute the L2 norm (Euclidean distance) between empirical and theoretical CDFs - l2_norm = np.linalg.norm(empirical_cdf - theoretical_cdf) - assert(l2_norm0) - - - @pytest.mark.parametrize(('p', 'evenly_spaced'), [(1, True), (10, True), (1, False), (10, False)]) -def test_uar1_fit(p, evenly_spaced): +def test_uar1_fit(p, evenly_spaced, tol = 0.45): ''' Tests whether this method works well on an AR(1) process with known parameters and evenly spaced time points ''' # define tolerance - tol = .4 tau = 2 sigma_2 = 1 n = 500 + np.random.seed(108) if evenly_spaced: t_arr = np.tile(range(1,n), (p, 1)).T @@ -62,7 +59,7 @@ def test_uar1_fit(p, evenly_spaced): theta_hat_matrix = np.empty((p, 2)) # estimate parameters for each time series for j in range(p): - ys = tsmodel.uar1_sim(t = t_arr[:, j], tau_0 = tau, sigma_2_0=sigma_2) + ys = tsmodel.uar1_sim(t = t_arr[:, j], tau = tau, sigma_2=sigma_2) theta_hat_matrix[j,:] = tsmodel.uar1_fit(ys, t_arr[:, j]) # compute mean of estimated param for each simulate ts theta_hat_bar = np.mean(theta_hat_matrix, axis=0) diff --git a/pyleoclim/utils/correlation.py b/pyleoclim/utils/correlation.py index cdf045f4..9fba5874 100644 --- a/pyleoclim/utils/correlation.py +++ b/pyleoclim/utils/correlation.py @@ -709,6 +709,7 @@ def association(y1, y2, statistic='pearsonr',settings=None): acceptable_methods = ['linregress','pearsonr','spearmanr','pointbiserialr','kendalltau','weightedtau'] if statistic in acceptable_methods: func = getattr(stats, statistic) + print(args) res = func(y1,y2,**args) else: raise ValueError(f'Wrong statistic: {statistic}; acceptable choices are {acceptable_methods}') diff --git a/pyleoclim/utils/tsmodel.py b/pyleoclim/utils/tsmodel.py index d003483d..27fe171a 100644 --- a/pyleoclim/utils/tsmodel.py +++ b/pyleoclim/utils/tsmodel.py @@ -14,7 +14,7 @@ is_evenly_spaced ) from scipy import optimize -from scipy.optimize import minimize # for MLE estimation of tau_0 +from scipy.optimize import minimize # for MLE estimation of tau __all__ = [ @@ -151,7 +151,7 @@ def ar1_sim(y, p, t=None): n = np.size(y) ysim = np.empty(shape=(n, p)) # declare array - sig = np.std(y) + sig = np.std(y) # Not MLE estimate in any case if is_evenly_spaced(t): g = ar1_fit_evenly(y) @@ -179,7 +179,7 @@ def ar1_sim(y, p, t=None): def gen_ar1_evenly(t, g, scale=1, burnin=50): ''' Generate AR(1) series samples - MARK FOR DEPRECATION once ar1fit_ml is adopted + MARK FOR DEPRECATION once uar1_fit is adopted Wrapper for the function `statsmodels.tsa.arima_process.arma_generate_sample `_. used to generate an ARMA @@ -194,7 +194,7 @@ def gen_ar1_evenly(t, g, scale=1, burnin=50): lag-1 autocorrelation scale : float - The standard deviation of noise. + The standard deviation of the noise. burnin : int Number of observation at the beginning of the sample to drop. Used to reduce dependence on initial values. @@ -626,7 +626,7 @@ def n_ll_unevenly_spaced_ar1(theta, y, t): Parameters ---------- theta: array, length 2 - the first value is tau_0, the second value sigma^2. + the first value is tau, the second value sigma^2. y: array,length n The vector of observations. @@ -638,12 +638,12 @@ def n_ll_unevenly_spaced_ar1(theta, y, t): """ # define n n = len(y) - log_tau_0 = theta[0] - log_sigma_2_0 = theta[1] - tau_0 = np.exp(log_tau_0) - sigma_2 = np.exp(log_sigma_2_0) + log_tau = theta[0] + log_sigma_2 = theta[1] + tau = np.exp(log_tau) + sigma_2 = np.exp(log_sigma_2) delta = np.diff(t) - phi = np.exp((-delta / tau_0)) + phi = np.exp((-delta / tau)) term_1 = y[1:n] - (phi * y[0:(n-1)]) term_2 = pow(term_1, 2) term_3 = term_2 / (1- pow(phi, 2)) @@ -654,7 +654,7 @@ def n_ll_unevenly_spaced_ar1(theta, y, t): def uar1_fit(y, t): ''' - Maximum Likelihood Estimation of parameters tau_0 and sigma_2_0 + Maximum Likelihood Estimation of parameters tau and sigma_2 Parameters ---------- @@ -664,13 +664,13 @@ def uar1_fit(y, t): Time index values of the time series Returns: - An array containing the estimated parameters tau_0_hat and sigma_2_hat, first entry is tau_0_hat, second entry is sigma_2_hat + An array containing the estimated parameters tau_hat and sigma_2_hat, first entry is tau_hat, second entry is sigma_2_hat ------- None. ''' - # obtain initial value for tau_0 + # obtain initial value for tau tau_initial_value = tau_estimation(y= y, t = t) # obtain initial value for sifma_2_0 sigma_2_initial_value = np.var(y) @@ -681,7 +681,7 @@ def uar1_fit(y, t): return theta_hat -def uar1_sim(t, tau_0=5, sigma_2_0=2): +def uar1_sim(t, tau, sigma_2=1): """ Generate a time series of length n from an autoregressive process of order 1 with evenly/unevenly spaced time points. @@ -691,10 +691,10 @@ def uar1_sim(t, tau_0=5, sigma_2_0=2): t : array Time axis - tau_0 : float + tau : float Time decay parameter of the AR(1) model ($\phi = e^{-\tau}$) - sigma_2_0 : float + sigma_2 : float Variance of the innovations Returns @@ -707,27 +707,32 @@ def uar1_sim(t, tau_0=5, sigma_2_0=2): -------- pyleoclim.utils.tsmodel.uar1_fit : Maximumum likelihood estimate of AR(1) parameters - pyleoclim.utils.tsmodel.random_time_index : Generate time increment vector according to a specific probability model """ - n = len(t) - ys = np.zeros(n) # create empty vector - # generate unevenly spaced AR(1) - z = np.random.normal(loc=0, scale=1, size=n) - ys[0] = z[0] - for i in range(n-1): - delta_i = t[i+1] - t[i] - phi_i = np.exp(-delta_i / tau_0) - sigma_2_i = sigma_2_0 * (1-pow(phi_i, 2)) - sigma_i = np.sqrt(sigma_2_i) - ys[i+1] = phi_i * ys[i] + sigma_i * z[i] - - return ys + if t.ndim == 1: # add extraneous dimension if t is 1d, to write only one loop. + n = len(t); p = 1 + t = t[:,np.newaxis] + else: + n, p = t.shape + + # generate innovations + z = np.random.normal(loc=0, scale=1, size=(n,p)) + y = np.copy(z) # initialize AR(1) vectors + # fill the array + for j in range(p): # Note: this shouldn't work but it does! + for i in range(1, n): + delta_i = t[i,j] - t[i-1,j] + phi = np.exp(-delta_i / tau) + sigma_i = np.sqrt(sigma_2 * (1-phi**2)) + y[i,j] = phi * y[i-1,j] + sigma_i * z[i,j] + + y = np.squeeze(y) # squeeze superfluous dimensions + return y def inverse_cumsum(arr): return np.diff(np.concatenate(([0], arr))) -def random_time_index(n, delta_t_dist = "exponential", param = [1]): +def random_time_index(n, delta_t_dist = "exponential", param = [1.0]): ''' Generate a random time index vector according to a specific probability model