From 62e7a09afb6c82e83c91c841af019dcda202aea6 Mon Sep 17 00:00:00 2001 From: Pascal Bourgault Date: Mon, 11 Dec 2023 16:19:33 -0500 Subject: [PATCH 1/9] Stack periods --- CHANGES.rst | 8 + docs/notebooks/sdba-advanced.ipynb | 102 +--------- tests/test_calendar.py | 35 ++++ tests/test_sdba/test_processing.py | 44 ----- xclim/core/calendar.py | 296 +++++++++++++++++++++++++++++ xclim/sdba/processing.py | 148 +++------------ xclim/testing/helpers.py | 2 +- 7 files changed, 378 insertions(+), 257 deletions(-) diff --git a/CHANGES.rst b/CHANGES.rst index f69c45d54..f0a098aa6 100644 --- a/CHANGES.rst +++ b/CHANGES.rst @@ -2,6 +2,14 @@ Changelog ========= +v0.48.0 (unreleased) +-------------------- +Contributors to this version: Pascal Bourgault (:user:`aulemahal`). + +New features and enhancements +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +* New ``xclim.core.calendar.stack_periods`` and ``unstack_periods`` for performing ``rolling(time=...).construct(..., stride=...)`` but with non-uniform temporal periods like years or months. They replace ``xclim.sdba.processing.construct_moving_yearly_window`` and ``unpack_moving_yearly_window`` which are deprecated and will be removed in a future release. + v0.47.0 (2023-12-01) -------------------- Contributors to this version: Juliette Lavoie (:user:`juliettelavoie`), Pascal Bourgault (:user:`aulemahal`), Trevor James Smith (:user:`Zeitsperre`), David Huard (:user:`huard`), Éric Dupuis (:user:`coxipi`). diff --git a/docs/notebooks/sdba-advanced.ipynb b/docs/notebooks/sdba-advanced.ipynb index 0354a0eb7..df0899c0e 100644 --- a/docs/notebooks/sdba-advanced.ipynb +++ b/docs/notebooks/sdba-advanced.ipynb @@ -346,21 +346,18 @@ "\n", "Some Adjustment methods require that the adjusted data (`sim`) be of the same length (same number of points) than the training data (`ref` and `hist`). These requirements often ensure conservation of statistical properties and a better representation of the climate change signal over the long adjusted timeseries.\n", "\n", - "In opposition to a conventional \"rolling window\", here it is the _years_ that are the base units of the window, not the elements themselves. `xclim` implements `sdba.construct_moving_yearly_window` and `sdba.unpack_moving_yearly_window` to manipulate data in that goal. The \"construct\" function cuts the data in overlapping windows of a certain length (in years) and stacks them along a new `\"movingdim\"` dimension, alike to xarray's `da.rolling(time=win).construct('movingdim')`, but with yearly steps. The step between each window can also be controlled. This argument is an indicator of how many years overlap between each window. With a value of 1 (the default), a window will have `window - 1` years overlapping with the previous one. `step = window` will result in no overlap at all.\n", + "In opposition to a conventional \"rolling window\", here it is the _years_ that are the base units of the window, not the elements themselves. `xclim` implements `xc.core.calendar.stack_periods` and `xc.core.calendar.unstack_periods` to manipulate data in that goal. The \"stack\" function cuts the data in overlapping windows of a certain length and stacks them along a new `\"period\"` dimension, alike to xarray's `da.rolling(time=win).construct('period')`, but with yearly steps. The stride (or step) between each window can also be controlled. This argument is an indicator of how many years overlap between each window. With a value of 1, a window will have `window - 1` years overlapping with the previous one. The default (`None`) is to have `stride = window` will result in no overlap at all. The default units in which `window` and `stride` are given is a year (\"YS\"), but can be changed with argument `freq`.\n", "\n", - "By default, the result is chunked along this `'movingdim'` dimension. For this reason, the method is expected to be more computationally efficient (when using `dask`) than looping over the windows.\n", + "By chunking the result along this `'period'` dimension, it is expected to be more computationally efficient (when using `dask`) than looping over the windows with a for-loop (or a `GroupyBy`)\n", "\n", "Note that this results in two restrictions:\n", "\n", "1. The constructed array has the same \"time\" axis for all windows. This is a problem if the actual _year_ is of importance for the adjustment, but this is not the case for any of xclim's current adjustment methods.\n", "2. The input timeseries must be in a calendar with uniform year lengths. For daily data, this means only the \"360_day\", \"noleap\" and \"all_leap\" calendars are supported.\n", "\n", - "The \"unpack\" function does the opposite : it concatenates the windows together to recreate the original timeseries.\n", - "The time points that are not part of a window will not appear in the reconstructed timeseries.\n", - "If `append_ends` is True, the reconstructed timeseries will go from the first time point of the first window to the last time point of the last window. In the middle, the central `step` years are kept from each window.\n", - "If `append_ends` is False, only the central `step` years are kept from each window. Which means the final timeseries has `(window - step) / 2` years missing on either side, with the extra year missing on the right in case of an odd `(window - step)`. We are missing data, but the contribution from each window is equal.\n", + "The \"unstack\" function does the opposite : it concatenates the windows together to recreate the original timeseries. It only works for the no-overlap case where `stride = window` and for the non-ambiguous one where `stride` divides `window` into an odd number (N) of parts. In that latter situation, the middle parts of each period are kept when reconstructing the timeseries, in addition to the first (last) parts of the first (last) period needed to get a full timeseries.\n", "\n", - "Here, as `ref` and `hist` cover 15 years, we will use a window of 15 on sim. With a step of two (2), this means the first window goes from 2000 to 2014 (inclusive). The last window goes from 2016 to 2030. `window - step = 13`, so six (6) years will be missing at the beginning of the final `scen` and seven (7) years at the end." + "Here, as `ref` and `hist` cover 15 years, we will use a window of 15 on sim. With a stride of 5 years, this means the first window goes from 2000 to 2014 (inclusive). Then 2005-2019, 2010-2024 and 2015-2029. The last year will be dropped." ] }, { @@ -382,18 +379,9 @@ "metadata": {}, "outputs": [], "source": [ - "sim" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from xclim.sdba import construct_moving_yearly_window, unpack_moving_yearly_window\n", + "from xclim.core.calendar import stack_periods, unstack_periods\n", "\n", - "sim_win = construct_moving_yearly_window(sim, window=15, step=2)\n", + "sim_win = stack_periods(sim, window=15, stride=5)\n", "sim_win" ] }, @@ -401,24 +389,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Here, we retrieve the full timeseries." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "scen_win = unpack_moving_yearly_window(QDM.adjust(sim_win), append_ends=True)\n", - "scen_win" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Whereas here, we have gaps at the edges." + "Here, we retrieve the full timeseries (minus the lasy year that couldn't fit in any window)." ] }, { @@ -427,63 +398,10 @@ "metadata": {}, "outputs": [], "source": [ - "scen_win = unpack_moving_yearly_window(QDM.adjust(sim_win), append_ends=False)\n", + "scen_win = unstack_periods(QDM.adjust(sim_win))\n", "scen_win" ] }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Here is another short example, with an uneven number of years. Here `sim` goes from 2000 to 2029 (30 years instead of 31). With a step of 2 and a window of 15, the first window goes again from 2000 to 2014, but the last one is now from 2014 to 2028. The next window would be 2016-2030, but that last year doesn't exist." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "sim_win = construct_moving_yearly_window(\n", - " sim.sel(time=slice(\"2000\", \"2029\")), window=15, step=2\n", - ")\n", - "sim_win" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Here, we don't recover the full timeseries, even when we append the ends, because 2029 is not part of a window." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "sim2 = unpack_moving_yearly_window(sim_win, append_ends=True)\n", - "sim2" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Without appending the ends, the final timeseries is from 2006 to 2021, 6 years missing at the beginning, like last time and **8** years missing at the end." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "sim2 = unpack_moving_yearly_window(sim_win, append_ends=False)\n", - "sim2" - ] - }, { "cell_type": "markdown", "metadata": {}, @@ -854,9 +772,7 @@ "hist.isel(location=1).plot(label=\"hist\", linewidth=lw)\n", "ref.isel(location=1).plot(label=\"ref\", linewidth=lw)\n", "ref_future.isel(location=1).plot(label=\"ref_future\", linewidth=lw)\n", - "leg = plt.legend()\n", - "for legobj in leg.legendHandles:\n", - " legobj.set_linewidth(2.0)" + "leg = plt.legend()" ] }, { diff --git a/tests/test_calendar.py b/tests/test_calendar.py index 625e603e9..f071127ca 100644 --- a/tests/test_calendar.py +++ b/tests/test_calendar.py @@ -29,7 +29,9 @@ max_doy, parse_offset, percentile_doy, + stack_periods, time_bnds, + unstack_periods, ) @@ -703,3 +705,36 @@ def test_convert_doy(): np.testing.assert_allclose( out.isel(lat=0), [31.0, 200.48, 190.0, 59.83607, 299.71885] ) + + +@pytest.mark.parametrize("cftime", [True, False]) +@pytest.mark.parametrize( + "w,s,m,f,ss", + [(30, 10, None, "YS", 0), (3, 1, None, "QS-DEC", 60), (6, None, None, "MS", 0)], +) +def test_stack_periods(tas_series, cftime, w, s, m, f, ss): + da = tas_series(np.arange(365 * 50), cftime=cftime, start="2000-01-01") + + da_stck = stack_periods(da, window=w, stride=s, min_length=m, freq=f) + + assert "period_length" in da_stck.coords + assert bool(da_stck.period.attrs["unequal_periods"]) is (not f.startswith("Y")) + + da2 = unstack_periods(da_stck) + + xr.testing.assert_identical(da2, da.isel(time=slice(ss, da2.time.size + ss))) + + +def test_stack_periods_special(tas_series): + da = tas_series( + np.arange(365 * 48 + 12), cftime=True, start="2004-01-01" + ).convert_calendar("noleap") + + da_stck = stack_periods(da, dim="horizon") + np.testing.assert_array_equal(da_stck.horizon_length, 10950) + + with pytest.raises(ValueError, match="can't find the window"): + unstack_periods(da_stck) + + da2 = unstack_periods(da_stck.drop_vars("horizon_length"), dim="horizon") + xr.testing.assert_identical(da2, da.isel(time=slice(0, da2.time.size))) diff --git a/tests/test_sdba/test_processing.py b/tests/test_sdba/test_processing.py index a59e18313..294ef980c 100644 --- a/tests/test_sdba/test_processing.py +++ b/tests/test_sdba/test_processing.py @@ -10,7 +10,6 @@ from xclim.sdba.base import Grouper from xclim.sdba.processing import ( adapt_freq, - construct_moving_yearly_window, escore, from_additive_space, jitter, @@ -21,7 +20,6 @@ stack_variables, standardize, to_additive_space, - unpack_moving_yearly_window, unstack_variables, unstandardize, ) @@ -282,45 +280,3 @@ def test_stack_variables(open_dataset): ds1p = unstack_variables(da1) xr.testing.assert_equal(ds1, ds1p) - - -@pytest.mark.parametrize( - "window,step,lengths", - [ - (1, 1, 151), - (5, 5, 30), - (10, 10, 15), - (25, 25, 6), - (50, 50, 3), - (None, None, 131), - ], -) -def test_construct_moving_yearly_window(open_dataset, window, step, lengths): - ds = open_dataset("sdba/CanESM2_1950-2100.nc") - - calls = {k: v for k, v in dict(window=window, step=step).items() if v is not None} - da_windowed = construct_moving_yearly_window(ds.tasmax, **calls) - - assert len(da_windowed) == lengths - - -def test_construct_moving_yearly_window_standard_calendar(tasmin_series): - tasmin = tasmin_series(np.zeros(365 * 30), start="1997-01-01", units="degC") - - with pytest.raises(ValueError): - construct_moving_yearly_window(tasmin) - - -@pytest.mark.parametrize("append_ends", [True, False]) -def test_unpack_moving_yearly_window(open_dataset, append_ends): - tasmax = open_dataset("sdba/ahccd_1950-2013.nc").tasmax - - tasmax_windowed = construct_moving_yearly_window(tasmax) - - tx_deconstructed = unpack_moving_yearly_window( - tasmax_windowed, append_ends=append_ends - ) - if append_ends: - np.testing.assert_array_equal(tasmax, tx_deconstructed) - else: - assert len(tx_deconstructed.time) < len(tasmax.time) diff --git a/xclim/core/calendar.py b/xclim/core/calendar.py index dfbb3cb61..9f8c34998 100644 --- a/xclim/core/calendar.py +++ b/xclim/core/calendar.py @@ -15,6 +15,7 @@ import xarray as xr from xarray.coding.cftime_offsets import to_cftime_datetime from xarray.coding.cftimeindex import CFTimeIndex +from xarray.core import dtypes from xarray.core.resample import DataArrayResample, DatasetResample from xclim.core.utils import DayOfYearStr, uses_dask @@ -1588,3 +1589,298 @@ def get_doys(start, end, inclusive): mask["time"] = da.time return da.where(mask, drop=drop) + + +def _month_is_first_period_month(time, freq): + """Returns True if the given time is from the first month of freq.""" + if isinstance(time, cftime.datetime): + frqM = xr.coding.cftime_offsets.to_offset("MS") + frq = xr.coding.cftime_offsets.to_offset(freq) + if frqM.onOffset(time): + return frq.onOffset(time) + return frq.onOffset(frqM.rollback(time)) + # Pandas + time = pd.Timestamp(time) + frqM = pd.tseries.frequencies.to_offset("MS") + frq = pd.tseries.frequencies.to_offset(freq) + if frqM.is_on_offset(time): + return frq.is_on_offset(time) + return frq.is_on_offset(frqM.rollback(time)) + + +def stack_periods( + da: xr.Dataset | xr.DataArray, + window: int = 30, + stride: int | None = None, + min_length: int | None = None, + freq: str = "YS", + dim: str = "period", + start: str = "1970-01-01", + pad_value=dtypes.NA, +): + """Construct a multi-period array + + Stack different equal-length periods of `da` into a new 'period' dimension. + + This is similar to ``da.rolling(time=window).construct(dim, stride=stride)``, but adapted for arguments + in terms of a base temporal frequency that might be non uniform (years, months, etc). + It is reversible for some cases (see `stride`). A rolling-construct method will be much more performant for uniform periods (days, weeks). + + Parameters + ---------- + da : xr.Dataset or xr.DataArray + An xarray object with a `time` dimension. + Must have an uniform timestep length. + Output might be strange if this does not use an uniform calendar (noleap, 360_day, all_leap). + window : int + The length of the moving window as a multiple of ``freq``. + stride : int, optional + At which interval to take the windows, as a multiple of ``freq``. + For the operation to be reversible with :py:func:`unstack_periods`, it must divide `window` into an odd number of parts. + Default is `window` (no overlap between periods). + min_length : int, optional + Windows shorter than this are not included in the output. + Given as a multiple of ``freq``. Defaults is ``window`` (every window must be complete). + Similar to the ``min_periods`` argument of ``da.rolling``. + If ``freq`` is annual or quarterly and ``min_length == ``window``, the first period is considered complete + if the first timestep is in the first month of the period. + freq : str + Units of ``window``, ``stride`` and ``min_length``, as a frequency string. + Must be larger or equal to the data's sampling frequency. + Note that this function offers an easier interface for non uniform period (like years or months) + but is much slower than a rolling-construct method. + dim : str + The new dimension name. + start : str + The `start` argument passed to :py:func:`xarray.date_range` to generate the new placeholder + time coordinate. + pad_value: Any + When some periods are shorter than others, this value is used to pad them at the end. + Passed directly as argument ``fill_value`` to :py:func:`xarray.concat`, the default is the same as on that function. + + Return + ------ + xr.DataArray + A DataArray with a new `period` dimension and a `time` dimension with the length of the longest window. + The new time coordinate has the same frequency as the input data but is generated using + :py:func:`xarray.date_range` with the given `start` value. + That coordinate is the same for all periods, depending on the choice of ``window`` and ``freq``, it might make sense. + But for unequal periods or non-uniform calendars, it will certainly not. + If ``stride`` is a divisor of ``window``, the correct timeseries can be reconstructed with :py:func:`unstack_periods`. + The coordinate of `period` is the first timestep of each windows. + """ + from xclim.core.units import ( # Import in function to avoid cyclical imports + ensure_cf_units, + infer_sampling_units, + ) + + stride = stride or window + min_length = min_length or window + + srcfreq = xr.infer_freq(da.time) + cal = da.time.dt.calendar + use_cftime = da.time.dtype == "O" + + # Convert integer inputs to freq strings + mult, *args = parse_offset(freq) + win_frq = construct_offset(mult * window, *args) + strd_frq = construct_offset(mult * stride, *args) + minl_frq = construct_offset(mult * min_length, *args) + + # The same time coord as da, but with one extra element. + # This way, the last window's last index is not returned as None by xarray's grouper. + time2 = xr.DataArray( + xr.date_range( + da.time[0].item(), + freq=srcfreq, + calendar=cal, + periods=da.time.size + 1, + use_cftime=use_cftime, + ), + dims=("time",), + name="time", + ) + + periods = [] + longest = 0 + for begin, strd_slc in da.resample(time=strd_frq).groups.items(): + win_resamp = time2.isel(time=slice(strd_slc.start, None)).resample(time=win_frq) + # Get slice for first group + win_slc = win_resamp.groupers[0].group_indices[0] + if min_length < window: + min_resamp = time2.isel(time=slice(strd_slc.start, None)).resample( + time=minl_frq + ) + min_slc = min_resamp.groupers[0].group_indices[0] + open_ended = min_slc.stop is None + else: + # The end of the group slice is None if no outside-group value was found after the last element + # As we added an extra step to time2, we avoid the case where a group ends exactly on the last element of ds. + open_ended = win_slc.stop is None + if open_ended: + # Too short, we got to the end + break + if ( + strd_slc.start == 0 + and parse_offset(freq)[1] in "YAQ" + and min_length == window + and not _month_is_first_period_month(da.time[0].item(), freq) + ): + # For annual or quartely frequencies (which can be anchor-based), if the first time is not in the first month of the first period, + # then the first period is incomplete but by a fractional amount. + continue + periods.append( + slice( + strd_slc.start + win_slc.start, + (strd_slc.start + win_slc.stop) + if win_slc.stop is not None + else da.time.size, + ) + ) + + # Make coordinates + lengths = xr.DataArray( + [slc.stop - slc.start for slc in periods], + dims=(dim,), + attrs={"long_name": "Length of each period"}, + ) + longest = lengths.max().item() + # Length as a pint-ready array : with proper units, but values are not usable as indexes anymore + m, u = infer_sampling_units(da) + lengths = lengths * m + lengths.attrs["units"] = ensure_cf_units(u) + # Start points for each periods + remember parameters for unstacking + starts = xr.DataArray( + [da.time[slc.start].item() for slc in periods], + dims=(dim,), + attrs={ + "long_name": "Start of the period", + # Save parameters so that we can unstack. + "window": window, + "stride": stride, + "freq": freq, + "unequal_lengths": int(len(np.unique(lengths)) > 1), + }, + ) + # The "fake" axis that all periods share + fake_time = xr.date_range( + start, periods=longest, freq=srcfreq, calendar=cal, use_cftime=use_cftime + ) + # Slice and concat along new dim. We drop the index and add a new one so that xarray can concat them together. + out = xr.concat( + [ + da.isel(time=slc) + .drop_vars("time") + .assign_coords(time=np.arange(slc.stop - slc.start)) + for slc in periods + ], + dim, + join="outer", + fill_value=pad_value, + ) + out = out.assign_coords( + time=(("time",), fake_time, da.time.attrs.copy()), + **{f"{dim}_length": lengths, dim: starts}, + ) + out.time.attrs.update(long_name="Placeholder time axis") + return out + + +def unstack_periods(da: xr.DataArray | xr.Dataset, dim: str = "period"): + """Unstack an array constructed with :py:func:`stack_periods`. + + Can only work with periods stacked with a ``stride`` that divides ``window`` in a odd number of sections. + When ``stride`` is smaller than ``window``, only the centermost stride of each window is kept, + except for the beginning and end which are taken from the first and last windows. + + Parameters + ---------- + da : xr.DataArray + As constructed by :py:func:`stack_periods`, attributes of the period coordinates must have been perserved. + dim : str + The period dimension name. + """ + from xclim.core.units import infer_sampling_units + + try: + starts = da[dim] + window = starts.attrs["window"] + stride = starts.attrs["stride"] + freq = starts.attrs["freq"] + unequal_lengths = bool(starts.attrs["unequal_lengths"]) + except (AttributeError, KeyError) as err: + raise ValueError( + f"`unstack_periods` can't find the window, stride and freq attributes on the {dim} coordiantes." + ) from err + + if unequal_lengths: + try: + lengths = da[f"{dim}_length"] + except KeyError as err: + raise ValueError( + f"`unstack_periods` can't find the `{dim}_length` coordinate." + ) from err + # Get length as number of points + m, u = infer_sampling_units(da.time) + lengths = lengths // m + else: + lengths = xr.DataArray([da.time.size] * da[dim].size, dims=(dim,)) + + time_as_delta = da.time - da.time[0] + if da.time.dtype == "O": + # cftime can't add with np.timedelta64 (restriction comes from numpy which refuses to add O with m8) + time_as_delta = pd.TimedeltaIndex( + time_as_delta + ).to_pytimedelta() # this array is O, numpy complies + else: + # Xarray will return int when iterating over datetime values, this returns timestamps + starts = pd.DatetimeIndex(starts) + + def _reconstruct_time(start): + times = time_as_delta + start + return xr.DataArray(times, dims=("time",), coords={"time": times}, name="time") + + # Easy case: + if window == stride: + # just concat them all + periods = [] + for i, (start, length) in enumerate(zip(starts.values, lengths.values)): + real_time = _reconstruct_time(start) + periods.append( + da.isel(**{dim: i}, drop=True) + .isel(time=slice(0, length)) + .assign_coords(time=real_time.isel(time=slice(0, length))) + ) + return xr.concat(periods, "time") + + # Difficult and ambiguous case + if (window / stride) % 2 != 1: + raise NotImplementedError( + "`unstack_periods` can't work with strides that do not divide the window into an odd number of parts." + f"Got {window} / {stride} which is not an odd integer." + ) + + # Non-ambiguous overlapping case + Nwin = window // stride + mid = (Nwin - 1) // 2 # index of the center window + + mult, *args = parse_offset(freq) + strd_frq = construct_offset(mult * stride, *args) + + periods = [] + for i, (start, length) in enumerate(zip(starts.values, lengths.values)): + real_time = _reconstruct_time(start) + slices = real_time.resample(time=strd_frq).groupers[0].group_indices + if i == 0: + slc = slice(slices[0].start, min(slices[mid].stop, length)) + elif i == da.period.size - 1: + slc = slice(slices[mid].start, min(slices[Nwin - 1].stop or length, length)) + else: + slc = slice(slices[mid].start, min(slices[mid].stop, length)) + periods.append( + da.isel(**{dim: i}, drop=True) + .isel(time=slc) + .assign_coords(time=real_time.isel(time=slc)) + ) + + return xr.concat(periods, "time") diff --git a/xclim/sdba/processing.py b/xclim/sdba/processing.py index 08ddddb76..f6d833ae8 100644 --- a/xclim/sdba/processing.py +++ b/xclim/sdba/processing.py @@ -12,7 +12,13 @@ import xarray as xr from xarray.core.utils import get_temp_dimname -from xclim.core.calendar import get_calendar, max_doy, parse_offset +from xclim.core.calendar import ( + get_calendar, + max_doy, + parse_offset, + stack_periods, + unstack_periods, +) from xclim.core.formatting import update_xclim_history from xclim.core.units import convert_units_to, infer_context, units from xclim.core.utils import uses_dask @@ -480,133 +486,37 @@ def _get_number_of_elements_by_year(time): def construct_moving_yearly_window( da: xr.Dataset, window: int = 21, step: int = 1, dim: str = "movingwin" ): - """Construct a moving window DataArray. - - Stack windows of `da` in a new 'movingwin' dimension. - Windows are always made of full years, so calendar with non-uniform year lengths are not supported. - - Windows are constructed starting at the beginning of `da`, if number of given years is not - a multiple of `step`, then the last year(s) will be missing as a supplementary window would be incomplete. - - Parameters - ---------- - da : xr.Dataset - A DataArray with a `time` dimension. - window : int - The length of the moving window as a number of years. - step : int - The step between each window as a number of years. - dim : str - The new dimension name. If given, must also be given to `unpack_moving_yearly_window`. - - Return - ------ - xr.DataArray - A DataArray with a new `movingwin` dimension and a `time` dimension with a length of 1 window. - This assumes downstream algorithms do not make use of the _absolute_ year of the data. - The correct timeseries can be reconstructed with :py:func:`unpack_moving_yearly_window`. - The coordinates of `movingwin` are the first date of the windows. + """Deprecated function. + Use :py:func:`xclim.core.calendar.stack_periods` instead, renaming ``step`` to ``stride``. + Beware of the different default value for `dim` ("period"). """ - # Get number of samples per year (and perform checks) - N_in_year = _get_number_of_elements_by_year(da.time) - - # Number of samples in a window - N = window * N_in_year - - first_slice = da.isel(time=slice(0, N)) - first_slice = first_slice.expand_dims({dim: np.atleast_1d(first_slice.time[0])}) - daw = [first_slice] - - i_start = N_in_year * step - # This is the first time I use `while` in real python code. What an event. - while i_start + N <= da.time.size: - # Cut and add _full_ slices only, partial window are thrown out - # Use isel so that we don't need to deal with a starting date. - slc = da.isel(time=slice(i_start, i_start + N)) - slc = slc.expand_dims({dim: np.atleast_1d(slc.time[0])}) - slc["time"] = first_slice.time - daw.append(slc) - i_start += N_in_year * step - - daw = xr.concat(daw, dim) - return daw + warnings.warn( + FutureWarning, + ( + "`construct_moving_yearly_window` is deprecated and will be removed in a future version. " + f"Please use xclim.core.calendar.stack_periods(da, window={window}, stride={step}, dim='{dim}', freq='YS') instead." + ), + ) + return stack_periods(da, window=window, stride=step, dim=dim, freq="YS") def unpack_moving_yearly_window( da: xr.DataArray, dim: str = "movingwin", append_ends: bool = True ): - """Unpack a constructed moving window dataset to a normal timeseries, only keeping the central data. - - Unpack DataArrays created with :py:func:`construct_moving_yearly_window` and recreate a timeseries data. - If `append_ends` is False, only keeps the central non-overlapping years. The final timeseries will be - (window - step) years shorter than the initial one. If `append_ends` is True, the time points from first and last - windows will be included in the final timeseries. - - The time points that are not in a window will never be included in the final timeseries. - The window length and window step are inferred from the coordinates. - - Parameters - ---------- - da : xr.DataArray - As constructed by :py:func:`construct_moving_yearly_window`. - dim : str - The window dimension name as given to the construction function. - append_ends : bool - Whether to append the ends of the timeseries - If False, the final timeseries will be (window - step) years shorter than the initial one, - but all windows will contribute equally. - If True, the year before the middle years of the first window and the years after the middle years of the last - window are appended to the middle years. The final timeseries will be the same length as the initial timeseries - if the windows span the whole timeseries. - The time steps that are not in a window will be left out of the final timeseries. + """Deprecated function. + Use :py:func:`xclim.core.calendar.unstack_periods` instead. + Beware of the different default value for `dim` ("period"). The new function always behaves like ``appends_ends=True``. """ - # Get number of samples by year (and perform checks) - N_in_year = _get_number_of_elements_by_year(da.time) - - # Might be smaller than the original moving window, doesn't matter - window = da.time.size / N_in_year - - if window % 1 != 0: - warnings.warn( - f"Incomplete data received as number of years covered is not an integer ({window})" - ) - - # Get step in number of years - days_in_year = max_doy[get_calendar(da)] - step = np.unique(da[dim].diff(dim).dt.days / days_in_year) - if len(step) > 1: - raise ValueError("The spacing between the windows is not equal.") - step = int(step[0]) - - # Which years to keep: length step, in the middle of window - left = int((window - step) // 2) # first year to keep - - # Keep only the middle years - da_mid = da.isel(time=slice(left * N_in_year, (left + step) * N_in_year)) - - out = [] - for win_start in da_mid[dim]: - slc = da_mid.sel({dim: win_start}).drop_vars(dim) - dt = win_start.values - da_mid[dim][0].values - slc["time"] = slc.time + dt - out.append(slc) - - if append_ends: - # add front end at the front - out.insert( - 0, da.isel({dim: 0, "time": slice(None, left * N_in_year)}).drop_vars(dim) - ) - # add back end at the back - back_end = da.isel( - {dim: -1, "time": slice((left + step) * N_in_year, None)} - ).drop_vars(dim) - dt = da.isel({dim: -1})[dim].values - da.isel({dim: 0})[dim].values - back_end["time"] = back_end.time + dt - out.append(back_end) - - return xr.concat(out, "time") + warnings.warn( + FutureWarning, + ( + "`unpack_moving_yearly_window` is deprecated and will be removed in a future version. " + f"Please use xclim.core.calendar.unstack_periods(da, dim='{dim}') instead." + ), + ) + return unstack_periods(da, dim=dim) @update_xclim_history diff --git a/xclim/testing/helpers.py b/xclim/testing/helpers.py index 85d1f953a..3b135b6b5 100644 --- a/xclim/testing/helpers.py +++ b/xclim/testing/helpers.py @@ -221,7 +221,7 @@ def add_example_file_paths(cache_dir: Path) -> dict[str]: def test_timeseries( values, variable, - start="7/1/2000", + start="2000-01-07", units=None, freq="D", as_dataset=False, From c508e2731218fc9d0d54071da904a32d28549893 Mon Sep 17 00:00:00 2001 From: Pascal Bourgault Date: Mon, 11 Dec 2023 16:22:29 -0500 Subject: [PATCH 2/9] add comments to stack_periods --- xclim/core/calendar.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/xclim/core/calendar.py b/xclim/core/calendar.py index 9f8c34998..838813e64 100644 --- a/xclim/core/calendar.py +++ b/xclim/core/calendar.py @@ -1703,11 +1703,13 @@ def stack_periods( periods = [] longest = 0 + # Iterate over strides, but recompute the full window for each stride start for begin, strd_slc in da.resample(time=strd_frq).groups.items(): win_resamp = time2.isel(time=slice(strd_slc.start, None)).resample(time=win_frq) # Get slice for first group win_slc = win_resamp.groupers[0].group_indices[0] if min_length < window: + # If we ask for a min_length period instead is it complete ? min_resamp = time2.isel(time=slice(strd_slc.start, None)).resample( time=minl_frq ) @@ -1824,8 +1826,10 @@ def unstack_periods(da: xr.DataArray | xr.Dataset, dim: str = "period"): m, u = infer_sampling_units(da.time) lengths = lengths // m else: + # It is acceptable to lose "{dim}_length" if they were all equal lengths = xr.DataArray([da.time.size] * da[dim].size, dims=(dim,)) + # Convert from the fake axis to the real one time_as_delta = da.time - da.time[0] if da.time.dtype == "O": # cftime can't add with np.timedelta64 (restriction comes from numpy which refuses to add O with m8) From 7a4e87e9c8f65e25b21ec1fe948db56460a56445 Mon Sep 17 00:00:00 2001 From: Pascal Bourgault Date: Mon, 11 Dec 2023 17:22:01 -0500 Subject: [PATCH 3/9] Fix tests --- tests/test_calendar.py | 2 +- xclim/testing/helpers.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/test_calendar.py b/tests/test_calendar.py index f071127ca..4a547f79b 100644 --- a/tests/test_calendar.py +++ b/tests/test_calendar.py @@ -718,7 +718,7 @@ def test_stack_periods(tas_series, cftime, w, s, m, f, ss): da_stck = stack_periods(da, window=w, stride=s, min_length=m, freq=f) assert "period_length" in da_stck.coords - assert bool(da_stck.period.attrs["unequal_periods"]) is (not f.startswith("Y")) + assert bool(da_stck.period.attrs["unequal_lengths"]) is (not f.startswith("Y")) da2 = unstack_periods(da_stck) diff --git a/xclim/testing/helpers.py b/xclim/testing/helpers.py index 3b135b6b5..08fae0661 100644 --- a/xclim/testing/helpers.py +++ b/xclim/testing/helpers.py @@ -221,7 +221,7 @@ def add_example_file_paths(cache_dir: Path) -> dict[str]: def test_timeseries( values, variable, - start="2000-01-07", + start="2000-07-01", units=None, freq="D", as_dataset=False, From 6c20ad9f49291377ead4978b75a77382a15f120e Mon Sep 17 00:00:00 2001 From: Pascal Bourgault Date: Mon, 11 Dec 2023 17:33:34 -0500 Subject: [PATCH 4/9] Rephrase and add warning about nonuniform years in notebook --- docs/notebooks/sdba-advanced.ipynb | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/docs/notebooks/sdba-advanced.ipynb b/docs/notebooks/sdba-advanced.ipynb index df0899c0e..3e3d95098 100644 --- a/docs/notebooks/sdba-advanced.ipynb +++ b/docs/notebooks/sdba-advanced.ipynb @@ -357,7 +357,13 @@ "\n", "The \"unstack\" function does the opposite : it concatenates the windows together to recreate the original timeseries. It only works for the no-overlap case where `stride = window` and for the non-ambiguous one where `stride` divides `window` into an odd number (N) of parts. In that latter situation, the middle parts of each period are kept when reconstructing the timeseries, in addition to the first (last) parts of the first (last) period needed to get a full timeseries.\n", "\n", - "Here, as `ref` and `hist` cover 15 years, we will use a window of 15 on sim. With a stride of 5 years, this means the first window goes from 2000 to 2014 (inclusive). Then 2005-2019, 2010-2024 and 2015-2029. The last year will be dropped." + "Quantile Delta Mapping requires that the adjustment period should be of a length similar to the training one. As our `ref` and `hist` cover 15 years, we will transform `sim` by stacking windows of 15 years. With a stride of 5 years, this means the first window goes from 2000 to 2014 (inclusive). Then 2005-2019, 2010-2024 and 2015-2029. The last year will be dropped.\n", + "\n", + "
\n", + "\n", + "In the following example, `QDM` is configurated with `group=\"time.dayofyear\"` which will perform the adjustment for each day of year (doy) separately. When using `stack_periods` the extracted windows are all concatenated along the new `period` axis and they all share the same time coordinate. As such, for the doy information to make sense, we must use a calendar with uniform year lengths. Otherwise, the doys would shift one day at each leap year.\n", + "\n", + "
" ] }, { From 40e1d9f729f49d470934dc434c0d5adab2da38b8 Mon Sep 17 00:00:00 2001 From: Pascal Bourgault Date: Mon, 11 Dec 2023 18:14:05 -0500 Subject: [PATCH 5/9] Remove wrong test line - more words in explanation --- docs/notebooks/sdba-advanced.ipynb | 2 +- tests/test_calendar.py | 1 - 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/docs/notebooks/sdba-advanced.ipynb b/docs/notebooks/sdba-advanced.ipynb index 3e3d95098..3ee096608 100644 --- a/docs/notebooks/sdba-advanced.ipynb +++ b/docs/notebooks/sdba-advanced.ipynb @@ -357,7 +357,7 @@ "\n", "The \"unstack\" function does the opposite : it concatenates the windows together to recreate the original timeseries. It only works for the no-overlap case where `stride = window` and for the non-ambiguous one where `stride` divides `window` into an odd number (N) of parts. In that latter situation, the middle parts of each period are kept when reconstructing the timeseries, in addition to the first (last) parts of the first (last) period needed to get a full timeseries.\n", "\n", - "Quantile Delta Mapping requires that the adjustment period should be of a length similar to the training one. As our `ref` and `hist` cover 15 years, we will transform `sim` by stacking windows of 15 years. With a stride of 5 years, this means the first window goes from 2000 to 2014 (inclusive). Then 2005-2019, 2010-2024 and 2015-2029. The last year will be dropped.\n", + "Quantile Delta Mapping requires that the adjustment period should be of a length similar to the training one. As our `ref` and `hist` cover 15 years but `sim` covers 31 years, we will transform `sim` by stacking windows of 15 years. With a stride of 5 years, this means the first window goes from 2000 to 2014 (inclusive). Then 2005-2019, 2010-2024 and 2015-2029. The last year will be dropped as it can't be included in any complete window.\n", "\n", "
\n", "\n", diff --git a/tests/test_calendar.py b/tests/test_calendar.py index 4a547f79b..46f116828 100644 --- a/tests/test_calendar.py +++ b/tests/test_calendar.py @@ -718,7 +718,6 @@ def test_stack_periods(tas_series, cftime, w, s, m, f, ss): da_stck = stack_periods(da, window=w, stride=s, min_length=m, freq=f) assert "period_length" in da_stck.coords - assert bool(da_stck.period.attrs["unequal_lengths"]) is (not f.startswith("Y")) da2 = unstack_periods(da_stck) From 4eeaadae7ecc6b7a05840a574ce98ae2d94293f8 Mon Sep 17 00:00:00 2001 From: Pascal Bourgault Date: Tue, 12 Dec 2023 17:13:35 -0500 Subject: [PATCH 6/9] Apply suggestions from code review MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Éric Dupuis <71575674+coxipi@users.noreply.github.com> --- xclim/core/calendar.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/xclim/core/calendar.py b/xclim/core/calendar.py index 838813e64..86eead275 100644 --- a/xclim/core/calendar.py +++ b/xclim/core/calendar.py @@ -1640,7 +1640,7 @@ def stack_periods( Default is `window` (no overlap between periods). min_length : int, optional Windows shorter than this are not included in the output. - Given as a multiple of ``freq``. Defaults is ``window`` (every window must be complete). + Given as a multiple of ``freq``. Default is ``window`` (every window must be complete). Similar to the ``min_periods`` argument of ``da.rolling``. If ``freq`` is annual or quarterly and ``min_length == ``window``, the first period is considered complete if the first timestep is in the first month of the period. @@ -1812,7 +1812,7 @@ def unstack_periods(da: xr.DataArray | xr.Dataset, dim: str = "period"): unequal_lengths = bool(starts.attrs["unequal_lengths"]) except (AttributeError, KeyError) as err: raise ValueError( - f"`unstack_periods` can't find the window, stride and freq attributes on the {dim} coordiantes." + f"`unstack_periods` can't find the window, stride and freq attributes on the {dim} coordinates." ) from err if unequal_lengths: From 472f26171e3799bf32984142fd0e2129bdac3e5c Mon Sep 17 00:00:00 2001 From: Pascal Bourgault Date: Thu, 14 Dec 2023 15:45:59 -0500 Subject: [PATCH 7/9] Fail if days wont align --- tests/test_calendar.py | 15 +++++++++----- xclim/core/calendar.py | 47 ++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 57 insertions(+), 5 deletions(-) diff --git a/tests/test_calendar.py b/tests/test_calendar.py index 46f116828..d73b31fe3 100644 --- a/tests/test_calendar.py +++ b/tests/test_calendar.py @@ -713,9 +713,11 @@ def test_convert_doy(): [(30, 10, None, "YS", 0), (3, 1, None, "QS-DEC", 60), (6, None, None, "MS", 0)], ) def test_stack_periods(tas_series, cftime, w, s, m, f, ss): - da = tas_series(np.arange(365 * 50), cftime=cftime, start="2000-01-01") + da = tas_series(np.arange(365 * 50), start="2000-01-01", cftime=cftime) - da_stck = stack_periods(da, window=w, stride=s, min_length=m, freq=f) + da_stck = stack_periods( + da, window=w, stride=s, min_length=m, freq=f, align_days=False + ) assert "period_length" in da_stck.coords @@ -725,9 +727,12 @@ def test_stack_periods(tas_series, cftime, w, s, m, f, ss): def test_stack_periods_special(tas_series): - da = tas_series( - np.arange(365 * 48 + 12), cftime=True, start="2004-01-01" - ).convert_calendar("noleap") + da = tas_series(np.arange(365 * 48 + 12), cftime=True, start="2004-01-01") + + with pytest.raises(ValueError, match="unaligned day-of-year"): + stack_periods(da) + + da = da.convert_calendar("noleap") da_stck = stack_periods(da, dim="horizon") np.testing.assert_array_equal(da_stck.horizon_length, 10950) diff --git a/xclim/core/calendar.py b/xclim/core/calendar.py index 838813e64..87c76e52c 100644 --- a/xclim/core/calendar.py +++ b/xclim/core/calendar.py @@ -46,8 +46,10 @@ "percentile_doy", "resample_doy", "select_time", + "stack_periods", "time_bnds", "uniform_calendars", + "unstack_periods", "within_bnds_doy", "yearly_interpolated_doy", "yearly_random_doy", @@ -1616,6 +1618,7 @@ def stack_periods( freq: str = "YS", dim: str = "period", start: str = "1970-01-01", + align_days: bool = True, pad_value=dtypes.NA, ): """Construct a multi-period array @@ -1654,6 +1657,11 @@ def stack_periods( start : str The `start` argument passed to :py:func:`xarray.date_range` to generate the new placeholder time coordinate. + align_days : bool + When True (default), an error is raised if the output would have unaligned days across periods. + If `freq = 'YS'`, day-of-year alignment is checked and if `freq` is "MS" or "QS", we check day-in-month. + Only uniform-calendar will pass the test for `freq='YS'`. For other frequencies, only the `360_day` calendar will work. + This check is ignored if the sampling rate of the data is coarser than "D". pad_value: Any When some periods are shorter than others, this value is used to pad them at the end. Passed directly as argument ``fill_value`` to :py:func:`xarray.concat`, the default is the same as on that function. @@ -1676,11 +1684,33 @@ def stack_periods( stride = stride or window min_length = min_length or window + if stride > window: + raise ValueError( + f"Stride must be less than or equal to window. Got {stride} > {window}." + ) srcfreq = xr.infer_freq(da.time) cal = da.time.dt.calendar use_cftime = da.time.dtype == "O" + if ( + compare_offsets(srcfreq, "<=", "D") + and align_days + and ( + (freq.startswith(("Y", "A")) and cal not in uniform_calendars) + or (freq.startswith(("Q", "M")) and window > 1 and cal != "360_day") + ) + ): + if freq.startswith(("Y", "A")): + u = "year" + else: + u = "month" + raise ValueError( + f"Stacking {window}{freq} periods will result in unaligned day-of-{u}. " + f"Consider converting the calendar of your data to one with uniform {u} lengths, " + "or pass `align_days=False` to disable this check." + ) + # Convert integer inputs to freq strings mult, *args = parse_offset(freq) win_frq = construct_offset(mult * window, *args) @@ -1801,6 +1831,23 @@ def unstack_periods(da: xr.DataArray | xr.Dataset, dim: str = "period"): As constructed by :py:func:`stack_periods`, attributes of the period coordinates must have been perserved. dim : str The period dimension name. + + Notes + ----- + The following table shows which strides are included (``o``) in the unstacked output. + in this example, ``stride`` was a fifth of ``window`` and ``min_length`` was 4 times ``stride``. + The row index ``i`` the period index in the stacked datast, columns are the stride-long section of the original timeseries. + + .. table:: Unstacking example with ``stride < window``. + + === === === === === === === === + i 0 1 2 3 4 5 6 + === === === === === === === === + 3 x x o o + 2 x x o x x + 1 x x o x x + 0 o o o x x + === === === === === === === === """ from xclim.core.units import infer_sampling_units From e7d7d03c46a43b9b8bf171eefcb4bd01e1052f5a Mon Sep 17 00:00:00 2001 From: Pascal Bourgault Date: Fri, 15 Dec 2023 15:57:28 -0500 Subject: [PATCH 8/9] Make it work with xr <2023.5 --- environment.yml | 2 +- xclim/core/calendar.py | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/environment.yml b/environment.yml index 3be9a6afe..586642f6d 100644 --- a/environment.yml +++ b/environment.yml @@ -3,7 +3,7 @@ channels: - conda-forge - defaults dependencies: - - python >=3.8 + - python ==3.8 - astroid - boltons >=20.1 - bottleneck >=1.3.1 diff --git a/xclim/core/calendar.py b/xclim/core/calendar.py index 58346466b..b24fcdc84 100644 --- a/xclim/core/calendar.py +++ b/xclim/core/calendar.py @@ -1737,13 +1737,13 @@ def stack_periods( for begin, strd_slc in da.resample(time=strd_frq).groups.items(): win_resamp = time2.isel(time=slice(strd_slc.start, None)).resample(time=win_frq) # Get slice for first group - win_slc = win_resamp.groupers[0].group_indices[0] + win_slc = win_resamp._group_indices[0] if min_length < window: # If we ask for a min_length period instead is it complete ? min_resamp = time2.isel(time=slice(strd_slc.start, None)).resample( time=minl_frq ) - min_slc = min_resamp.groupers[0].group_indices[0] + min_slc = min_resamp._group_indices[0] open_ended = min_slc.stop is None else: # The end of the group slice is None if no outside-group value was found after the last element @@ -1921,7 +1921,7 @@ def _reconstruct_time(start): periods = [] for i, (start, length) in enumerate(zip(starts.values, lengths.values)): real_time = _reconstruct_time(start) - slices = real_time.resample(time=strd_frq).groupers[0].group_indices + slices = real_time.resample(time=strd_frq)._group_indices if i == 0: slc = slice(slices[0].start, min(slices[mid].stop, length)) elif i == da.period.size - 1: From b4c3028f74c9e152d418360ca8641a2bd6fbb6fe Mon Sep 17 00:00:00 2001 From: Pascal Bourgault Date: Fri, 15 Dec 2023 16:54:59 -0500 Subject: [PATCH 9/9] forgot test pin in env! --- environment.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/environment.yml b/environment.yml index 586642f6d..3be9a6afe 100644 --- a/environment.yml +++ b/environment.yml @@ -3,7 +3,7 @@ channels: - conda-forge - defaults dependencies: - - python ==3.8 + - python >=3.8 - astroid - boltons >=20.1 - bottleneck >=1.3.1