Skip to content

Commit

Permalink
interpolate_na: Add max_gap support. (#3302)
Browse files Browse the repository at this point in the history
* interpolate_na: Add maxgap support.

* Add docs.

* Add requires_bottleneck to test.

* Review comments.

* Update xarray/core/dataarray.py

Co-Authored-By: Maximilian Roos <5635139+max-sixty@users.noreply.github.com>

* Update xarray/core/dataset.py

Co-Authored-By: Maximilian Roos <5635139+max-sixty@users.noreply.github.com>

* maxgap → max_gap

* update whats-new

* update computation.rst

* Better support uniformly spaced coordinates. Split legnths, interp test

* Raise error for max_gap and irregularly spaced coordinates + test

* rework.

* Use pandas checks for index duplication and monotonicity.

* Progress + add datetime.

* nicer error message

* A few fstrings.

* finish up timedelta max_gap.

* fix whats-new

* small fixes.

* fix dan's test.

* remove redundant test.

* nicer error message.

* Add xfailed cftime tests

* better error checking and tests.

* typing.

* update docstrings

* scipy intersphinx

* fix tests

* add bottleneck testing decorator.
  • Loading branch information
dcherian authored Nov 15, 2019
1 parent 7b4a286 commit ee9da17
Show file tree
Hide file tree
Showing 7 changed files with 322 additions and 54 deletions.
3 changes: 3 additions & 0 deletions doc/computation.rst
Original file line number Diff line number Diff line change
Expand Up @@ -95,6 +95,9 @@ for filling missing values via 1D interpolation.
Note that xarray slightly diverges from the pandas ``interpolate`` syntax by
providing the ``use_coordinate`` keyword which facilitates a clear specification
of which values to use as the index in the interpolation.
xarray also provides the ``max_gap`` keyword argument to limit the interpolation to
data gaps of length ``max_gap`` or smaller. See :py:meth:`~xarray.DataArray.interpolate_na`
for more.

Aggregation
===========
Expand Down
11 changes: 6 additions & 5 deletions doc/conf.py
Original file line number Diff line number Diff line change
Expand Up @@ -340,9 +340,10 @@
# Example configuration for intersphinx: refer to the Python standard library.
intersphinx_mapping = {
"python": ("https://docs.python.org/3/", None),
"pandas": ("https://pandas.pydata.org/pandas-docs/stable/", None),
"iris": ("http://scitools.org.uk/iris/docs/latest/", None),
"numpy": ("https://docs.scipy.org/doc/numpy/", None),
"numba": ("https://numba.pydata.org/numba-doc/latest/", None),
"matplotlib": ("https://matplotlib.org/", None),
"pandas": ("https://pandas.pydata.org/pandas-docs/stable", None),
"iris": ("https://scitools.org.uk/iris/docs/latest", None),
"numpy": ("https://docs.scipy.org/doc/numpy", None),
"scipy": ("https://docs.scipy.org/doc/scipy/reference", None),
"numba": ("https://numba.pydata.org/numba-doc/latest", None),
"matplotlib": ("https://matplotlib.org", None),
}
4 changes: 4 additions & 0 deletions doc/whats-new.rst
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,10 @@ Breaking changes

New Features
~~~~~~~~~~~~

- Added the ``max_gap`` kwarg to :py:meth:`~xarray.DataArray.interpolate_na` and
:py:meth:`~xarray.Dataset.interpolate_na`. This controls the maximum size of the data
gap that will be filled by interpolation. By `Deepak Cherian <https://github.com/dcherian>`_.
- :py:meth:`Dataset.drop_sel` & :py:meth:`DataArray.drop_sel` have been added for dropping labels.
:py:meth:`Dataset.drop_vars` & :py:meth:`DataArray.drop_vars` have been added for
dropping variables (including coordinates). The existing ``drop`` methods remain as a backward compatible
Expand Down
58 changes: 42 additions & 16 deletions xarray/core/dataarray.py
Original file line number Diff line number Diff line change
Expand Up @@ -2018,44 +2018,69 @@ def fillna(self, value: Any) -> "DataArray":

def interpolate_na(
self,
dim=None,
dim: Hashable = None,
method: str = "linear",
limit: int = None,
use_coordinate: Union[bool, str] = True,
max_gap: Union[int, float, str, pd.Timedelta, np.timedelta64] = None,
**kwargs: Any,
) -> "DataArray":
"""Interpolate values according to different methods.
"""Fill in NaNs by interpolating according to different methods.
Parameters
----------
dim : str
Specifies the dimension along which to interpolate.
method : {'linear', 'nearest', 'zero', 'slinear', 'quadratic', 'cubic',
'polynomial', 'barycentric', 'krog', 'pchip',
'spline', 'akima'}, optional
method : str, optional
String indicating which method to use for interpolation:
- 'linear': linear interpolation (Default). Additional keyword
arguments are passed to ``numpy.interp``
- 'nearest', 'zero', 'slinear', 'quadratic', 'cubic',
'polynomial': are passed to ``scipy.interpolate.interp1d``. If
method=='polynomial', the ``order`` keyword argument must also be
arguments are passed to :py:func:`numpy.interp`
- 'nearest', 'zero', 'slinear', 'quadratic', 'cubic', 'polynomial':
are passed to :py:func:`scipy.interpolate.interp1d`. If
``method='polynomial'``, the ``order`` keyword argument must also be
provided.
- 'barycentric', 'krog', 'pchip', 'spline', and `akima`: use their
respective``scipy.interpolate`` classes.
use_coordinate : boolean or str, default True
- 'barycentric', 'krog', 'pchip', 'spline', 'akima': use their
respective :py:class:`scipy.interpolate` classes.
use_coordinate : bool, str, default True
Specifies which index to use as the x values in the interpolation
formulated as `y = f(x)`. If False, values are treated as if
eqaully-spaced along `dim`. If True, the IndexVariable `dim` is
used. If use_coordinate is a string, it specifies the name of a
eqaully-spaced along ``dim``. If True, the IndexVariable `dim` is
used. If ``use_coordinate`` is a string, it specifies the name of a
coordinate variariable to use as the index.
limit : int, default None
Maximum number of consecutive NaNs to fill. Must be greater than 0
or None for no limit.
or None for no limit. This filling is done regardless of the size of
the gap in the data. To only interpolate over gaps less than a given length,
see ``max_gap``.
max_gap: int, float, str, pandas.Timedelta, numpy.timedelta64, default None.
Maximum size of gap, a continuous sequence of NaNs, that will be filled.
Use None for no limit. When interpolating along a datetime64 dimension
and ``use_coordinate=True``, ``max_gap`` can be one of the following:
- a string that is valid input for pandas.to_timedelta
- a :py:class:`numpy.timedelta64` object
- a :py:class:`pandas.Timedelta` object
Otherwise, ``max_gap`` must be an int or a float. Use of ``max_gap`` with unlabeled
dimensions has not been implemented yet. Gap length is defined as the difference
between coordinate values at the first data point after a gap and the last value
before a gap. For gaps at the beginning (end), gap length is defined as the difference
between coordinate values at the first (last) valid data point and the first (last) NaN.
For example, consider::
<xarray.DataArray (x: 9)>
array([nan, nan, nan, 1., nan, nan, 4., nan, nan])
Coordinates:
* x (x) int64 0 1 2 3 4 5 6 7 8
The gap lengths are 3-0 = 3; 6-3 = 3; and 8-6 = 2 respectively
kwargs : dict, optional
parameters passed verbatim to the underlying interpolation function
Returns
-------
DataArray
interpolated: DataArray
Filled in DataArray.
See also
--------
Expand All @@ -2070,6 +2095,7 @@ def interpolate_na(
method=method,
limit=limit,
use_coordinate=use_coordinate,
max_gap=max_gap,
**kwargs,
)

Expand Down
60 changes: 42 additions & 18 deletions xarray/core/dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -3908,42 +3908,65 @@ def interpolate_na(
method: str = "linear",
limit: int = None,
use_coordinate: Union[bool, Hashable] = True,
max_gap: Union[int, float, str, pd.Timedelta, np.timedelta64] = None,
**kwargs: Any,
) -> "Dataset":
"""Interpolate values according to different methods.
"""Fill in NaNs by interpolating according to different methods.
Parameters
----------
dim : Hashable
dim : str
Specifies the dimension along which to interpolate.
method : {'linear', 'nearest', 'zero', 'slinear', 'quadratic', 'cubic',
'polynomial', 'barycentric', 'krog', 'pchip',
'spline'}, optional
method : str, optional
String indicating which method to use for interpolation:
- 'linear': linear interpolation (Default). Additional keyword
arguments are passed to ``numpy.interp``
- 'nearest', 'zero', 'slinear', 'quadratic', 'cubic',
'polynomial': are passed to ``scipy.interpolate.interp1d``. If
method=='polynomial', the ``order`` keyword argument must also be
arguments are passed to :py:func:`numpy.interp`
- 'nearest', 'zero', 'slinear', 'quadratic', 'cubic', 'polynomial':
are passed to :py:func:`scipy.interpolate.interp1d`. If
``method='polynomial'``, the ``order`` keyword argument must also be
provided.
- 'barycentric', 'krog', 'pchip', 'spline': use their respective
``scipy.interpolate`` classes.
use_coordinate : boolean or str, default True
- 'barycentric', 'krog', 'pchip', 'spline', 'akima': use their
respective :py:class:`scipy.interpolate` classes.
use_coordinate : bool, str, default True
Specifies which index to use as the x values in the interpolation
formulated as `y = f(x)`. If False, values are treated as if
eqaully-spaced along `dim`. If True, the IndexVariable `dim` is
used. If use_coordinate is a string, it specifies the name of a
eqaully-spaced along ``dim``. If True, the IndexVariable `dim` is
used. If ``use_coordinate`` is a string, it specifies the name of a
coordinate variariable to use as the index.
limit : int, default None
Maximum number of consecutive NaNs to fill. Must be greater than 0
or None for no limit.
kwargs : any
parameters passed verbatim to the underlying interplation function
or None for no limit. This filling is done regardless of the size of
the gap in the data. To only interpolate over gaps less than a given length,
see ``max_gap``.
max_gap: int, float, str, pandas.Timedelta, numpy.timedelta64, default None.
Maximum size of gap, a continuous sequence of NaNs, that will be filled.
Use None for no limit. When interpolating along a datetime64 dimension
and ``use_coordinate=True``, ``max_gap`` can be one of the following:
- a string that is valid input for pandas.to_timedelta
- a :py:class:`numpy.timedelta64` object
- a :py:class:`pandas.Timedelta` object
Otherwise, ``max_gap`` must be an int or a float. Use of ``max_gap`` with unlabeled
dimensions has not been implemented yet. Gap length is defined as the difference
between coordinate values at the first data point after a gap and the last value
before a gap. For gaps at the beginning (end), gap length is defined as the difference
between coordinate values at the first (last) valid data point and the first (last) NaN.
For example, consider::
<xarray.DataArray (x: 9)>
array([nan, nan, nan, 1., nan, nan, 4., nan, nan])
Coordinates:
* x (x) int64 0 1 2 3 4 5 6 7 8
The gap lengths are 3-0 = 3; 6-3 = 3; and 8-6 = 2 respectively
kwargs : dict, optional
parameters passed verbatim to the underlying interpolation function
Returns
-------
Dataset
interpolated: Dataset
Filled in Dataset.
See also
--------
Expand All @@ -3959,6 +3982,7 @@ def interpolate_na(
method=method,
limit=limit,
use_coordinate=use_coordinate,
max_gap=max_gap,
**kwargs,
)
return new
Expand Down
110 changes: 98 additions & 12 deletions xarray/core/missing.py
Original file line number Diff line number Diff line change
@@ -1,18 +1,46 @@
import warnings
from functools import partial
from typing import Any, Callable, Dict, Sequence
from numbers import Number
from typing import Any, Callable, Dict, Hashable, Sequence, Union

import numpy as np
import pandas as pd

from . import utils
from .common import _contains_datetime_like_objects
from .common import _contains_datetime_like_objects, ones_like
from .computation import apply_ufunc
from .duck_array_ops import dask_array_type
from .utils import OrderedSet, is_scalar
from .variable import Variable, broadcast_variables


def _get_nan_block_lengths(obj, dim: Hashable, index: Variable):
"""
Return an object where each NaN element in 'obj' is replaced by the
length of the gap the element is in.
"""

# make variable so that we get broadcasting for free
index = Variable([dim], index)

# algorithm from https://github.com/pydata/xarray/pull/3302#discussion_r324707072
arange = ones_like(obj) * index
valid = obj.notnull()
valid_arange = arange.where(valid)
cumulative_nans = valid_arange.ffill(dim=dim).fillna(index[0])

nan_block_lengths = (
cumulative_nans.diff(dim=dim, label="upper")
.reindex({dim: obj[dim]})
.where(valid)
.bfill(dim=dim)
.where(~valid, 0)
.fillna(index[-1] - valid_arange.max())
)

return nan_block_lengths


class BaseInterpolator:
"""Generic interpolator class for normalizing interpolation methods
"""
Expand Down Expand Up @@ -178,7 +206,7 @@ def _apply_over_vars_with_dim(func, self, dim=None, **kwargs):
return ds


def get_clean_interp_index(arr, dim, use_coordinate=True):
def get_clean_interp_index(arr, dim: Hashable, use_coordinate: Union[str, bool] = True):
"""get index to use for x values in interpolation.
If use_coordinate is True, the coordinate that shares the name of the
Expand All @@ -195,23 +223,33 @@ def get_clean_interp_index(arr, dim, use_coordinate=True):
index = arr.coords[use_coordinate]
if index.ndim != 1:
raise ValueError(
"Coordinates used for interpolation must be 1D, "
"%s is %dD." % (use_coordinate, index.ndim)
f"Coordinates used for interpolation must be 1D, "
f"{use_coordinate} is {index.ndim}D."
)
index = index.to_index()

# TODO: index.name is None for multiindexes
# set name for nice error messages below
if isinstance(index, pd.MultiIndex):
index.name = dim

if not index.is_monotonic:
raise ValueError(f"Index {index.name!r} must be monotonically increasing")

if not index.is_unique:
raise ValueError(f"Index {index.name!r} has duplicate values")

# raise if index cannot be cast to a float (e.g. MultiIndex)
try:
index = index.values.astype(np.float64)
except (TypeError, ValueError):
# pandas raises a TypeError
# xarray/nuppy raise a ValueError
# xarray/numpy raise a ValueError
raise TypeError(
"Index must be castable to float64 to support"
"interpolation, got: %s" % type(index)
f"Index {index.name!r} must be castable to float64 to support "
f"interpolation, got {type(index).__name__}."
)
# check index sorting now so we can skip it later
if not (np.diff(index) > 0).all():
raise ValueError("Index must be monotonicly increasing")

else:
axis = arr.get_axis_num(dim)
index = np.arange(arr.shape[axis], dtype=np.float64)
Expand All @@ -220,7 +258,13 @@ def get_clean_interp_index(arr, dim, use_coordinate=True):


def interp_na(
self, dim=None, use_coordinate=True, method="linear", limit=None, **kwargs
self,
dim: Hashable = None,
use_coordinate: Union[bool, str] = True,
method: str = "linear",
limit: int = None,
max_gap: Union[int, float, str, pd.Timedelta, np.timedelta64] = None,
**kwargs,
):
"""Interpolate values according to different methods.
"""
Expand All @@ -230,6 +274,40 @@ def interp_na(
if limit is not None:
valids = _get_valid_fill_mask(self, dim, limit)

if max_gap is not None:
max_type = type(max_gap).__name__
if not is_scalar(max_gap):
raise ValueError("max_gap must be a scalar.")

if (
dim in self.indexes
and isinstance(self.indexes[dim], pd.DatetimeIndex)
and use_coordinate
):
if not isinstance(max_gap, (np.timedelta64, pd.Timedelta, str)):
raise TypeError(
f"Underlying index is DatetimeIndex. Expected max_gap of type str, pandas.Timedelta or numpy.timedelta64 but received {max_type}"
)

if isinstance(max_gap, str):
try:
max_gap = pd.to_timedelta(max_gap)
except ValueError:
raise ValueError(
f"Could not convert {max_gap!r} to timedelta64 using pandas.to_timedelta"
)

if isinstance(max_gap, pd.Timedelta):
max_gap = np.timedelta64(max_gap.value, "ns")

max_gap = np.timedelta64(max_gap, "ns").astype(np.float64)

if not use_coordinate:
if not isinstance(max_gap, (Number, np.number)):
raise TypeError(
f"Expected integer or floating point max_gap since use_coordinate=False. Received {max_type}."
)

# method
index = get_clean_interp_index(self, dim, use_coordinate=use_coordinate)
interp_class, kwargs = _get_interpolator(method, **kwargs)
Expand All @@ -253,6 +331,14 @@ def interp_na(
if limit is not None:
arr = arr.where(valids)

if max_gap is not None:
if dim not in self.coords:
raise NotImplementedError(
"max_gap not implemented for unlabeled coordinates yet."
)
nan_block_lengths = _get_nan_block_lengths(self, dim, index)
arr = arr.where(nan_block_lengths <= max_gap)

return arr


Expand Down
Loading

0 comments on commit ee9da17

Please sign in to comment.