Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

implemented pad with new-indexes #4974

Closed
wants to merge 16 commits into from
Closed
4 changes: 4 additions & 0 deletions doc/whats-new.rst
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,10 @@ v0.17.1 (unreleased)

New Features
~~~~~~~~~~~~
- :py:meth:`DataArray.pad` and :py:meth:`Dataset.pad` now accept a tuple of index values
as the padding argument for a dimension. In this case, these values will be used as the newly extended parts
fujiisoup marked this conversation as resolved.
Show resolved Hide resolved
of the IndexVariable.
By `Keisuke Fujii <https://github.com/fujiisoup>`_.


Breaking changes
Expand Down
21 changes: 20 additions & 1 deletion xarray/core/dataarray.py
Original file line number Diff line number Diff line change
Expand Up @@ -3788,7 +3788,9 @@ def polyfit(

def pad(
self,
pad_width: Mapping[Hashable, Union[int, Tuple[int, int]]] = None,
pad_width: Mapping[
Hashable, Union[int, Tuple[Union[int, Sequence], Union[int, Sequence]]]
] = None,
mode: str = "constant",
stat_length: Union[
int, Tuple[int, int], Mapping[Hashable, Tuple[int, int]]
Expand Down Expand Up @@ -3818,6 +3820,11 @@ def pad(
Mapping with the form of {dim: (pad_before, pad_after)}
describing the number of values padded along each dimension.
{dim: pad} is a shortcut for pad_before = pad_after = pad
Note that having np.nan in an IndexVariable disables most of the useful
functionality of xarray. To avoid this problem, an iterable,
such as a list or np.array, can be used for either pad_before or pad_after.
In this case, its values are used as the new entries of the IndexVariable
(and its length as the pad width), which prevents this loss of functionality.
mode : str, default: "constant"
One of the following string values (taken from numpy docs)

Expand Down Expand Up @@ -3942,6 +3949,18 @@ def pad(
* x (x) float64 nan 0.0 1.0 nan
* y (y) int64 10 20 30 40
z (x) float64 nan 100.0 200.0 nan

>>> da.pad(x=([-2, -1], [2]))
dcherian marked this conversation as resolved.
Show resolved Hide resolved
<xarray.DataArray (x: 5, y: 4)>
array([[nan, nan, nan, nan],
[nan, nan, nan, nan],
[ 0., 1., 2., 3.],
[10., 11., 12., 13.],
[nan, nan, nan, nan]])
Coordinates:
* x (x) int64 -2 -1 0 1 2
* y (y) int64 10 20 30 40
z (x) float64 nan nan 100.0 200.0 nan
"""
ds = self._to_temp_dataset().pad(
pad_width=pad_width,
Expand Down
52 changes: 49 additions & 3 deletions xarray/core/dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -6496,7 +6496,9 @@ def polyfit(

def pad(
self,
pad_width: Mapping[Hashable, Union[int, Tuple[int, int]]] = None,
pad_width: Mapping[
Hashable, Union[int, Tuple[Union[int, Sequence], Union[int, Sequence]]]
] = None,
mode: str = "constant",
stat_length: Union[
int, Tuple[int, int], Mapping[Hashable, Tuple[int, int]]
Expand All @@ -6522,10 +6524,15 @@ def pad(

Parameters
----------
pad_width : mapping of hashable to tuple of int
pad_width : mapping of hashable to tuple of int or iterable
fujiisoup marked this conversation as resolved.
Show resolved Hide resolved
Mapping with the form of {dim: (pad_before, pad_after)}
describing the number of values padded along each dimension.
{dim: pad} is a shortcut for pad_before = pad_after = pad
Note that having np.nan in an IndexVariable disables most of the useful
functionality of xarray. To avoid this problem, an iterable,
fujiisoup marked this conversation as resolved.
Show resolved Hide resolved
such as a list or np.array, can be used for either pad_before or pad_after.
In this case, its values are used as the new entries of the IndexVariable,
fujiisoup marked this conversation as resolved.
Show resolved Hide resolved
which prevents this loss of functionality.
mode : str, default: "constant"
One of the following string values (taken from numpy docs).

Expand Down Expand Up @@ -6622,6 +6629,14 @@ def pad(
Dimensions without coordinates: x
Data variables:
foo (x) float64 nan 0.0 1.0 2.0 3.0 4.0 nan nan
>>> ds = xr.Dataset({"foo": ("x", range(3))}, coords={"x": [0, 1, 2]})
>>> ds.pad(x=([-1], [3]))
<xarray.Dataset>
Dimensions: (x: 5)
Coordinates:
* x (x) int64 -1 0 1 2 3
Data variables:
foo (x) float64 nan 0.0 1.0 2.0 nan
"""
pad_width = either_dict_or_kwargs(pad_width, pad_width_kwargs, "pad")

Expand All @@ -6638,8 +6653,25 @@ def pad(
coord_pad_options = {}

variables = {}

# standardize pad_width
pad_width_standardized = {} # type: Dict[Hashable, Tuple[int, int]]
for k, v in pad_width.items():
if not isinstance(v, int):
# if pad_width is a tuple of iterable, we use its length for
# pad_width_standardized
# mypy does not know the length here and infers Tuple[int, ...]
# see https://github.com/python/mypy/issues/7509
pad_width_standardized[k] = tuple( # type: ignore
len(v1) if isinstance(v1, Sequence) else v1 for v1 in v
)
else: # just an int
pad_width_standardized[k] = (v, v)

for name, var in self.variables.items():
var_pad_width = {k: v for k, v in pad_width.items() if k in var.dims}
var_pad_width = {
k: v for k, v in pad_width_standardized.items() if k in var.dims
}
if not var_pad_width:
variables[name] = var
elif name in self.data_vars:
Expand All @@ -6651,6 +6683,20 @@ def pad(
end_values=end_values,
reflect_type=reflect_type,
)
elif name in var_pad_width.keys() and not isinstance(
var_pad_width[name], int
): # dimension coordinates
w0, w1 = pad_width[name] # type: ignore
fill_value_ind = dtypes.get_fill_value(var.dtype)
if isinstance(w0, int):
w0_ = IndexVariable(name, [fill_value_ind] * w0)
else:
w0_ = IndexVariable(name, w0)
if isinstance(w1, int):
w1_ = IndexVariable(name, [fill_value_ind] * w1)
else:
w1_ = IndexVariable(name, w1)
variables[name] = var.concat([w0_, var, w1_], dim=name)
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm wondering whether this series of logic should be moved into IndexVariable.pad.

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes I think that would make sense.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thanks. Done.

else:
variables[name] = var.pad(
pad_width=var_pad_width,
Expand Down
17 changes: 17 additions & 0 deletions xarray/tests/test_dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -5798,6 +5798,23 @@ def test_pad(self):
np.testing.assert_equal(padded["var1"].isel(dim2=[0, -1]).data, 42)
np.testing.assert_equal(padded["dim2"][[0, -1]].data, np.nan)

def test_pad_index(self):
    """Padding with explicit index values must not introduce NaN labels.

    Each side of ``pad_width`` may be either an int (number of padded
    entries) or a sequence of new index values; all three combinations
    are exercised below.
    """
    ds = create_test_data(seed=1)

    # Sequence before, int after: three new labels are prepended to dim2.
    padded = ds.pad(dim2=([0, 1, 2], 0), constant_values=42)

    assert padded["dim2"].shape == (12,)
    assert padded["var1"].shape == (8, 12)
    assert padded["var2"].shape == (8, 12)
    assert padded["var3"].shape == (10, 8)
    assert dict(padded.dims) == {"dim1": 8, "dim2": 12, "dim3": 10, "time": 20}
    # NOTE: ``np.nan not in x`` is always True because NaN != NaN, so the
    # absence of NaN has to be checked with ``isnull`` instead.
    assert not padded["dim2"].isnull().any()

    # Int before, sequence after.
    padded = ds.pad(dim2=(0, [0, 1, 2]), constant_values=42)
    assert not padded["dim2"].isnull().any()

    # Sequences on both sides.
    padded = ds.pad(dim2=([0, 1], [0, 1, 2]), constant_values=42)
    assert not padded["dim2"].isnull().any()

def test_astype_attrs(self):
data = create_test_data(seed=123)
data.attrs["foo"] = "bar"
Expand Down