From ffc32755ca98d11208b6403d424ebcc2ba5bc4fa Mon Sep 17 00:00:00 2001 From: keewis Date: Sat, 9 Nov 2019 05:21:26 +0100 Subject: [PATCH 01/22] tests for datasets with units (#3447) * start writing the tests for dataset * add tests for initializing Datasets * add tests for aggregation methods / functions * add tests for the ndarray methods / properties * add tests for missing value handling methods * add tests for comparison methods * add tests for reordering / stacking the test for to_stacked_array seems a bit brittle * add tests for indexing methods * remove the commented out xfail on Dataset.squeeze * add tests for head, tail and thin * add tests for the computation methods * add tests for grouped operations * add tests for the content manipulation methods * fix reindex_like to actually expect errors where appropriate * use DataArray.copy to replicate a DataArray with different data * add tests for repr / str * remove the comment about moving the merge tests * construct a new data array instead of using `copy` which in combination with `assign_coords` make preserving `MultiIndex` instances much more complicated. * update whats-new.rst --- doc/whats-new.rst | 2 +- xarray/tests/test_units.py | 1744 +++++++++++++++++++++++++++++++++++- 2 files changed, 1740 insertions(+), 6 deletions(-) diff --git a/doc/whats-new.rst b/doc/whats-new.rst index 04fe88e9993..d2a4b32a71f 100644 --- a/doc/whats-new.rst +++ b/doc/whats-new.rst @@ -104,7 +104,7 @@ Internal Changes ~~~~~~~~~~~~~~~~ - Added integration tests against `pint `_. - (:pull:`3238`) by `Justus Magin `_. + (:pull:`3238`, :pull:`3447`) by `Justus Magin `_. .. note:: diff --git a/xarray/tests/test_units.py b/xarray/tests/test_units.py index 80063f8b4bc..8eed1f0dbe3 100644 --- a/xarray/tests/test_units.py +++ b/xarray/tests/test_units.py @@ -123,14 +123,19 @@ def extract_units(obj): def strip_units(obj): if isinstance(obj, xr.Dataset): - data_vars = {name: strip_units(value) for name, value in obj.data_vars.items()} - coords = {name: strip_units(value) for name, value in obj.coords.items()} + data_vars = { + strip_units(name): strip_units(value) + for name, value in obj.data_vars.items() + } + coords = { + strip_units(name): strip_units(value) for name, value in obj.coords.items() + } new_obj = xr.Dataset(data_vars=data_vars, coords=coords) elif isinstance(obj, xr.DataArray): data = array_strip_units(obj.data) coords = { - name: ( + strip_units(name): ( (value.dims, array_strip_units(value.data)) if isinstance(value.data, Quantity) else value # to preserve multiindexes @@ -138,9 +143,13 @@ def strip_units(obj): for name, value in obj.coords.items() } - new_obj = xr.DataArray(name=obj.name, data=data, coords=coords, dims=obj.dims) - elif hasattr(obj, "magnitude"): + new_obj = xr.DataArray( + name=strip_units(obj.name), data=data, coords=coords, dims=obj.dims + ) + elif isinstance(obj, unit_registry.Quantity): new_obj = obj.magnitude + elif isinstance(obj, (list, tuple)): + return type(obj)(strip_units(elem) for elem in obj) else: new_obj = obj @@ -191,6 +200,38 @@ def attach_units(obj, units): return new_obj +def convert_units(obj, to): + if isinstance(obj, xr.Dataset): + data_vars = { + name: convert_units(array, to) for name, array in obj.data_vars.items() + } + coords = {name: convert_units(array, to) for name, array in obj.coords.items()} + + new_obj = xr.Dataset(data_vars=data_vars, coords=coords, attrs=obj.attrs) + elif isinstance(obj, xr.DataArray): + name = obj.name + + new_units = ( + to.get(name, None) or to.get("data", None) or 
to.get(None, None) or 1 + ) + data = convert_units(obj.data, {None: new_units}) + + coords = { + name: (array.dims, convert_units(array.data, to)) + for name, array in obj.coords.items() + if name != obj.name + } + + new_obj = xr.DataArray(name=name, data=data, coords=coords, attrs=obj.attrs) + elif isinstance(obj, unit_registry.Quantity): + units = to.get(None) + new_obj = obj.to(units) if units is not None else obj + else: + new_obj = obj + + return new_obj + + def assert_equal_with_units(a, b): # works like xr.testing.assert_equal, but also explicitly checks units # so, it is more like assert_identical @@ -1632,3 +1673,1696 @@ def test_grouped_operations(self, func, dtype): result = func(data_array.groupby("y")) assert_equal_with_units(expected, result) + + +class TestDataset: + @pytest.mark.parametrize( + "unit,error", + ( + pytest.param(1, DimensionalityError, id="no_unit"), + pytest.param( + unit_registry.dimensionless, DimensionalityError, id="dimensionless" + ), + pytest.param(unit_registry.s, DimensionalityError, id="incompatible_unit"), + pytest.param(unit_registry.mm, None, id="compatible_unit"), + pytest.param(unit_registry.m, None, id="same_unit"), + ), + ) + @pytest.mark.parametrize( + "shared", + ( + "nothing", + pytest.param( + "dims", + marks=pytest.mark.xfail(reason="reindex does not work with pint yet"), + ), + pytest.param( + "coords", + marks=pytest.mark.xfail(reason="reindex does not work with pint yet"), + ), + ), + ) + def test_init(self, shared, unit, error, dtype): + original_unit = unit_registry.m + scaled_unit = unit_registry.mm + + a = np.linspace(0, 1, 10).astype(dtype) * unit_registry.Pa + b = np.linspace(-1, 0, 12).astype(dtype) * unit_registry.Pa + + raw_x = np.arange(a.shape[0]) + x = raw_x * original_unit + x2 = x.to(scaled_unit) + + raw_y = np.arange(b.shape[0]) + y = raw_y * unit + y_units = unit if isinstance(y, unit_registry.Quantity) else None + if isinstance(y, unit_registry.Quantity): + if y.check(scaled_unit): + y2 = y.to(scaled_unit) + else: + y2 = y * 1000 + y2_units = y2.units + else: + y2 = y * 1000 + y2_units = None + + variants = { + "nothing": ({"x": x, "x2": ("x", x2)}, {"y": y, "y2": ("y", y2)}), + "dims": ( + {"x": x, "x2": ("x", strip_units(x2))}, + {"x": y, "y2": ("x", strip_units(y2))}, + ), + "coords": ({"x": raw_x, "y": ("x", x2)}, {"x": raw_y, "y": ("x", y2)}), + } + coords_a, coords_b = variants.get(shared) + + dims_a, dims_b = ("x", "y") if shared == "nothing" else ("x", "x") + + arr1 = xr.DataArray(data=a, coords=coords_a, dims=dims_a) + arr2 = xr.DataArray(data=b, coords=coords_b, dims=dims_b) + if error is not None and shared != "nothing": + with pytest.raises(error): + xr.Dataset(data_vars={"a": arr1, "b": arr2}) + + return + + result = xr.Dataset(data_vars={"a": arr1, "b": arr2}) + + expected_units = { + "a": a.units, + "b": b.units, + "x": x.units, + "x2": x2.units, + "y": y_units, + "y2": y2_units, + } + expected = attach_units( + xr.Dataset(data_vars={"a": strip_units(arr1), "b": strip_units(arr2)}), + expected_units, + ) + assert_equal_with_units(result, expected) + + @pytest.mark.parametrize( + "func", (pytest.param(str, id="str"), pytest.param(repr, id="repr")) + ) + @pytest.mark.parametrize( + "variant", + ( + pytest.param( + "with_dims", + marks=pytest.mark.xfail(reason="units in indexes are not supported"), + ), + pytest.param("with_coords"), + pytest.param("without_coords"), + ), + ) + @pytest.mark.filterwarnings("error:::pint[.*]") + def test_repr(self, func, variant, dtype): + array1 = np.linspace(1, 2, 10, 
dtype=dtype) * unit_registry.Pa + array2 = np.linspace(0, 1, 10, dtype=dtype) * unit_registry.degK + + x = np.arange(len(array1)) * unit_registry.s + y = x.to(unit_registry.ms) + + variants = { + "with_dims": {"x": x}, + "with_coords": {"y": ("x", y)}, + "without_coords": {}, + } + + data_array = xr.Dataset( + data_vars={"a": ("x", array1), "b": ("x", array2)}, + coords=variants.get(variant), + ) + + # FIXME: this just checks that the repr does not raise + # warnings or errors, but does not check the result + func(data_array) + + @pytest.mark.parametrize( + "func", + ( + pytest.param( + function("all"), + marks=pytest.mark.xfail(reason="not implemented by pint"), + ), + pytest.param( + function("any"), + marks=pytest.mark.xfail(reason="not implemented by pint"), + ), + function("argmax"), + function("argmin"), + function("max"), + function("min"), + function("mean"), + pytest.param( + function("median"), + marks=pytest.mark.xfail( + reason="np.median does not work with dataset yet" + ), + ), + pytest.param( + function("sum"), + marks=pytest.mark.xfail( + reason="np.result_type not implemented by pint" + ), + ), + pytest.param( + function("prod"), + marks=pytest.mark.xfail(reason="not implemented by pint"), + ), + function("std"), + function("var"), + function("cumsum"), + pytest.param( + function("cumprod"), + marks=pytest.mark.xfail( + reason="pint does not support cumprod on non-dimensionless yet" + ), + ), + pytest.param( + method("all"), marks=pytest.mark.xfail(reason="not implemented by pint") + ), + pytest.param( + method("any"), marks=pytest.mark.xfail(reason="not implemented by pint") + ), + method("argmax"), + method("argmin"), + method("max"), + method("min"), + method("mean"), + method("median"), + pytest.param( + method("sum"), + marks=pytest.mark.xfail( + reason="np.result_type not implemented by pint" + ), + ), + pytest.param( + method("prod"), + marks=pytest.mark.xfail(reason="not implemented by pint"), + ), + method("std"), + method("var"), + method("cumsum"), + pytest.param( + method("cumprod"), + marks=pytest.mark.xfail( + reason="pint does not support cumprod on non-dimensionless yet" + ), + ), + ), + ids=repr, + ) + def test_aggregation(self, func, dtype): + unit_a = unit_registry.Pa + unit_b = unit_registry.kg / unit_registry.m ** 3 + a = xr.DataArray(data=np.linspace(0, 1, 10).astype(dtype) * unit_a, dims="x") + b = xr.DataArray(data=np.linspace(-1, 0, 10).astype(dtype) * unit_b, dims="x") + x = xr.DataArray(data=np.arange(10).astype(dtype) * unit_registry.m, dims="x") + y = xr.DataArray( + data=np.arange(10, 20).astype(dtype) * unit_registry.s, dims="x" + ) + + ds = xr.Dataset(data_vars={"a": a, "b": b}, coords={"x": x, "y": y}) + + result = func(ds) + expected = attach_units( + func(strip_units(ds)), + {"a": array_extract_units(func(a)), "b": array_extract_units(func(b))}, + ) + + assert_equal_with_units(result, expected) + + @pytest.mark.parametrize("property", ("imag", "real")) + def test_numpy_properties(self, property, dtype): + ds = xr.Dataset( + data_vars={ + "a": xr.DataArray( + data=np.linspace(0, 1, 10) * unit_registry.Pa, dims="x" + ), + "b": xr.DataArray( + data=np.linspace(-1, 0, 15) * unit_registry.Pa, dims="y" + ), + }, + coords={ + "x": np.arange(10) * unit_registry.m, + "y": np.arange(15) * unit_registry.s, + }, + ) + units = extract_units(ds) + + result = getattr(ds, property) + expected = attach_units(getattr(strip_units(ds), property), units) + + assert_equal_with_units(result, expected) + + @pytest.mark.parametrize( + "func", + ( + 
method("astype", float), + method("conj"), + method("argsort"), + method("conjugate"), + method("round"), + pytest.param( + method("rank", dim="x"), + marks=pytest.mark.xfail(reason="pint does not implement rank yet"), + ), + ), + ids=repr, + ) + def test_numpy_methods(self, func, dtype): + ds = xr.Dataset( + data_vars={ + "a": xr.DataArray( + data=np.linspace(1, -1, 10) * unit_registry.Pa, dims="x" + ), + "b": xr.DataArray( + data=np.linspace(-1, 1, 15) * unit_registry.Pa, dims="y" + ), + }, + coords={ + "x": np.arange(10) * unit_registry.m, + "y": np.arange(15) * unit_registry.s, + }, + ) + units = { + "a": array_extract_units(func(ds.a)), + "b": array_extract_units(func(ds.b)), + "x": unit_registry.m, + "y": unit_registry.s, + } + + result = func(ds) + expected = attach_units(func(strip_units(ds)), units) + + assert_equal_with_units(result, expected) + + @pytest.mark.parametrize("func", (method("clip", min=3, max=8),), ids=repr) + @pytest.mark.parametrize( + "unit,error", + ( + pytest.param(1, DimensionalityError, id="no_unit"), + pytest.param( + unit_registry.dimensionless, DimensionalityError, id="dimensionless" + ), + pytest.param(unit_registry.s, DimensionalityError, id="incompatible_unit"), + pytest.param(unit_registry.cm, None, id="compatible_unit"), + pytest.param(unit_registry.m, None, id="identical_unit"), + ), + ) + def test_numpy_methods_with_args(self, func, unit, error, dtype): + data_unit = unit_registry.m + ds = xr.Dataset( + data_vars={ + "a": xr.DataArray(data=np.arange(10) * data_unit, dims="x"), + "b": xr.DataArray(data=np.arange(15) * data_unit, dims="y"), + }, + coords={ + "x": np.arange(10) * unit_registry.m, + "y": np.arange(15) * unit_registry.s, + }, + ) + units = extract_units(ds) + + def strip(value): + return ( + value.magnitude if isinstance(value, unit_registry.Quantity) else value + ) + + def convert(value, to): + if isinstance(value, unit_registry.Quantity) and value.check(to): + return value.to(to) + + return value + + scalar_types = (int, float) + kwargs = { + key: (value * unit if isinstance(value, scalar_types) else value) + for key, value in func.kwargs.items() + } + + stripped_kwargs = { + key: strip(convert(value, data_unit)) for key, value in kwargs.items() + } + + if error is not None: + with pytest.raises(error): + func(ds, **kwargs) + + return + + result = func(ds, **kwargs) + expected = attach_units(func(strip_units(ds), **stripped_kwargs), units) + + assert_equal_with_units(result, expected) + + @pytest.mark.parametrize( + "func", (method("isnull"), method("notnull"), method("count")), ids=repr + ) + def test_missing_value_detection(self, func, dtype): + array1 = ( + np.array( + [ + [1.4, 2.3, np.nan, 7.2], + [np.nan, 9.7, np.nan, np.nan], + [2.1, np.nan, np.nan, 4.6], + [9.9, np.nan, 7.2, 9.1], + ] + ) + * unit_registry.degK + ) + array2 = ( + np.array( + [ + [np.nan, 5.7, 12.0, 7.2], + [np.nan, 12.4, np.nan, 4.2], + [9.8, np.nan, 4.6, 1.4], + [7.2, np.nan, 6.3, np.nan], + [8.4, 3.9, np.nan, np.nan], + ] + ) + * unit_registry.Pa + ) + + x = np.arange(array1.shape[0]) * unit_registry.m + y = np.arange(array1.shape[1]) * unit_registry.m + z = np.arange(array2.shape[0]) * unit_registry.m + + ds = xr.Dataset( + data_vars={ + "a": xr.DataArray(data=array1, dims=("x", "y")), + "b": xr.DataArray(data=array2, dims=("z", "x")), + }, + coords={"x": x, "y": y, "z": z}, + ) + + expected = func(strip_units(ds)) + result = func(ds) + + assert_equal_with_units(expected, result) + + @pytest.mark.xfail(reason="ffill and bfill lose the unit") + 
@pytest.mark.parametrize("func", (method("ffill"), method("bfill")), ids=repr) + def test_missing_value_filling(self, func, dtype): + array1 = ( + np.array([1.4, np.nan, 2.3, np.nan, np.nan, 9.1]).astype(dtype) + * unit_registry.degK + ) + array2 = ( + np.array([4.3, 9.8, 7.5, np.nan, 8.2, np.nan]).astype(dtype) + * unit_registry.Pa + ) + + x = np.arange(len(array1)) + + ds = xr.Dataset( + data_vars={ + "a": xr.DataArray(data=array1, dims="x"), + "b": xr.DataArray(data=array2, dims="x"), + }, + coords={"x": x}, + ) + + expected = attach_units( + func(strip_units(ds), dim="x"), + {"a": unit_registry.degK, "b": unit_registry.Pa}, + ) + result = func(ds, dim="x") + + assert_equal_with_units(expected, result) + + @pytest.mark.xfail(reason="fillna drops the unit") + @pytest.mark.parametrize( + "unit,error", + ( + pytest.param( + 1, + DimensionalityError, + id="no_unit", + marks=pytest.mark.xfail(reason="blocked by the failing `where`"), + ), + pytest.param( + unit_registry.dimensionless, DimensionalityError, id="dimensionless" + ), + pytest.param(unit_registry.s, DimensionalityError, id="incompatible_unit"), + pytest.param(unit_registry.cm, None, id="compatible_unit"), + pytest.param(unit_registry.m, None, id="identical_unit"), + ), + ) + @pytest.mark.parametrize( + "fill_value", + ( + pytest.param( + -1, + id="python scalar", + marks=pytest.mark.xfail( + reason="python scalar cannot be converted using astype()" + ), + ), + pytest.param(np.array(-1), id="numpy scalar"), + pytest.param(np.array([-1]), id="numpy array"), + ), + ) + def test_fillna(self, fill_value, unit, error, dtype): + array1 = ( + np.array([1.4, np.nan, 2.3, np.nan, np.nan, 9.1]).astype(dtype) + * unit_registry.m + ) + array2 = ( + np.array([4.3, 9.8, 7.5, np.nan, 8.2, np.nan]).astype(dtype) + * unit_registry.m + ) + ds = xr.Dataset( + data_vars={ + "a": xr.DataArray(data=array1, dims="x"), + "b": xr.DataArray(data=array2, dims="x"), + } + ) + + if error is not None: + with pytest.raises(error): + ds.fillna(value=fill_value * unit) + + return + + result = ds.fillna(value=fill_value * unit) + expected = attach_units( + strip_units(ds).fillna(value=fill_value), + {"a": unit_registry.m, "b": unit_registry.m}, + ) + + assert_equal_with_units(expected, result) + + def test_dropna(self, dtype): + array1 = ( + np.array([1.4, np.nan, 2.3, np.nan, np.nan, 9.1]).astype(dtype) + * unit_registry.degK + ) + array2 = ( + np.array([4.3, 9.8, 7.5, np.nan, 8.2, np.nan]).astype(dtype) + * unit_registry.Pa + ) + x = np.arange(len(array1)) + ds = xr.Dataset( + data_vars={ + "a": xr.DataArray(data=array1, dims="x"), + "b": xr.DataArray(data=array2, dims="x"), + }, + coords={"x": x}, + ) + + expected = attach_units( + strip_units(ds).dropna(dim="x"), + {"a": unit_registry.degK, "b": unit_registry.Pa}, + ) + result = ds.dropna(dim="x") + + assert_equal_with_units(expected, result) + + @pytest.mark.xfail(reason="pint does not implement `numpy.isin`") + @pytest.mark.parametrize( + "unit", + ( + pytest.param(1, id="no_unit"), + pytest.param(unit_registry.dimensionless, id="dimensionless"), + pytest.param(unit_registry.s, id="incompatible_unit"), + pytest.param(unit_registry.cm, id="compatible_unit"), + pytest.param(unit_registry.m, id="same_unit"), + ), + ) + def test_isin(self, unit, dtype): + array1 = ( + np.array([1.4, np.nan, 2.3, np.nan, np.nan, 9.1]).astype(dtype) + * unit_registry.m + ) + array2 = ( + np.array([4.3, 9.8, 7.5, np.nan, 8.2, np.nan]).astype(dtype) + * unit_registry.m + ) + x = np.arange(len(array1)) + ds = xr.Dataset( + data_vars={ 
+ "a": xr.DataArray(data=array1, dims="x"), + "b": xr.DataArray(data=array2, dims="x"), + }, + coords={"x": x}, + ) + + raw_values = np.array([1.4, np.nan, 2.3]).astype(dtype) + values = raw_values * unit + + if ( + isinstance(values, unit_registry.Quantity) + and values.check(unit_registry.m) + and unit != unit_registry.m + ): + raw_values = values.to(unit_registry.m).magnitude + + expected = strip_units(ds).isin(raw_values) + if not isinstance(values, unit_registry.Quantity) or not values.check( + unit_registry.m + ): + expected.a[:] = False + expected.b[:] = False + result = ds.isin(values) + + assert_equal_with_units(result, expected) + + @pytest.mark.parametrize( + "variant", + ( + pytest.param( + "masking", + marks=pytest.mark.xfail( + reason="np.result_type not implemented by quantity" + ), + ), + pytest.param( + "replacing_scalar", + marks=pytest.mark.xfail( + reason="python scalar not convertible using astype" + ), + ), + pytest.param( + "replacing_array", + marks=pytest.mark.xfail( + reason="replacing using an array drops the units" + ), + ), + pytest.param( + "dropping", + marks=pytest.mark.xfail(reason="nan not compatible with quantity"), + ), + ), + ) + @pytest.mark.parametrize( + "unit,error", + ( + pytest.param(1, DimensionalityError, id="no_unit"), + pytest.param( + unit_registry.dimensionless, DimensionalityError, id="dimensionless" + ), + pytest.param(unit_registry.s, DimensionalityError, id="incompatible_unit"), + pytest.param(unit_registry.cm, None, id="compatible_unit"), + pytest.param(unit_registry.m, None, id="same_unit"), + ), + ) + def test_where(self, variant, unit, error, dtype): + def _strip_units(mapping): + return {key: array_strip_units(value) for key, value in mapping.items()} + + original_unit = unit_registry.m + array1 = np.linspace(0, 1, 10).astype(dtype) * original_unit + array2 = np.linspace(-1, 0, 10).astype(dtype) * original_unit + + ds = xr.Dataset( + data_vars={ + "a": xr.DataArray(data=array1, dims="x"), + "b": xr.DataArray(data=array2, dims="x"), + }, + coords={"x": np.arange(len(array1))}, + ) + + condition = ds < 0.5 * original_unit + other = np.linspace(-2, -1, 10).astype(dtype) * unit + variant_kwargs = { + "masking": {"cond": condition}, + "replacing_scalar": {"cond": condition, "other": -1 * unit}, + "replacing_array": {"cond": condition, "other": other}, + "dropping": {"cond": condition, "drop": True}, + } + kwargs = variant_kwargs.get(variant) + kwargs_without_units = _strip_units(kwargs) + + if variant not in ("masking", "dropping") and error is not None: + with pytest.raises(error): + ds.where(**kwargs) + + return + + expected = attach_units( + strip_units(ds).where(**kwargs_without_units), + {"a": original_unit, "b": original_unit}, + ) + result = ds.where(**kwargs) + + assert_equal_with_units(expected, result) + + @pytest.mark.xfail(reason="interpolate strips units") + def test_interpolate_na(self, dtype): + array1 = ( + np.array([1.4, np.nan, 2.3, np.nan, np.nan, 9.1]).astype(dtype) + * unit_registry.degK + ) + array2 = ( + np.array([4.3, 9.8, 7.5, np.nan, 8.2, np.nan]).astype(dtype) + * unit_registry.Pa + ) + x = np.arange(len(array1)) + ds = xr.Dataset( + data_vars={ + "a": xr.DataArray(data=array1, dims="x"), + "b": xr.DataArray(data=array2, dims="x"), + }, + coords={"x": x}, + ) + + expected = attach_units( + strip_units(ds).interpolate_na(dim="x"), + {"a": unit_registry.degK, "b": unit_registry.Pa}, + ) + result = ds.interpolate_na(dim="x") + + assert_equal_with_units(expected, result) + + @pytest.mark.xfail(reason="uses 
Dataset.where, which currently fails") + @pytest.mark.parametrize( + "unit,error", + ( + pytest.param(1, DimensionalityError, id="no_unit"), + pytest.param( + unit_registry.dimensionless, DimensionalityError, id="dimensionless" + ), + pytest.param(unit_registry.s, DimensionalityError, id="incompatible_unit"), + pytest.param(unit_registry.cm, None, id="compatible_unit"), + pytest.param(unit_registry.m, None, id="same_unit"), + ), + ) + def test_combine_first(self, unit, error, dtype): + array1 = ( + np.array([1.4, np.nan, 2.3, np.nan, np.nan, 9.1]).astype(dtype) + * unit_registry.degK + ) + array2 = ( + np.array([4.3, 9.8, 7.5, np.nan, 8.2, np.nan]).astype(dtype) + * unit_registry.Pa + ) + x = np.arange(len(array1)) + ds = xr.Dataset( + data_vars={ + "a": xr.DataArray(data=array1, dims="x"), + "b": xr.DataArray(data=array2, dims="x"), + }, + coords={"x": x}, + ) + other_array1 = np.ones_like(array1) * unit + other_array2 = -1 * np.ones_like(array2) * unit + other = xr.Dataset( + data_vars={ + "a": xr.DataArray(data=other_array1, dims="x"), + "b": xr.DataArray(data=other_array2, dims="x"), + }, + coords={"x": np.arange(array1.shape[0])}, + ) + + if error is not None: + with pytest.raises(error): + ds.combine_first(other) + + return + + expected = attach_units( + strip_units(ds).combine_first(strip_units(other)), + {"a": unit_registry.m, "b": unit_registry.m}, + ) + result = ds.combine_first(other) + + assert_equal_with_units(expected, result) + + @pytest.mark.parametrize( + "unit", + ( + pytest.param(1, id="no_unit"), + pytest.param(unit_registry.dimensionless, id="dimensionless"), + pytest.param(unit_registry.s, id="incompatible_unit"), + pytest.param( + unit_registry.cm, + id="compatible_unit", + marks=pytest.mark.xfail(reason="identical does not check units yet"), + ), + pytest.param(unit_registry.m, id="identical_unit"), + ), + ) + @pytest.mark.parametrize( + "variation", + ( + "data", + pytest.param( + "dims", marks=pytest.mark.xfail(reason="units in indexes not supported") + ), + "coords", + ), + ) + @pytest.mark.parametrize("func", (method("equals"), method("identical")), ids=repr) + def test_comparisons(self, func, variation, unit, dtype): + array1 = np.linspace(0, 5, 10).astype(dtype) + array2 = np.linspace(-5, 0, 10).astype(dtype) + + coord = np.arange(len(array1)).astype(dtype) + + original_unit = unit_registry.m + quantity1 = array1 * original_unit + quantity2 = array2 * original_unit + x = coord * original_unit + y = coord * original_unit + + units = { + "data": (unit, original_unit, original_unit), + "dims": (original_unit, unit, original_unit), + "coords": (original_unit, original_unit, unit), + } + data_unit, dim_unit, coord_unit = units.get(variation) + + ds = xr.Dataset( + data_vars={ + "a": xr.DataArray(data=quantity1, dims="x"), + "b": xr.DataArray(data=quantity2, dims="x"), + }, + coords={"x": x, "y": ("x", y)}, + ) + + other = attach_units( + strip_units(ds), + { + "a": (data_unit, original_unit if quantity1.check(data_unit) else None), + "b": (data_unit, original_unit if quantity2.check(data_unit) else None), + "x": (dim_unit, original_unit if x.check(dim_unit) else None), + "y": (coord_unit, original_unit if y.check(coord_unit) else None), + }, + ) + + # TODO: test dim coord once indexes leave units intact + # also, express this in terms of calls on the raw data array + # and then check the units + equal_arrays = ( + np.all(ds.a.data == other.a.data) + and np.all(ds.b.data == other.b.data) + and (np.all(x == other.x.data) or True) # dims can't be checked yet + and 
np.all(y == other.y.data) + ) + equal_units = ( + data_unit == original_unit + and coord_unit == original_unit + and dim_unit == original_unit + ) + expected = equal_arrays and (func.name != "identical" or equal_units) + result = func(ds, other) + + assert expected == result + + @pytest.mark.parametrize( + "unit", + ( + pytest.param(1, id="no_unit"), + pytest.param(unit_registry.dimensionless, id="dimensionless"), + pytest.param(unit_registry.s, id="incompatible_unit"), + pytest.param(unit_registry.cm, id="compatible_unit"), + pytest.param(unit_registry.m, id="identical_unit"), + ), + ) + def test_broadcast_equals(self, unit, dtype): + left_array1 = np.ones(shape=(2, 3), dtype=dtype) * unit_registry.m + left_array2 = np.zeros(shape=(2, 6), dtype=dtype) * unit_registry.m + + right_array1 = array_attach_units( + np.ones(shape=(2,), dtype=dtype), + unit, + convert_from=unit_registry.m if left_array1.check(unit) else None, + ) + right_array2 = array_attach_units( + np.ones(shape=(2,), dtype=dtype), + unit, + convert_from=unit_registry.m if left_array2.check(unit) else None, + ) + + left = xr.Dataset( + data_vars={ + "a": xr.DataArray(data=left_array1, dims=("x", "y")), + "b": xr.DataArray(data=left_array2, dims=("x", "z")), + } + ) + right = xr.Dataset( + data_vars={ + "a": xr.DataArray(data=right_array1, dims="x"), + "b": xr.DataArray(data=right_array2, dims="x"), + } + ) + + expected = np.all(left_array1 == right_array1[:, None]) and np.all( + left_array2 == right_array2[:, None] + ) + result = left.broadcast_equals(right) + + assert expected == result + + @pytest.mark.parametrize( + "func", + (method("unstack"), method("reset_index", "v"), method("reorder_levels")), + ids=repr, + ) + def test_stacking_stacked(self, func, dtype): + array1 = ( + np.linspace(0, 10, 5 * 10).reshape(5, 10).astype(dtype) * unit_registry.m + ) + array2 = ( + np.linspace(-10, 0, 5 * 10 * 15).reshape(5, 10, 15).astype(dtype) + * unit_registry.m + ) + + x = np.arange(array1.shape[0]) + y = np.arange(array1.shape[1]) + z = np.arange(array2.shape[2]) + + ds = xr.Dataset( + data_vars={ + "a": xr.DataArray(data=array1, dims=("x", "y")), + "b": xr.DataArray(data=array2, dims=("x", "y", "z")), + }, + coords={"x": x, "y": y, "z": z}, + ) + + stacked = ds.stack(v=("x", "y")) + + expected = attach_units( + func(strip_units(stacked)), {"a": unit_registry.m, "b": unit_registry.m} + ) + result = func(stacked) + + assert_equal_with_units(expected, result) + + @pytest.mark.xfail(reason="tries to subscript scalar quantities") + def test_to_stacked_array(self, dtype): + labels = np.arange(5).astype(dtype) * unit_registry.s + arrays = {name: np.linspace(0, 1, 10) * unit_registry.m for name in labels} + + ds = xr.Dataset( + data_vars={ + name: xr.DataArray(data=array, dims="x") + for name, array in arrays.items() + } + ) + + func = method("to_stacked_array", "z", variable_dim="y", sample_dims=["x"]) + + result = func(ds).rename(None) + expected = attach_units( + func(strip_units(ds)).rename(None), + {None: unit_registry.m, "y": unit_registry.s}, + ) + + assert_equal_with_units(expected, result) + + @pytest.mark.parametrize( + "func", + ( + method("transpose", "y", "x", "z1", "z2"), + method("stack", a=("x", "y")), + method("set_index", x="x2"), + pytest.param( + method("shift", x=2), marks=pytest.mark.xfail(reason="sets all to nan") + ), + pytest.param( + method("roll", x=2, roll_coords=False), + marks=pytest.mark.xfail(reason="strips units"), + ), + method("sortby", "x2"), + ), + ids=repr, + ) + def test_stacking_reordering(self, 
func, dtype): + array1 = ( + np.linspace(0, 10, 2 * 5 * 10).reshape(2, 5, 10).astype(dtype) + * unit_registry.Pa + ) + array2 = ( + np.linspace(0, 10, 2 * 5 * 15).reshape(2, 5, 15).astype(dtype) + * unit_registry.degK + ) + + x = np.arange(array1.shape[0]) + y = np.arange(array1.shape[1]) + z1 = np.arange(array1.shape[2]) + z2 = np.arange(array2.shape[2]) + + x2 = np.linspace(0, 1, array1.shape[0])[::-1] + + ds = xr.Dataset( + data_vars={ + "a": xr.DataArray(data=array1, dims=("x", "y", "z1")), + "b": xr.DataArray(data=array2, dims=("x", "y", "z2")), + }, + coords={"x": x, "y": y, "z1": z1, "z2": z2, "x2": ("x", x2)}, + ) + + expected = attach_units( + func(strip_units(ds)), {"a": unit_registry.Pa, "b": unit_registry.degK} + ) + result = func(ds) + + assert_equal_with_units(expected, result) + + @pytest.mark.xfail(reason="indexes strip units") + @pytest.mark.parametrize( + "indices", + ( + pytest.param(4, id="single index"), + pytest.param([5, 2, 9, 1], id="multiple indices"), + ), + ) + def test_isel(self, indices, dtype): + array1 = np.arange(10).astype(dtype) * unit_registry.s + array2 = np.linspace(0, 1, 10).astype(dtype) * unit_registry.Pa + + x = np.arange(len(array1)) * unit_registry.m + ds = xr.Dataset( + data_vars={ + "a": xr.DataArray(data=array1, dims="x"), + "b": xr.DataArray(data=array2, dims="x"), + }, + coords={"x": x}, + ) + + expected = attach_units( + strip_units(ds).isel(x=indices), + {"a": unit_registry.s, "b": unit_registry.Pa, "x": unit_registry.m}, + ) + result = ds.isel(x=indices) + + assert_equal_with_units(expected, result) + + @pytest.mark.xfail( + reason="xarray does not support duck arrays in dimension coordinates" + ) + @pytest.mark.parametrize( + "values", + ( + pytest.param(12, id="single_value"), + pytest.param([10, 5, 13], id="list_of_values"), + pytest.param(np.array([9, 3, 7, 12]), id="array_of_values"), + ), + ) + @pytest.mark.parametrize( + "units,error", + ( + pytest.param(1, KeyError, id="no_units"), + pytest.param(unit_registry.dimensionless, KeyError, id="dimensionless"), + pytest.param(unit_registry.degree, KeyError, id="incompatible_unit"), + pytest.param(unit_registry.ms, KeyError, id="compatible_unit"), + pytest.param(unit_registry.s, None, id="same_unit"), + ), + ) + def test_sel(self, values, units, error, dtype): + array1 = np.linspace(5, 10, 20).astype(dtype) * unit_registry.degK + array2 = np.linspace(0, 5, 20).astype(dtype) * unit_registry.Pa + x = np.arange(len(array1)) * unit_registry.s + + ds = xr.Dataset( + data_vars={ + "a": xr.DataArray(data=array1, dims="x"), + "b": xr.DataArray(data=array2, dims="x"), + }, + coords={"x": x}, + ) + + values_with_units = values * units + + if error is not None: + with pytest.raises(error): + ds.sel(x=values_with_units) + + return + + expected = attach_units( + strip_units(ds).sel(x=values), + {"a": unit_registry.degK, "b": unit_registry.Pa, "x": unit_registry.s}, + ) + result = ds.sel(x=values_with_units) + assert_equal_with_units(expected, result) + + @pytest.mark.xfail( + reason="xarray does not support duck arrays in dimension coordinates" + ) + @pytest.mark.parametrize( + "values", + ( + pytest.param(12, id="single value"), + pytest.param([10, 5, 13], id="list of multiple values"), + pytest.param(np.array([9, 3, 7, 12]), id="array of multiple values"), + ), + ) + @pytest.mark.parametrize( + "units,error", + ( + pytest.param(1, KeyError, id="no_units"), + pytest.param(unit_registry.dimensionless, KeyError, id="dimensionless"), + pytest.param(unit_registry.degree, KeyError, 
id="incompatible_unit"), + pytest.param(unit_registry.ms, KeyError, id="compatible_unit"), + pytest.param(unit_registry.s, None, id="same_unit"), + ), + ) + def test_loc(self, values, units, error, dtype): + array1 = np.linspace(5, 10, 20).astype(dtype) * unit_registry.degK + array2 = np.linspace(0, 5, 20).astype(dtype) * unit_registry.Pa + x = np.arange(len(array1)) * unit_registry.s + + ds = xr.Dataset( + data_vars={ + "a": xr.DataArray(data=array1, dims="x"), + "b": xr.DataArray(data=array2, dims="x"), + }, + coords={"x": x}, + ) + + values_with_units = values * units + + if error is not None: + with pytest.raises(error): + ds.loc[{"x": values_with_units}] + + return + + expected = attach_units( + strip_units(ds).loc[{"x": values}], + {"a": unit_registry.degK, "b": unit_registry.Pa, "x": unit_registry.s}, + ) + result = ds.loc[{"x": values_with_units}] + assert_equal_with_units(expected, result) + + @pytest.mark.xfail( + reason="indexes strip units and head / tail / thin only support integers" + ) + @pytest.mark.parametrize( + "unit,error", + ( + pytest.param(1, DimensionalityError, id="no_unit"), + pytest.param( + unit_registry.dimensionless, DimensionalityError, id="dimensionless" + ), + pytest.param(unit_registry.s, DimensionalityError, id="incompatible_unit"), + pytest.param(unit_registry.cm, None, id="compatible_unit"), + pytest.param(unit_registry.m, None, id="identical_unit"), + ), + ) + @pytest.mark.parametrize( + "func", + ( + method("head", x=7, y=3, z=6), + method("tail", x=7, y=3, z=6), + method("thin", x=7, y=3, z=6), + ), + ids=repr, + ) + def test_head_tail_thin(self, func, unit, error, dtype): + array1 = np.linspace(1, 2, 10 * 5).reshape(10, 5) * unit_registry.degK + array2 = np.linspace(1, 2, 10 * 8).reshape(10, 8) * unit_registry.Pa + + coords = { + "x": np.arange(10) * unit_registry.m, + "y": np.arange(5) * unit_registry.m, + "z": np.arange(8) * unit_registry.m, + } + + ds = xr.Dataset( + data_vars={ + "a": xr.DataArray(data=array1, dims=("x", "y")), + "b": xr.DataArray(data=array2, dims=("x", "z")), + }, + coords=coords, + ) + + kwargs = {name: value * unit for name, value in func.kwargs.items()} + + if error is not None: + with pytest.raises(error): + func(ds, **kwargs) + + return + + expected = attach_units(func(strip_units(ds)), extract_units(ds)) + result = func(ds, **kwargs) + + assert_equal_with_units(expected, result) + + @pytest.mark.parametrize( + "shape", + ( + pytest.param((10, 20), id="nothing squeezable"), + pytest.param((10, 20, 1), id="last dimension squeezable"), + pytest.param((10, 1, 20), id="middle dimension squeezable"), + pytest.param((1, 10, 20), id="first dimension squeezable"), + pytest.param((1, 10, 1, 20), id="first and last dimension squeezable"), + ), + ) + def test_squeeze(self, shape, dtype): + names = "xyzt" + coords = { + name: np.arange(length).astype(dtype) + * (unit_registry.m if name != "t" else unit_registry.s) + for name, length in zip(names, shape) + } + array1 = ( + np.linspace(0, 1, 10 * 20).astype(dtype).reshape(shape) * unit_registry.degK + ) + array2 = ( + np.linspace(1, 2, 10 * 20).astype(dtype).reshape(shape) * unit_registry.Pa + ) + + ds = xr.Dataset( + data_vars={ + "a": xr.DataArray(data=array1, dims=tuple(names[: len(shape)])), + "b": xr.DataArray(data=array2, dims=tuple(names[: len(shape)])), + }, + coords=coords, + ) + units = extract_units(ds) + + expected = attach_units(strip_units(ds).squeeze(), units) + + result = ds.squeeze() + assert_equal_with_units(result, expected) + + # try squeezing the dimensions 
separately + names = tuple(dim for dim, coord in coords.items() if len(coord) == 1) + for name in names: + expected = attach_units(strip_units(ds).squeeze(dim=name), units) + result = ds.squeeze(dim=name) + assert_equal_with_units(result, expected) + + @pytest.mark.xfail(reason="ignores units") + @pytest.mark.parametrize( + "unit,error", + ( + pytest.param(1, DimensionalityError, id="no_unit"), + pytest.param( + unit_registry.dimensionless, DimensionalityError, id="dimensionless" + ), + pytest.param(unit_registry.s, DimensionalityError, id="incompatible_unit"), + pytest.param(unit_registry.cm, None, id="compatible_unit"), + pytest.param(unit_registry.m, None, id="identical_unit"), + ), + ) + def test_interp(self, unit, error): + array1 = np.linspace(1, 2, 10 * 5).reshape(10, 5) * unit_registry.degK + array2 = np.linspace(1, 2, 10 * 8).reshape(10, 8) * unit_registry.Pa + + coords = { + "x": np.arange(10) * unit_registry.m, + "y": np.arange(5) * unit_registry.m, + "z": np.arange(8) * unit_registry.s, + } + + ds = xr.Dataset( + data_vars={ + "a": xr.DataArray(data=array1, dims=("x", "y")), + "b": xr.DataArray(data=array2, dims=("x", "z")), + }, + coords=coords, + ) + + new_coords = (np.arange(10) + 0.5) * unit + + if error is not None: + with pytest.raises(error): + ds.interp(x=new_coords) + + return + + expected = attach_units( + strip_units(ds).interp(x=strip_units(new_coords)), extract_units(ds) + ) + result = ds.interp(x=new_coords) + + assert_equal_with_units(result, expected) + + @pytest.mark.xfail(reason="ignores units") + @pytest.mark.parametrize( + "unit,error", + ( + pytest.param(1, DimensionalityError, id="no_unit"), + pytest.param( + unit_registry.dimensionless, DimensionalityError, id="dimensionless" + ), + pytest.param(unit_registry.s, DimensionalityError, id="incompatible_unit"), + pytest.param(unit_registry.cm, None, id="compatible_unit"), + pytest.param(unit_registry.m, None, id="identical_unit"), + ), + ) + def test_interp_like(self, unit, error, dtype): + array1 = ( + np.linspace(0, 10, 10 * 5).reshape(10, 5).astype(dtype) * unit_registry.degK + ) + array2 = ( + np.linspace(10, 20, 10 * 8).reshape(10, 8).astype(dtype) * unit_registry.Pa + ) + + coords = { + "x": np.arange(10) * unit_registry.m, + "y": np.arange(5) * unit_registry.m, + "z": np.arange(8) * unit_registry.m, + } + + ds = xr.Dataset( + data_vars={ + "a": xr.DataArray(data=array1, dims=("x", "y")), + "b": xr.DataArray(data=array2, dims=("x", "z")), + }, + coords=coords, + ) + + other = xr.Dataset( + data_vars={ + "c": xr.DataArray(data=np.empty((20, 10)), dims=("x", "y")), + "d": xr.DataArray(data=np.empty((20, 15)), dims=("x", "z")), + }, + coords={ + "x": (np.arange(20) + 0.3) * unit, + "y": (np.arange(10) - 0.2) * unit, + "z": (np.arange(15) + 0.4) * unit, + }, + ) + + if error is not None: + with pytest.raises(error): + ds.interp_like(other) + + return + + expected = attach_units( + strip_units(ds).interp_like(strip_units(other)), extract_units(ds) + ) + result = ds.interp_like(other) + + assert_equal_with_units(result, expected) + + @pytest.mark.xfail( + reason="pint does not implement np.result_type in __array_function__ yet" + ) + @pytest.mark.parametrize( + "unit,error", + ( + pytest.param(1, DimensionalityError, id="no_unit"), + pytest.param( + unit_registry.dimensionless, DimensionalityError, id="dimensionless" + ), + pytest.param(unit_registry.s, DimensionalityError, id="incompatible_unit"), + pytest.param(unit_registry.cm, None, id="compatible_unit"), + pytest.param(unit_registry.m, None, 
id="identical_unit"), + ), + ) + def test_reindex(self, unit, error): + array1 = np.linspace(1, 2, 10 * 5).reshape(10, 5) * unit_registry.degK + array2 = np.linspace(1, 2, 10 * 8).reshape(10, 8) * unit_registry.Pa + + coords = { + "x": np.arange(10) * unit_registry.m, + "y": np.arange(5) * unit_registry.m, + "z": np.arange(8) * unit_registry.s, + } + + ds = xr.Dataset( + data_vars={ + "a": xr.DataArray(data=array1, dims=("x", "y")), + "b": xr.DataArray(data=array2, dims=("x", "z")), + }, + coords=coords, + ) + + new_coords = (np.arange(10) + 0.5) * unit + + if error is not None: + with pytest.raises(error): + ds.interp(x=new_coords) + + return + + expected = attach_units( + strip_units(ds).reindex(x=strip_units(new_coords)), extract_units(ds) + ) + result = ds.reindex(x=new_coords) + + assert_equal_with_units(result, expected) + + @pytest.mark.xfail( + reason="pint does not implement np.result_type in __array_function__ yet" + ) + @pytest.mark.parametrize( + "unit,error", + ( + pytest.param(1, DimensionalityError, id="no_unit"), + pytest.param( + unit_registry.dimensionless, DimensionalityError, id="dimensionless" + ), + pytest.param(unit_registry.s, DimensionalityError, id="incompatible_unit"), + pytest.param(unit_registry.cm, None, id="compatible_unit"), + pytest.param(unit_registry.m, None, id="identical_unit"), + ), + ) + def test_reindex_like(self, unit, error, dtype): + array1 = ( + np.linspace(0, 10, 10 * 5).reshape(10, 5).astype(dtype) * unit_registry.degK + ) + array2 = ( + np.linspace(10, 20, 10 * 8).reshape(10, 8).astype(dtype) * unit_registry.Pa + ) + + coords = { + "x": np.arange(10) * unit_registry.m, + "y": np.arange(5) * unit_registry.m, + "z": np.arange(8) * unit_registry.m, + } + + ds = xr.Dataset( + data_vars={ + "a": xr.DataArray(data=array1, dims=("x", "y")), + "b": xr.DataArray(data=array2, dims=("x", "z")), + }, + coords=coords, + ) + + other = xr.Dataset( + data_vars={ + "c": xr.DataArray(data=np.empty((20, 10)), dims=("x", "y")), + "d": xr.DataArray(data=np.empty((20, 15)), dims=("x", "z")), + }, + coords={ + "x": (np.arange(20) + 0.3) * unit, + "y": (np.arange(10) - 0.2) * unit, + "z": (np.arange(15) + 0.4) * unit, + }, + ) + + if error is not None: + with pytest.raises(error): + ds.reindex_like(other) + + return + + expected = attach_units( + strip_units(ds).reindex_like(strip_units(other)), extract_units(ds) + ) + result = ds.reindex_like(other) + + assert_equal_with_units(result, expected) + + @pytest.mark.parametrize( + "func", + ( + method("diff", dim="x"), + method("differentiate", coord="x"), + method("integrate", coord="x"), + pytest.param( + method("quantile", q=[0.25, 0.75]), + marks=pytest.mark.xfail( + reason="pint does not implement nanpercentile yet" + ), + ), + pytest.param( + method("reduce", func=np.sum, dim="x"), + marks=pytest.mark.xfail(reason="strips units"), + ), + pytest.param( + method("apply", np.fabs), + marks=pytest.mark.xfail(reason="fabs strips units"), + ), + ), + ids=repr, + ) + def test_computation(self, func, dtype): + array1 = ( + np.linspace(-5, 5, 10 * 5).reshape(10, 5).astype(dtype) * unit_registry.degK + ) + array2 = ( + np.linspace(10, 20, 10 * 8).reshape(10, 8).astype(dtype) * unit_registry.Pa + ) + x = np.arange(10) * unit_registry.m + y = np.arange(5) * unit_registry.m + z = np.arange(8) * unit_registry.m + + ds = xr.Dataset( + data_vars={ + "a": xr.DataArray(data=array1, dims=("x", "y")), + "b": xr.DataArray(data=array2, dims=("x", "z")), + }, + coords={"x": x, "y": y, "z": z}, + ) + + units = extract_units(ds) + + 
expected = attach_units(func(strip_units(ds)), units) + result = func(ds) + + assert_equal_with_units(expected, result) + + @pytest.mark.parametrize( + "func", + ( + pytest.param( + method("groupby", "x"), marks=pytest.mark.xfail(reason="strips units") + ), + pytest.param( + method("groupby_bins", "x", bins=4), + marks=pytest.mark.xfail(reason="strips units"), + ), + method("coarsen", x=2), + pytest.param( + method("rolling", x=3), marks=pytest.mark.xfail(reason="strips units") + ), + pytest.param( + method("rolling_exp", x=3), + marks=pytest.mark.xfail(reason="strips units"), + ), + ), + ids=repr, + ) + def test_computation_objects(self, func, dtype): + array1 = ( + np.linspace(-5, 5, 10 * 5).reshape(10, 5).astype(dtype) * unit_registry.degK + ) + array2 = ( + np.linspace(10, 20, 10 * 5 * 8).reshape(10, 5, 8).astype(dtype) + * unit_registry.Pa + ) + x = np.arange(10) * unit_registry.m + y = np.arange(5) * unit_registry.m + z = np.arange(8) * unit_registry.m + + ds = xr.Dataset( + data_vars={ + "a": xr.DataArray(data=array1, dims=("x", "y")), + "b": xr.DataArray(data=array2, dims=("x", "y", "z")), + }, + coords={"x": x, "y": y, "z": z}, + ) + units = extract_units(ds) + + args = [] if func.name != "groupby" else ["y"] + reduce_func = method("mean", *args) + expected = attach_units(reduce_func(func(strip_units(ds))), units) + result = reduce_func(func(ds)) + + assert_equal_with_units(expected, result) + + @pytest.mark.xfail(reason="strips units") + def test_resample(self, dtype): + array1 = ( + np.linspace(-5, 5, 10 * 5).reshape(10, 5).astype(dtype) * unit_registry.degK + ) + array2 = ( + np.linspace(10, 20, 10 * 8).reshape(10, 8).astype(dtype) * unit_registry.Pa + ) + t = pd.date_range("10-09-2010", periods=array1.shape[0], freq="1y") + y = np.arange(5) * unit_registry.m + z = np.arange(8) * unit_registry.m + + ds = xr.Dataset( + data_vars={ + "a": xr.DataArray(data=array1, dims=("time", "y")), + "b": xr.DataArray(data=array2, dims=("time", "z")), + }, + coords={"time": t, "y": y, "z": z}, + ) + units = extract_units(ds) + + func = method("resample", time="6m") + + expected = attach_units(func(strip_units(ds)).mean(), units) + result = func(ds).mean() + + assert_equal_with_units(expected, result) + + @pytest.mark.parametrize( + "func", + ( + pytest.param( + method("assign", c=lambda ds: 10 * ds.b), + marks=pytest.mark.xfail(reason="strips units"), + ), + pytest.param( + method("assign_coords", v=("x", np.arange(10) * unit_registry.s)), + marks=pytest.mark.xfail(reason="strips units"), + ), + pytest.param(method("first")), + pytest.param(method("last")), + pytest.param( + method("quantile", q=[0.25, 0.5, 0.75], dim="x"), + marks=pytest.mark.xfail( + reason="dataset groupby does not implement quantile" + ), + ), + ), + ids=repr, + ) + def test_grouped_operations(self, func, dtype): + array1 = ( + np.linspace(-5, 5, 10 * 5).reshape(10, 5).astype(dtype) * unit_registry.degK + ) + array2 = ( + np.linspace(10, 20, 10 * 5 * 8).reshape(10, 5, 8).astype(dtype) + * unit_registry.Pa + ) + x = np.arange(10) * unit_registry.m + y = np.arange(5) * unit_registry.m + z = np.arange(8) * unit_registry.m + + ds = xr.Dataset( + data_vars={ + "a": xr.DataArray(data=array1, dims=("x", "y")), + "b": xr.DataArray(data=array2, dims=("x", "y", "z")), + }, + coords={"x": x, "y": y, "z": z}, + ) + units = extract_units(ds) + units.update({"c": unit_registry.Pa, "v": unit_registry.s}) + + stripped_kwargs = { + name: strip_units(value) for name, value in func.kwargs.items() + } + expected = attach_units( + 
func(strip_units(ds).groupby("y"), **stripped_kwargs), units + ) + result = func(ds.groupby("y")) + + assert_equal_with_units(expected, result) + + @pytest.mark.parametrize( + "func", + ( + method("pipe", lambda ds: ds * 10), + method("assign", d=lambda ds: ds.b * 10), + method("assign_coords", y2=("y", np.arange(5) * unit_registry.mm)), + method("assign_attrs", attr1="value"), + method("rename", x2="x_mm"), + method("rename_vars", c="temperature"), + method("rename_dims", x="offset_x"), + method("swap_dims", {"x": "x2"}), + method("expand_dims", v=np.linspace(10, 20, 12) * unit_registry.s, axis=1), + method("drop", labels="x"), + method("drop_dims", "z"), + method("set_coords", names="c"), + method("reset_coords", names="x2"), + method("copy"), + ), + ids=repr, + ) + def test_content_manipulation(self, func, dtype): + array1 = ( + np.linspace(-5, 5, 10 * 5).reshape(10, 5).astype(dtype) + * unit_registry.m ** 3 + ) + array2 = ( + np.linspace(10, 20, 10 * 5 * 8).reshape(10, 5, 8).astype(dtype) + * unit_registry.Pa + ) + array3 = np.linspace(0, 10, 10).astype(dtype) * unit_registry.degK + + x = np.arange(10) * unit_registry.m + x2 = x.to(unit_registry.mm) + y = np.arange(5) * unit_registry.m + z = np.arange(8) * unit_registry.m + + ds = xr.Dataset( + data_vars={ + "a": xr.DataArray(data=array1, dims=("x", "y")), + "b": xr.DataArray(data=array2, dims=("x", "y", "z")), + "c": xr.DataArray(data=array3, dims="x"), + }, + coords={"x": x, "y": y, "z": z, "x2": ("x", x2)}, + ) + units = extract_units(ds) + units.update( + { + "y2": unit_registry.mm, + "x_mm": unit_registry.mm, + "offset_x": unit_registry.m, + "d": unit_registry.Pa, + "temperature": unit_registry.degK, + } + ) + + stripped_kwargs = { + key: strip_units(value) for key, value in func.kwargs.items() + } + expected = attach_units(func(strip_units(ds), **stripped_kwargs), units) + result = func(ds) + + assert_equal_with_units(expected, result) + + @pytest.mark.xfail(reason="blocked by reindex") + @pytest.mark.parametrize( + "unit,error", + ( + pytest.param(1, xr.MergeError, id="no_unit"), + pytest.param( + unit_registry.dimensionless, xr.MergeError, id="dimensionless" + ), + pytest.param(unit_registry.s, xr.MergeError, id="incompatible_unit"), + pytest.param(unit_registry.cm, xr.MergeError, id="compatible_unit"), + pytest.param(unit_registry.m, None, id="identical_unit"), + ), + ) + @pytest.mark.parametrize("variant", ("data", "dims", "coords")) + def test_merge(self, variant, unit, error, dtype): + original_data_unit = unit_registry.m + original_dim_unit = unit_registry.m + original_coord_unit = unit_registry.m + + variants = { + "data": (unit, original_dim_unit, original_coord_unit), + "dims": (original_data_unit, unit, original_coord_unit), + "coords": (original_data_unit, original_dim_unit, unit), + } + data_unit, dim_unit, coord_unit = variants.get(variant) + + left_array = np.arange(10).astype(dtype) * original_data_unit + right_array = np.arange(-5, 5).astype(dtype) * data_unit + + left_dim = np.arange(10, 20) * original_dim_unit + right_dim = np.arange(5, 15) * dim_unit + + left_coord = np.arange(-10, 0) * original_coord_unit + right_coord = np.arange(-15, -5) * coord_unit + + left = xr.Dataset( + data_vars={"a": ("x", left_array)}, + coords={"x": left_dim, "y": ("x", left_coord)}, + ) + right = xr.Dataset( + data_vars={"a": ("x", right_array)}, + coords={"x": right_dim, "y": ("x", right_coord)}, + ) + + units = extract_units(left) + + if error is not None: + with pytest.raises(error): + left.merge(right) + + return + + converted 
= convert_units(right, units) + expected = attach_units(strip_units(left).merge(strip_units(converted)), units) + result = left.merge(right) + + assert_equal_with_units(expected, result) From db0f13d194845b06fa82f64574d0e78d8449ddbe Mon Sep 17 00:00:00 2001 From: Maximilian Roos <5635139+max-sixty@users.noreply.github.com> Date: Sat, 9 Nov 2019 16:10:22 -0500 Subject: [PATCH 02/22] Dataset.map, GroupBy.map, Resample.map (#3459) * rename dataset.apply to dataset.map, deprecating apply * use apply in deprecation test * adjust docs * add groupby rename, remove depreciation warnings (to pending) * change internal usages * formatting * whatsnew * docs * docs * internal usages * formatting * docstring, see also --- doc/computation.rst | 4 +-- doc/groupby.rst | 15 ++++++----- doc/howdoi.rst | 2 +- doc/quick-overview.rst | 2 +- doc/whats-new.rst | 7 +++++ xarray/core/dataarray.py | 11 +++++--- xarray/core/dataset.py | 34 ++++++++++++++++++++--- xarray/core/groupby.py | 49 +++++++++++++++++++++++++++------- xarray/core/resample.py | 43 ++++++++++++++++++++++++++--- xarray/tests/test_dataarray.py | 36 ++++++++++++------------- xarray/tests/test_dataset.py | 43 ++++++++++++++++------------- xarray/tests/test_groupby.py | 14 +++++----- xarray/tests/test_sparse.py | 2 +- 13 files changed, 186 insertions(+), 76 deletions(-) diff --git a/doc/computation.rst b/doc/computation.rst index ae5f4bc5c66..d477cb63d72 100644 --- a/doc/computation.rst +++ b/doc/computation.rst @@ -462,13 +462,13 @@ Datasets support most of the same methods found on data arrays: abs(ds) Datasets also support NumPy ufuncs (requires NumPy v1.13 or newer), or -alternatively you can use :py:meth:`~xarray.Dataset.apply` to apply a function +alternatively you can use :py:meth:`~xarray.Dataset.map` to map a function to each variable in a dataset: .. ipython:: python np.sin(ds) - ds.apply(np.sin) + ds.map(np.sin) Datasets also use looping over variables for *broadcasting* in binary arithmetic. You can do arithmetic between any ``DataArray`` and a dataset: diff --git a/doc/groupby.rst b/doc/groupby.rst index 52a27f4f160..f5943703765 100644 --- a/doc/groupby.rst +++ b/doc/groupby.rst @@ -35,10 +35,11 @@ Let's create a simple example dataset: .. ipython:: python - ds = xr.Dataset({'foo': (('x', 'y'), np.random.rand(4, 3))}, - coords={'x': [10, 20, 30, 40], - 'letters': ('x', list('abba'))}) - arr = ds['foo'] + ds = xr.Dataset( + {"foo": (("x", "y"), np.random.rand(4, 3))}, + coords={"x": [10, 20, 30, 40], "letters": ("x", list("abba"))}, + ) + arr = ds["foo"] ds If we groupby the name of a variable or coordinate in a dataset (we can also @@ -93,7 +94,7 @@ Apply ~~~~~ To apply a function to each group, you can use the flexible -:py:meth:`~xarray.DatasetGroupBy.apply` method. The resulting objects are automatically +:py:meth:`~xarray.DatasetGroupBy.map` method. The resulting objects are automatically concatenated back together along the group axis: .. ipython:: python @@ -101,7 +102,7 @@ concatenated back together along the group axis: def standardize(x): return (x - x.mean()) / x.std() - arr.groupby('letters').apply(standardize) + arr.groupby('letters').map(standardize) GroupBy objects also have a :py:meth:`~xarray.DatasetGroupBy.reduce` method and methods like :py:meth:`~xarray.DatasetGroupBy.mean` as shortcuts for applying an @@ -202,7 +203,7 @@ __ http://cfconventions.org/cf-conventions/v1.6.0/cf-conventions.html#_two_dimen dims=['ny','nx']) da da.groupby('lon').sum(...) 
- da.groupby('lon').apply(lambda x: x - x.mean(), shortcut=False) + da.groupby('lon').map(lambda x: x - x.mean(), shortcut=False) Because multidimensional groups have the ability to generate a very large number of bins, coarse-binning via :py:meth:`~xarray.Dataset.groupby_bins` diff --git a/doc/howdoi.rst b/doc/howdoi.rst index 721d1323e73..91644ba2718 100644 --- a/doc/howdoi.rst +++ b/doc/howdoi.rst @@ -44,7 +44,7 @@ How do I ... * - convert a possibly irregularly sampled timeseries to a regularly sampled timeseries - :py:meth:`DataArray.resample`, :py:meth:`Dataset.resample` (see :ref:`resampling` for more) * - apply a function on all data variables in a Dataset - - :py:meth:`Dataset.apply` + - :py:meth:`Dataset.map` * - write xarray objects with complex values to a netCDF file - :py:func:`Dataset.to_netcdf`, :py:func:`DataArray.to_netcdf` specifying ``engine="h5netcdf", invalid_netcdf=True`` * - make xarray objects look like other xarray objects diff --git a/doc/quick-overview.rst b/doc/quick-overview.rst index 7d84199323d..741b3d1a5fe 100644 --- a/doc/quick-overview.rst +++ b/doc/quick-overview.rst @@ -142,7 +142,7 @@ xarray supports grouped operations using a very similar API to pandas (see :ref: labels = xr.DataArray(['E', 'F', 'E'], [data.coords['y']], name='labels') labels data.groupby(labels).mean('y') - data.groupby(labels).apply(lambda x: x - x.min()) + data.groupby(labels).map(lambda x: x - x.min()) Plotting -------- diff --git a/doc/whats-new.rst b/doc/whats-new.rst index d2a4b32a71f..6b3bfb42595 100644 --- a/doc/whats-new.rst +++ b/doc/whats-new.rst @@ -44,6 +44,13 @@ New Features option for dropping either labels or variables, but using the more specific methods is encouraged. (:pull:`3475`) By `Maximilian Roos `_ +- :py:meth:`Dataset.map` & :py:meth:`GroupBy.map` & :py:meth:`Resample.map` have been added for + mapping / applying a function over each item in the collection, reflecting the widely used + and least surprising name for this operation. + The existing ``apply`` methods remain for backward compatibility, though using the ``map`` + methods is encouraged. + (:pull:`3459`) + By `Maximilian Roos `_ - :py:meth:`Dataset.transpose` and :py:meth:`DataArray.transpose` now support an ellipsis (`...`) to represent all 'other' dimensions. For example, to move one dimension to the front, use `.transpose('x', ...)`. 
(:pull:`3421`) diff --git a/xarray/core/dataarray.py b/xarray/core/dataarray.py index 3e4c7903180..5e164f420c8 100644 --- a/xarray/core/dataarray.py +++ b/xarray/core/dataarray.py @@ -920,7 +920,7 @@ def copy(self, deep: bool = True, data: Any = None) -> "DataArray": Coordinates: * x (x) <U1 'a' 'b' 'c' [...] >>> arr.identical(roundtripped) True - See also + See Also -------- DataArray.stack """ @@ -1923,6 +1923,11 @@ def drop( """Backward compatible method based on `drop_vars` and `drop_sel` Using either `drop_vars` or `drop_sel` is encouraged + + See Also + -------- + DataArray.drop_vars + DataArray.drop_sel """ ds = self._to_temp_dataset().drop(labels, dim, errors=errors) return self._from_temp_dataset(ds) diff --git a/xarray/core/dataset.py b/xarray/core/dataset.py index 2cadc90334c..dc5a315e72a 100644 --- a/xarray/core/dataset.py +++ b/xarray/core/dataset.py @@ -3557,6 +3557,11 @@ def drop(self, labels=None, dim=None, *, errors="raise", **labels_kwargs): """Backward compatible method based on `drop_vars` and `drop_sel` Using either `drop_vars` or `drop_sel` is encouraged + + See Also + -------- + Dataset.drop_vars + Dataset.drop_sel """ if errors not in ["raise", "ignore"]: raise ValueError('errors must be either "raise" or "ignore"') @@ -4108,14 +4113,14 @@ def reduce( variables, coord_names=coord_names, attrs=attrs, indexes=indexes ) - def apply( + def map( self, func: Callable, keep_attrs: bool = None, args: Iterable[Any] = (), **kwargs: Any, ) -> "Dataset": - """Apply a function over the data variables in this dataset. + """Apply a function to each variable in this dataset Parameters ---------- @@ -4135,7 +4140,7 @@ def apply( Returns ------- applied : Dataset - Resulting dataset from applying ``func`` over each data variable. + Resulting dataset from applying ``func`` to each data variable. Examples -------- @@ -4148,7 +4153,7 @@ def apply( Data variables: foo (dim_0, dim_1) float64 -0.3751 -1.951 -1.945 0.2948 0.711 -0.3948 bar (x) int64 -1 2 - >>> ds.apply(np.fabs) + >>> ds.map(np.fabs) <xarray.Dataset> Dimensions: (dim_0: 2, dim_1: 3, x: 2) Dimensions without coordinates: dim_0, dim_1, x @@ -4165,6 +4170,27 @@ def apply( attrs = self.attrs if keep_attrs else None return type(self)(variables, attrs=attrs) + def apply( + self, + func: Callable, + keep_attrs: bool = None, + args: Iterable[Any] = (), + **kwargs: Any, + ) -> "Dataset": + """ + Backward compatible implementation of ``map`` + + See Also + -------- + Dataset.map + """ + warnings.warn( + "Dataset.apply may be deprecated in the future. 
Using Dataset.map is encouraged", + PendingDeprecationWarning, + stacklevel=2, + ) + return self.map(func, keep_attrs, args, **kwargs) + def assign( self, variables: Mapping[Hashable, Any] = None, **variables_kwargs: Hashable ) -> "Dataset": diff --git a/xarray/core/groupby.py b/xarray/core/groupby.py index c8906e34737..8ae65d9b9df 100644 --- a/xarray/core/groupby.py +++ b/xarray/core/groupby.py @@ -608,7 +608,7 @@ def assign_coords(self, coords=None, **coords_kwargs): Dataset.swap_dims """ coords_kwargs = either_dict_or_kwargs(coords, coords_kwargs, "assign_coords") - return self.apply(lambda ds: ds.assign_coords(**coords_kwargs)) + return self.map(lambda ds: ds.assign_coords(**coords_kwargs)) def _maybe_reorder(xarray_obj, dim, positions): @@ -655,8 +655,8 @@ def lookup_order(dimension): new_order = sorted(stacked.dims, key=lookup_order) return stacked.transpose(*new_order, transpose_coords=self._restore_coord_dims) - def apply(self, func, shortcut=False, args=(), **kwargs): - """Apply a function over each array in the group and concatenate them + def map(self, func, shortcut=False, args=(), **kwargs): + """Apply a function to each array in the group and concatenate them together into a new array. `func` is called like `func(ar, *args, **kwargs)` for each array `ar` @@ -702,6 +702,21 @@ def apply(self, func, shortcut=False, args=(), **kwargs): applied = (maybe_wrap_array(arr, func(arr, *args, **kwargs)) for arr in grouped) return self._combine(applied, shortcut=shortcut) + def apply(self, func, shortcut=False, args=(), **kwargs): + """ + Backward compatible implementation of ``map`` + + See Also + -------- + DataArrayGroupBy.map + """ + warnings.warn( + "GroupBy.apply may be deprecated in the future. Using GroupBy.map is encouraged", + PendingDeprecationWarning, + stacklevel=2, + ) + return self.map(func, shortcut=shortcut, args=args, **kwargs) + def _combine(self, applied, restore_coord_dims=False, shortcut=False): """Recombine the applied objects like the original.""" applied_example, applied = peek_at(applied) @@ -765,7 +780,7 @@ def quantile(self, q, dim=None, interpolation="linear", keep_attrs=None): if dim is None: dim = self._group_dim - out = self.apply( + out = self.map( self._obj.__class__.quantile, shortcut=False, q=q, @@ -820,7 +835,7 @@ def reduce_array(ar): check_reduce_dims(dim, self.dims) - return self.apply(reduce_array, shortcut=shortcut) + return self.map(reduce_array, shortcut=shortcut) ops.inject_reduce_methods(DataArrayGroupBy) @@ -828,8 +843,8 @@ def reduce_array(ar): class DatasetGroupBy(GroupBy, ImplementsDatasetReduce): - def apply(self, func, args=(), shortcut=None, **kwargs): - """Apply a function over each Dataset in the group and concatenate them + def map(self, func, args=(), shortcut=None, **kwargs): + """Apply a function to each Dataset in the group and concatenate them together into a new Dataset. `func` is called like `func(ds, *args, **kwargs)` for each dataset `ds` @@ -862,6 +877,22 @@ def apply(self, func, args=(), shortcut=None, **kwargs): applied = (func(ds, *args, **kwargs) for ds in self._iter_grouped()) return self._combine(applied) + def apply(self, func, args=(), shortcut=None, **kwargs): + """ + Backward compatible implementation of ``map`` + + See Also + -------- + DatasetGroupBy.map + """ + + warnings.warn( + "GroupBy.apply may be deprecated in the future. 
Using GroupBy.map is encouraged", + PendingDeprecationWarning, + stacklevel=2, + ) + return self.map(func, shortcut=shortcut, args=args, **kwargs) + def _combine(self, applied): """Recombine the applied objects like the original.""" applied_example, applied = peek_at(applied) @@ -914,7 +945,7 @@ def reduce_dataset(ds): check_reduce_dims(dim, self.dims) - return self.apply(reduce_dataset) + return self.map(reduce_dataset) def assign(self, **kwargs): """Assign data variables by group. @@ -923,7 +954,7 @@ def assign(self, **kwargs): -------- Dataset.assign """ - return self.apply(lambda ds: ds.assign(**kwargs)) + return self.map(lambda ds: ds.assign(**kwargs)) ops.inject_reduce_methods(DatasetGroupBy) diff --git a/xarray/core/resample.py b/xarray/core/resample.py index 2cb1bd55e19..fb388490d06 100644 --- a/xarray/core/resample.py +++ b/xarray/core/resample.py @@ -1,3 +1,5 @@ +import warnings + from . import ops from .groupby import DataArrayGroupBy, DatasetGroupBy @@ -173,8 +175,8 @@ def __init__(self, *args, dim=None, resample_dim=None, **kwargs): super().__init__(*args, **kwargs) - def apply(self, func, shortcut=False, args=(), **kwargs): - """Apply a function over each array in the group and concatenate them + def map(self, func, shortcut=False, args=(), **kwargs): + """Apply a function to each array in the group and concatenate them together into a new array. `func` is called like `func(ar, *args, **kwargs)` for each array `ar` @@ -212,7 +214,9 @@ def apply(self, func, shortcut=False, args=(), **kwargs): applied : DataArray or DataArray The result of splitting, applying and combining this array. """ - combined = super().apply(func, shortcut=shortcut, args=args, **kwargs) + # TODO: the argument order for Resample doesn't match that for its parent, + # GroupBy + combined = super().map(func, shortcut=shortcut, args=args, **kwargs) # If the aggregation function didn't drop the original resampling # dimension, then we need to do so before we can rename the proxy @@ -225,6 +229,21 @@ def apply(self, func, shortcut=False, args=(), **kwargs): return combined + def apply(self, func, args=(), shortcut=None, **kwargs): + """ + Backward compatible implementation of ``map`` + + See Also + -------- + DataArrayResample.map + """ + warnings.warn( + "Resample.apply may be deprecated in the future. Using Resample.map is encouraged", + PendingDeprecationWarning, + stacklevel=2, + ) + return self.map(func=func, shortcut=shortcut, args=args, **kwargs) + ops.inject_reduce_methods(DataArrayResample) ops.inject_binary_ops(DataArrayResample) @@ -247,7 +266,7 @@ def __init__(self, *args, dim=None, resample_dim=None, **kwargs): super().__init__(*args, **kwargs) - def apply(self, func, args=(), shortcut=None, **kwargs): + def map(self, func, args=(), shortcut=None, **kwargs): """Apply a function over each Dataset in the groups generated for resampling and concatenate them together into a new Dataset. @@ -282,6 +301,22 @@ def apply(self, func, args=(), shortcut=None, **kwargs): return combined.rename({self._resample_dim: self._dim}) + def apply(self, func, args=(), shortcut=None, **kwargs): + """ + Backward compatible implementation of ``map`` + + See Also + -------- + DataSetResample.map + """ + + warnings.warn( + "Resample.apply may be deprecated in the future. 
Using Resample.map is encouraged", + PendingDeprecationWarning, + stacklevel=2, + ) + return self.map(func=func, shortcut=shortcut, args=args, **kwargs) + def reduce(self, func, dim=None, keep_attrs=None, **kwargs): """Reduce the items in this group by applying `func` along the pre-defined resampling dimension. diff --git a/xarray/tests/test_dataarray.py b/xarray/tests/test_dataarray.py index acfe684d220..42fae2c9dd4 100644 --- a/xarray/tests/test_dataarray.py +++ b/xarray/tests/test_dataarray.py @@ -2417,7 +2417,7 @@ def test_groupby_properties(self): assert_array_equal(expected_groups[key], grouped.groups[key]) assert 3 == len(grouped) - def test_groupby_apply_identity(self): + def test_groupby_map_identity(self): expected = self.make_groupby_example_array() idx = expected.coords["y"] @@ -2428,7 +2428,7 @@ def identity(x): for shortcut in [False, True]: for squeeze in [False, True]: grouped = expected.groupby(g, squeeze=squeeze) - actual = grouped.apply(identity, shortcut=shortcut) + actual = grouped.map(identity, shortcut=shortcut) assert_identical(expected, actual) def test_groupby_sum(self): @@ -2461,7 +2461,7 @@ def test_groupby_sum(self): [["a", "b", "c"]], ["abc"], ) - actual = array["y"].groupby("abc").apply(np.sum) + actual = array["y"].groupby("abc").map(np.sum) assert_allclose(expected, actual) actual = array["y"].groupby("abc").sum(...) assert_allclose(expected, actual) @@ -2532,7 +2532,7 @@ def test_groupby_reduce_attrs(self): expected.attrs["foo"] = "bar" assert_identical(expected, actual) - def test_groupby_apply_center(self): + def test_groupby_map_center(self): def center(x): return x - np.mean(x) @@ -2545,16 +2545,16 @@ def center(x): ) expected_ds["foo"] = (["x", "y"], exp_data) expected_centered = expected_ds["foo"] - assert_allclose(expected_centered, grouped.apply(center)) + assert_allclose(expected_centered, grouped.map(center)) - def test_groupby_apply_ndarray(self): + def test_groupby_map_ndarray(self): # regression test for #326 array = self.make_groupby_example_array() grouped = array.groupby("abc") - actual = grouped.apply(np.asarray) + actual = grouped.map(np.asarray) assert_equal(array, actual) - def test_groupby_apply_changes_metadata(self): + def test_groupby_map_changes_metadata(self): def change_metadata(x): x.coords["x"] = x.coords["x"] * 2 x.attrs["fruit"] = "lemon" @@ -2562,7 +2562,7 @@ def change_metadata(x): array = self.make_groupby_example_array() grouped = array.groupby("abc") - actual = grouped.apply(change_metadata) + actual = grouped.map(change_metadata) expected = array.copy() expected = change_metadata(expected) assert_equal(expected, actual) @@ -2631,7 +2631,7 @@ def test_groupby_restore_dim_order(self): ("a", ("a", "y")), ("b", ("x", "b")), ]: - result = array.groupby(by).apply(lambda x: x.squeeze()) + result = array.groupby(by).map(lambda x: x.squeeze()) assert result.dims == expected_dims def test_groupby_restore_coord_dims(self): @@ -2651,13 +2651,13 @@ def test_groupby_restore_coord_dims(self): ("a", ("a", "y")), ("b", ("x", "b")), ]: - result = array.groupby(by, restore_coord_dims=True).apply( + result = array.groupby(by, restore_coord_dims=True).map( lambda x: x.squeeze() )["c"] assert result.dims == expected_dims with pytest.warns(FutureWarning): - array.groupby("x").apply(lambda x: x.squeeze()) + array.groupby("x").map(lambda x: x.squeeze()) def test_groupby_first_and_last(self): array = DataArray([1, 2, 3, 4, 5], dims="x") @@ -2699,9 +2699,9 @@ def test_groupby_multidim(self): actual_sum = array.groupby(dim).sum(...) 
assert_identical(expected_sum, actual_sum) - def test_groupby_multidim_apply(self): + def test_groupby_multidim_map(self): array = self.make_groupby_multidim_example_array() - actual = array.groupby("lon").apply(lambda x: x - x.mean()) + actual = array.groupby("lon").map(lambda x: x - x.mean()) expected = DataArray( [[[-2.5, -6.0], [-5.0, -8.5]], [[2.5, 3.0], [8.0, 8.5]]], coords=array.coords, @@ -2722,7 +2722,7 @@ def test_groupby_bins(self): ) # the problem with this is that it overwrites the dimensions of array! # actual = array.groupby('dim_0', bins=bins).sum() - actual = array.groupby_bins("dim_0", bins).apply(lambda x: x.sum()) + actual = array.groupby_bins("dim_0", bins).map(lambda x: x.sum()) assert_identical(expected, actual) # make sure original array dims are unchanged assert len(array.dim_0) == 4 @@ -2744,12 +2744,12 @@ def test_groupby_bins_multidim(self): bins = [0, 15, 20] bin_coords = pd.cut(array["lat"].values.flat, bins).categories expected = DataArray([16, 40], dims="lat_bins", coords={"lat_bins": bin_coords}) - actual = array.groupby_bins("lat", bins).apply(lambda x: x.sum()) + actual = array.groupby_bins("lat", bins).map(lambda x: x.sum()) assert_identical(expected, actual) # modify the array coordinates to be non-monotonic after unstacking array["lat"].data = np.array([[10.0, 20.0], [20.0, 10.0]]) expected = DataArray([28, 28], dims="lat_bins", coords={"lat_bins": bin_coords}) - actual = array.groupby_bins("lat", bins).apply(lambda x: x.sum()) + actual = array.groupby_bins("lat", bins).map(lambda x: x.sum()) assert_identical(expected, actual) def test_groupby_bins_sort(self): @@ -2784,7 +2784,7 @@ def func(arg1, arg2, arg3=0.0): times = pd.date_range("2000", periods=3, freq="D") da = xr.DataArray([1.0, 1.0, 1.0], coords=[times], dims=["time"]) expected = xr.DataArray([3.0, 3.0, 3.0], coords=[times], dims=["time"]) - actual = da.resample(time="D").apply(func, args=(1.0,), arg3=1.0) + actual = da.resample(time="D").map(func, args=(1.0,), arg3=1.0) assert_identical(actual, expected) def test_resample_first(self): diff --git a/xarray/tests/test_dataset.py b/xarray/tests/test_dataset.py index 50e78c9f685..d001c43da94 100644 --- a/xarray/tests/test_dataset.py +++ b/xarray/tests/test_dataset.py @@ -3310,17 +3310,17 @@ def identity(x): return x for k in ["x", "c", "y"]: - actual = data.groupby(k, squeeze=False).apply(identity) + actual = data.groupby(k, squeeze=False).map(identity) assert_equal(data, actual) def test_groupby_returns_new_type(self): data = Dataset({"z": (["x", "y"], np.random.randn(3, 5))}) - actual = data.groupby("x").apply(lambda ds: ds["z"]) + actual = data.groupby("x").map(lambda ds: ds["z"]) expected = data["z"] assert_identical(expected, actual) - actual = data["z"].groupby("x").apply(lambda x: x.to_dataset()) + actual = data["z"].groupby("x").map(lambda x: x.to_dataset()) expected = data assert_identical(expected, actual) @@ -3639,7 +3639,7 @@ def func(arg1, arg2, arg3=0.0): times = pd.date_range("2000", freq="D", periods=3) ds = xr.Dataset({"foo": ("time", [1.0, 1.0, 1.0]), "time": times}) expected = xr.Dataset({"foo": ("time", [3.0, 3.0, 3.0]), "time": times}) - actual = ds.resample(time="D").apply(func, args=(1.0,), arg3=1.0) + actual = ds.resample(time="D").map(func, args=(1.0,), arg3=1.0) assert_identical(expected, actual) def test_to_array(self): @@ -4515,31 +4515,36 @@ def test_count(self): actual = ds.count() assert_identical(expected, actual) - def test_apply(self): + def test_map(self): data = create_test_data() data.attrs["foo"] = "bar" - 
assert_identical(data.apply(np.mean), data.mean()) + assert_identical(data.map(np.mean), data.mean()) expected = data.mean(keep_attrs=True) - actual = data.apply(lambda x: x.mean(keep_attrs=True), keep_attrs=True) + actual = data.map(lambda x: x.mean(keep_attrs=True), keep_attrs=True) assert_identical(expected, actual) - assert_identical( - data.apply(lambda x: x, keep_attrs=True), data.drop_vars("time") - ) + assert_identical(data.map(lambda x: x, keep_attrs=True), data.drop_vars("time")) def scale(x, multiple=1): return multiple * x - actual = data.apply(scale, multiple=2) + actual = data.map(scale, multiple=2) assert_equal(actual["var1"], 2 * data["var1"]) assert_identical(actual["numbers"], data["numbers"]) - actual = data.apply(np.asarray) + actual = data.map(np.asarray) expected = data.drop_vars("time") # time is not used on a data var assert_equal(expected, actual) + def test_apply_pending_deprecated_map(self): + data = create_test_data() + data.attrs["foo"] = "bar" + + with pytest.warns(PendingDeprecationWarning): + assert_identical(data.apply(np.mean), data.mean()) + def make_example_math_dataset(self): variables = { "bar": ("x", np.arange(100, 400, 100)), @@ -4566,15 +4571,15 @@ def test_dataset_number_math(self): def test_unary_ops(self): ds = self.make_example_math_dataset() - assert_identical(ds.apply(abs), abs(ds)) - assert_identical(ds.apply(lambda x: x + 4), ds + 4) + assert_identical(ds.map(abs), abs(ds)) + assert_identical(ds.map(lambda x: x + 4), ds + 4) for func in [ lambda x: x.isnull(), lambda x: x.round(), lambda x: x.astype(int), ]: - assert_identical(ds.apply(func), func(ds)) + assert_identical(ds.map(func), func(ds)) assert_identical(ds.isnull(), ~ds.notnull()) @@ -4587,7 +4592,7 @@ def test_unary_ops(self): def test_dataset_array_math(self): ds = self.make_example_math_dataset() - expected = ds.apply(lambda x: x - ds["foo"]) + expected = ds.map(lambda x: x - ds["foo"]) assert_identical(expected, ds - ds["foo"]) assert_identical(expected, -ds["foo"] + ds) assert_identical(expected, ds - ds["foo"].variable) @@ -4596,7 +4601,7 @@ def test_dataset_array_math(self): actual -= ds["foo"] assert_identical(expected, actual) - expected = ds.apply(lambda x: x + ds["bar"]) + expected = ds.map(lambda x: x + ds["bar"]) assert_identical(expected, ds + ds["bar"]) actual = ds.copy(deep=True) actual += ds["bar"] @@ -4612,7 +4617,7 @@ def test_dataset_dataset_math(self): assert_identical(ds, ds + 0 * ds) assert_identical(ds, ds + {"foo": 0, "bar": 0}) - expected = ds.apply(lambda x: 2 * x) + expected = ds.map(lambda x: 2 * x) assert_identical(expected, 2 * ds) assert_identical(expected, ds + ds) assert_identical(expected, ds + ds.data_vars) @@ -4709,7 +4714,7 @@ def test_dataset_transpose(self): assert_identical(expected, actual) actual = ds.transpose("x", "y") - expected = ds.apply(lambda x: x.transpose("x", "y", transpose_coords=True)) + expected = ds.map(lambda x: x.transpose("x", "y", transpose_coords=True)) assert_identical(expected, actual) ds = create_test_data() diff --git a/xarray/tests/test_groupby.py b/xarray/tests/test_groupby.py index e2216547ac8..581affa3471 100644 --- a/xarray/tests/test_groupby.py +++ b/xarray/tests/test_groupby.py @@ -45,14 +45,14 @@ def test_groupby_dims_property(dataset): assert stacked.groupby("xy").dims == stacked.isel(xy=0).dims -def test_multi_index_groupby_apply(dataset): +def test_multi_index_groupby_map(dataset): # regression test for GH873 ds = dataset.isel(z=1, drop=True)[["foo"]] expected = 2 * ds actual = ( ds.stack(space=["x", "y"]) 
.groupby("space") - .apply(lambda x: 2 * x) + .map(lambda x: 2 * x) .unstack("space") ) assert_equal(expected, actual) @@ -107,23 +107,23 @@ def test_groupby_input_mutation(): assert_identical(array, array_copy) # should not modify inputs -def test_da_groupby_apply_func_args(): +def test_da_groupby_map_func_args(): def func(arg1, arg2, arg3=0): return arg1 + arg2 + arg3 array = xr.DataArray([1, 1, 1], [("x", [1, 2, 3])]) expected = xr.DataArray([3, 3, 3], [("x", [1, 2, 3])]) - actual = array.groupby("x").apply(func, args=(1,), arg3=1) + actual = array.groupby("x").map(func, args=(1,), arg3=1) assert_identical(expected, actual) -def test_ds_groupby_apply_func_args(): +def test_ds_groupby_map_func_args(): def func(arg1, arg2, arg3=0): return arg1 + arg2 + arg3 dataset = xr.Dataset({"foo": ("x", [1, 1, 1])}, {"x": [1, 2, 3]}) expected = xr.Dataset({"foo": ("x", [3, 3, 3])}, {"x": [1, 2, 3]}) - actual = dataset.groupby("x").apply(func, args=(1,), arg3=1) + actual = dataset.groupby("x").map(func, args=(1,), arg3=1) assert_identical(expected, actual) @@ -285,7 +285,7 @@ def test_groupby_drops_nans(): expected.variable.values[0, 0, :] = np.nan expected.variable.values[-1, -1, :] = np.nan expected.variable.values[3, 0, :] = np.nan - actual = grouped.apply(lambda x: x).transpose(*ds.variable.dims) + actual = grouped.map(lambda x: x).transpose(*ds.variable.dims) assert_identical(actual, expected) # reduction along grouped dimension diff --git a/xarray/tests/test_sparse.py b/xarray/tests/test_sparse.py index 8e2d4b8e064..a31da162487 100644 --- a/xarray/tests/test_sparse.py +++ b/xarray/tests/test_sparse.py @@ -339,7 +339,7 @@ def test_dataarray_property(prop): (do("copy"), True), (do("count"), False), (do("diff", "x"), True), - (do("drop", "x"), True), + (do("drop_vars", "x"), True), (do("expand_dims", {"z": 2}, axis=2), True), (do("get_axis_num", "x"), False), (do("get_index", "x"), False), From 897b5d153a9fc12c072f8f6d8fa07f8deec679d3 Mon Sep 17 00:00:00 2001 From: Maximilian Roos <5635139+max-sixty@users.noreply.github.com> Date: Sat, 9 Nov 2019 16:30:32 -0500 Subject: [PATCH 03/22] remove syntax warning (#3505) --- xarray/plot/plot.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/xarray/plot/plot.py b/xarray/plot/plot.py index ca68f617144..5c754c3f49b 100644 --- a/xarray/plot/plot.py +++ b/xarray/plot/plot.py @@ -288,7 +288,7 @@ def line( ) # The allargs dict passed to _easy_facetgrid above contains args - if args is (): + if args == (): args = kwargs.pop("args", ()) else: assert "args" not in kwargs From 7ace0d580c09894e35f111f4adc04a3c845b01d5 Mon Sep 17 00:00:00 2001 From: Maximilian Roos <5635139+max-sixty@users.noreply.github.com> Date: Sat, 9 Nov 2019 17:28:52 -0500 Subject: [PATCH 04/22] add drop_sel, drop_vars, map to api.rst (#3506) --- doc/api.rst | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/doc/api.rst b/doc/api.rst index b176e3d5e3f..d2309f28226 100644 --- a/doc/api.rst +++ b/doc/api.rst @@ -94,7 +94,7 @@ Dataset contents Dataset.rename_dims Dataset.swap_dims Dataset.expand_dims - Dataset.drop + Dataset.drop_vars Dataset.drop_dims Dataset.set_coords Dataset.reset_coords @@ -118,6 +118,7 @@ Indexing Dataset.loc Dataset.isel Dataset.sel + Dataset.drop_sel Dataset.head Dataset.tail Dataset.thin @@ -154,7 +155,7 @@ Computation .. 
autosummary:: :toctree: generated/ - Dataset.apply + Dataset.map Dataset.reduce Dataset.groupby Dataset.groupby_bins @@ -263,7 +264,7 @@ DataArray contents DataArray.rename DataArray.swap_dims DataArray.expand_dims - DataArray.drop + DataArray.drop_vars DataArray.reset_coords DataArray.copy @@ -283,6 +284,7 @@ Indexing DataArray.loc DataArray.isel DataArray.sel + DataArray.drop_sel DataArray.head DataArray.tail DataArray.thin @@ -542,10 +544,10 @@ GroupBy objects :toctree: generated/ core.groupby.DataArrayGroupBy - core.groupby.DataArrayGroupBy.apply + core.groupby.DataArrayGroupBy.map core.groupby.DataArrayGroupBy.reduce core.groupby.DatasetGroupBy - core.groupby.DatasetGroupBy.apply + core.groupby.DatasetGroupBy.map core.groupby.DatasetGroupBy.reduce Rolling objects @@ -566,7 +568,7 @@ Resample objects ================ Resample objects also implement the GroupBy interface -(methods like ``apply()``, ``reduce()``, ``mean()``, ``sum()``, etc.). +(methods like ``map()``, ``reduce()``, ``mean()``, ``sum()``, etc.). .. autosummary:: :toctree: generated/ From f14edf326d41ce96be5d8fd42d56e80e3faf5ce7 Mon Sep 17 00:00:00 2001 From: Christopher Whelan Date: Sat, 9 Nov 2019 17:19:16 -0800 Subject: [PATCH 05/22] DOC: update bottleneck repo url (#3507) --- doc/computation.rst | 2 +- doc/dask.rst | 2 +- doc/installing.rst | 2 +- doc/whats-new.rst | 2 +- setup.cfg | 2 +- 5 files changed, 5 insertions(+), 5 deletions(-) diff --git a/doc/computation.rst b/doc/computation.rst index d477cb63d72..663c546be20 100644 --- a/doc/computation.rst +++ b/doc/computation.rst @@ -183,7 +183,7 @@ a value when aggregating: Note that rolling window aggregations are faster and use less memory when bottleneck_ is installed. This only applies to numpy-backed xarray objects. -.. _bottleneck: https://github.com/kwgoodman/bottleneck/ +.. _bottleneck: https://github.com/pydata/bottleneck/ We can also manually iterate through ``Rolling`` objects: diff --git a/doc/dask.rst b/doc/dask.rst index 5bdbf779463..11f378aa376 100644 --- a/doc/dask.rst +++ b/doc/dask.rst @@ -292,7 +292,7 @@ For the best performance when using Dask's multi-threaded scheduler, wrap a function that already releases the global interpreter lock, which fortunately already includes most NumPy and Scipy functions. Here we show an example using NumPy operations and a fast function from -`bottleneck `__, which +`bottleneck `__, which we use to calculate `Spearman's rank-correlation coefficient `__: .. code-block:: python diff --git a/doc/installing.rst b/doc/installing.rst index 0c5e8916ca3..219cf109efe 100644 --- a/doc/installing.rst +++ b/doc/installing.rst @@ -43,7 +43,7 @@ For accelerating xarray - `scipy `__: necessary to enable the interpolation features for xarray objects -- `bottleneck `__: speeds up +- `bottleneck `__: speeds up NaN-skipping and rolling window aggregations by a large factor - `numbagg `_: for exponential rolling window operations diff --git a/doc/whats-new.rst b/doc/whats-new.rst index 6b3bfb42595..d9c85a19ce6 100644 --- a/doc/whats-new.rst +++ b/doc/whats-new.rst @@ -3736,7 +3736,7 @@ Breaking changes warnings: methods and attributes that were deprecated in xray v0.3 or earlier (e.g., ``dimensions``, ``attributes```) have gone away. -.. _bottleneck: https://github.com/kwgoodman/bottleneck +.. 
_bottleneck: https://github.com/pydata/bottleneck Enhancements ~~~~~~~~~~~~ diff --git a/setup.cfg b/setup.cfg index fec2ca6bbe4..21158e3b0ee 100644 --- a/setup.cfg +++ b/setup.cfg @@ -1,7 +1,7 @@ [tool:pytest] python_files=test_*.py testpaths=xarray/tests properties -# Fixed upstream in https://github.com/kwgoodman/bottleneck/pull/199 +# Fixed upstream in https://github.com/pydata/bottleneck/pull/199 filterwarnings = ignore:Using a non-tuple sequence for multidimensional indexing is deprecated:FutureWarning env = From 4e9240a2087ffbf119919e1ac98046bbf164f94d Mon Sep 17 00:00:00 2001 From: keewis Date: Mon, 11 Nov 2019 00:41:51 +0100 Subject: [PATCH 06/22] add missing pint integration tests (#3508) * add tests for broadcast_like * add tests for DataArray head / tail / thin * update whats-new.rst --- doc/whats-new.rst | 2 +- xarray/tests/test_units.py | 107 +++++++++++++++++++++++++++++++++++++ 2 files changed, 108 insertions(+), 1 deletion(-) diff --git a/doc/whats-new.rst b/doc/whats-new.rst index d9c85a19ce6..96f0ba9a4a6 100644 --- a/doc/whats-new.rst +++ b/doc/whats-new.rst @@ -111,7 +111,7 @@ Internal Changes ~~~~~~~~~~~~~~~~ - Added integration tests against `pint `_. - (:pull:`3238`, :pull:`3447`) by `Justus Magin `_. + (:pull:`3238`, :pull:`3447`, :pull:`3508`) by `Justus Magin `_. .. note:: diff --git a/xarray/tests/test_units.py b/xarray/tests/test_units.py index 8eed1f0dbe3..fd9e9b039ac 100644 --- a/xarray/tests/test_units.py +++ b/xarray/tests/test_units.py @@ -1045,6 +1045,36 @@ def test_comparisons(self, func, variation, unit, dtype): assert expected == result + @pytest.mark.xfail(reason="blocked by `where`") + @pytest.mark.parametrize( + "unit", + ( + pytest.param(1, id="no_unit"), + pytest.param(unit_registry.dimensionless, id="dimensionless"), + pytest.param(unit_registry.s, id="incompatible_unit"), + pytest.param(unit_registry.cm, id="compatible_unit"), + pytest.param(unit_registry.m, id="identical_unit"), + ), + ) + def test_broadcast_like(self, unit, dtype): + array1 = np.linspace(1, 2, 2 * 1).reshape(2, 1).astype(dtype) * unit_registry.Pa + array2 = np.linspace(0, 1, 2 * 3).reshape(2, 3).astype(dtype) * unit_registry.Pa + + x1 = np.arange(2) * unit_registry.m + x2 = np.arange(2) * unit + y1 = np.array([0]) * unit_registry.m + y2 = np.arange(3) * unit + + arr1 = xr.DataArray(data=array1, coords={"x": x1, "y": y1}, dims=("x", "y")) + arr2 = xr.DataArray(data=array2, coords={"x": x2, "y": y2}, dims=("x", "y")) + + expected = attach_units( + strip_units(arr1).broadcast_like(strip_units(arr2)), extract_units(arr1) + ) + result = arr1.broadcast_like(arr2) + + assert_equal_with_units(expected, result) + @pytest.mark.parametrize( "unit", ( @@ -1303,6 +1333,49 @@ def test_squeeze(self, shape, dtype): np.squeeze(array, axis=index), data_array.squeeze(dim=name) ) + @pytest.mark.xfail( + reason="indexes strip units and head / tail / thin only support integers" + ) + @pytest.mark.parametrize( + "unit,error", + ( + pytest.param(1, DimensionalityError, id="no_unit"), + pytest.param( + unit_registry.dimensionless, DimensionalityError, id="dimensionless" + ), + pytest.param(unit_registry.s, DimensionalityError, id="incompatible_unit"), + pytest.param(unit_registry.cm, None, id="compatible_unit"), + pytest.param(unit_registry.m, None, id="identical_unit"), + ), + ) + @pytest.mark.parametrize( + "func", + (method("head", x=7, y=3), method("tail", x=7, y=3), method("thin", x=7, y=3)), + ids=repr, + ) + def test_head_tail_thin(self, func, unit, error, dtype): + array = np.linspace(1, 2, 10 
* 5).reshape(10, 5) * unit_registry.degK + + coords = { + "x": np.arange(10) * unit_registry.m, + "y": np.arange(5) * unit_registry.m, + } + + arr = xr.DataArray(data=array, coords=coords, dims=("x", "y")) + + kwargs = {name: value * unit for name, value in func.kwargs.items()} + + if error is not None: + with pytest.raises(error): + func(arr, **kwargs) + + return + + expected = attach_units(func(strip_units(arr)), extract_units(arr)) + result = func(arr, **kwargs) + + assert_equal_with_units(expected, result) + @pytest.mark.parametrize( "unit,error", ( @@ -2472,6 +2545,40 @@ def test_comparisons(self, func, variation, unit, dtype): assert expected == result + @pytest.mark.xfail(reason="blocked by `where`") + @pytest.mark.parametrize( + "unit", + ( + pytest.param(1, id="no_unit"), + pytest.param(unit_registry.dimensionless, id="dimensionless"), + pytest.param(unit_registry.s, id="incompatible_unit"), + pytest.param(unit_registry.cm, id="compatible_unit"), + pytest.param(unit_registry.m, id="identical_unit"), + ), + ) + def test_broadcast_like(self, unit, dtype): + array1 = np.linspace(1, 2, 2 * 1).reshape(2, 1).astype(dtype) * unit_registry.Pa + array2 = np.linspace(0, 1, 2 * 3).reshape(2, 3).astype(dtype) * unit_registry.Pa + + x1 = np.arange(2) * unit_registry.m + x2 = np.arange(2) * unit + y1 = np.array([0]) * unit_registry.m + y2 = np.arange(3) * unit + + ds1 = xr.Dataset( + data_vars={"a": (("x", "y"), array1)}, coords={"x": x1, "y": y1} + ) + ds2 = xr.Dataset( + data_vars={"a": (("x", "y"), array2)}, coords={"x": x2, "y": y2} + ) + + expected = attach_units( + strip_units(ds1).broadcast_like(strip_units(ds2)), extract_units(ds1) + ) + result = ds1.broadcast_like(ds2) + + assert_equal_with_units(expected, result) + @pytest.mark.parametrize( "unit", ( From b74f80ca2df4920f711f9fe5762458c53ce3c2c6 Mon Sep 17 00:00:00 2001 From: Maximilian Roos <5635139+max-sixty@users.noreply.github.com> Date: Mon, 11 Nov 2019 23:31:38 -0500 Subject: [PATCH 07/22] format indexing.rst code with black (#3511) --- doc/indexing.rst | 138 ++++++++++++++++++++++++++++------------------- 1 file changed, 82 insertions(+), 56 deletions(-) diff --git a/doc/indexing.rst b/doc/indexing.rst index ace960689a8..e8482ac66b3 100644 --- a/doc/indexing.rst +++ b/doc/indexing.rst @@ -209,12 +209,16 @@ simultaneously, returning a new dataset: .. ipython:: python - da = xr.DataArray(np.random.rand(4, 3), - [('time', pd.date_range('2000-01-01', periods=4)), - ('space', ['IA', 'IL', 'IN'])]) - ds = da.to_dataset(name='foo') + da = xr.DataArray( + np.random.rand(4, 3), + [ + ("time", pd.date_range("2000-01-01", periods=4)), + ("space", ["IA", "IL", "IN"]), + ], + ) + ds = da.to_dataset(name="foo") ds.isel(space=[0], time=[0]) - ds.sel(time='2000-01-01') + ds.sel(time="2000-01-01") Positional indexing on a dataset is not supported because the ordering of dimensions in a dataset is somewhat ambiguous (it can vary between different @@ -222,7 +226,6 @@ arrays). However, you can do normal indexing with dimension names: .. ipython:: python - ds[dict(space=[0], time=[0])] ds.loc[dict(time='2000-01-01')] @@ -248,7 +251,6 @@ Any variables with these dimensions are also dropped: ds.drop_dims('time') - .. _masking with where: Masking with ``where`` @@ -326,8 +328,12 @@ MATLAB, or after using the :py:func:`numpy.ix_` helper: .. 
ipython:: python - da = xr.DataArray(np.arange(12).reshape((3, 4)), dims=['x', 'y'], - coords={'x': [0, 1, 2], 'y': ['a', 'b', 'c', 'd']}) + + da = xr.DataArray( + np.arange(12).reshape((3, 4)), + dims=["x", "y"], + coords={"x": [0, 1, 2], "y": ["a", "b", "c", "d"]}, + ) da da[[0, 1], [1, 1]] @@ -410,43 +416,56 @@ can use indexing with ``.loc`` : .. ipython:: python - ds = xr.tutorial.open_dataset('air_temperature') + ds = xr.tutorial.open_dataset("air_temperature") - #add an empty 2D dataarray - ds['empty']= xr.full_like(ds.air.mean('time'),fill_value=0) + # add an empty 2D dataarray + ds["empty"] = xr.full_like(ds.air.mean("time"), fill_value=0) - #modify one grid point using loc() - ds['empty'].loc[dict(lon=260, lat=30)] = 100 + # modify one grid point using loc() + ds["empty"].loc[dict(lon=260, lat=30)] = 100 - #modify a 2D region using loc() - lc = ds.coords['lon'] - la = ds.coords['lat'] - ds['empty'].loc[dict(lon=lc[(lc>220)&(lc<260)], lat=la[(la>20)&(la<60)])] = 100 + # modify a 2D region using loc() + lc = ds.coords["lon"] + la = ds.coords["lat"] + ds["empty"].loc[ + dict(lon=lc[(lc > 220) & (lc < 260)], lat=la[(la > 20) & (la < 60)]) + ] = 100 or :py:meth:`~xarray.where`: .. ipython:: python - #modify one grid point using xr.where() - ds['empty'] = xr.where((ds.coords['lat']==20)&(ds.coords['lon']==260), 100, ds['empty']) + # modify one grid point using xr.where() + ds["empty"] = xr.where( + (ds.coords["lat"] == 20) & (ds.coords["lon"] == 260), 100, ds["empty"] + ) + + # or modify a 2D region using xr.where() + mask = ( + (ds.coords["lat"] > 20) + & (ds.coords["lat"] < 60) + & (ds.coords["lon"] > 220) + & (ds.coords["lon"] < 260) + ) + ds["empty"] = xr.where(mask, 100, ds["empty"]) - #or modify a 2D region using xr.where() - mask = (ds.coords['lat']>20)&(ds.coords['lat']<60)&(ds.coords['lon']>220)&(ds.coords['lon']<260) - ds['empty'] = xr.where(mask, 100, ds['empty']) Vectorized indexing can also be used to assign values to xarray object. .. ipython:: python - da = xr.DataArray(np.arange(12).reshape((3, 4)), dims=['x', 'y'], - coords={'x': [0, 1, 2], 'y': ['a', 'b', 'c', 'd']}) + da = xr.DataArray( + np.arange(12).reshape((3, 4)), + dims=["x", "y"], + coords={"x": [0, 1, 2], "y": ["a", "b", "c", "d"]}, + ) da da[0] = -1 # assignment with broadcasting da - ind_x = xr.DataArray([0, 1], dims=['x']) - ind_y = xr.DataArray([0, 1], dims=['y']) + ind_x = xr.DataArray([0, 1], dims=["x"]) + ind_y = xr.DataArray([0, 1], dims=["y"]) da[ind_x, ind_y] = -2 # assign -2 to (ix, iy) = (0, 0) and (1, 1) da @@ -508,10 +527,10 @@ flexible indexing. The following is an example of the pointwise indexing: .. ipython:: python - da = xr.DataArray(np.arange(56).reshape((7, 8)), dims=['x', 'y']) + da = xr.DataArray(np.arange(56).reshape((7, 8)), dims=["x", "y"]) da - da.isel(x=xr.DataArray([0, 1, 6], dims='z'), - y=xr.DataArray([0, 1, 0], dims='z')) + da.isel(x=xr.DataArray([0, 1, 6], dims="z"), y=xr.DataArray([0, 1, 0], dims="z")) + where three elements at ``(ix, iy) = ((0, 0), (1, 1), (6, 0))`` are selected and mapped along a new dimension ``z``. @@ -521,23 +540,27 @@ you can supply a :py:class:`~xarray.DataArray` with a coordinate, .. ipython:: python - da.isel(x=xr.DataArray([0, 1, 6], dims='z', - coords={'z': ['a', 'b', 'c']}), - y=xr.DataArray([0, 1, 0], dims='z')) - + da.isel( + x=xr.DataArray([0, 1, 6], dims="z", coords={"z": ["a", "b", "c"]}), + y=xr.DataArray([0, 1, 0], dims="z"), + ) + Analogously, label-based pointwise-indexing is also possible by the ``.sel`` method: .. 
ipython:: python - da = xr.DataArray(np.random.rand(4, 3), - [('time', pd.date_range('2000-01-01', periods=4)), - ('space', ['IA', 'IL', 'IN'])]) - times = xr.DataArray(pd.to_datetime(['2000-01-03', '2000-01-02', '2000-01-01']), - dims='new_time') - da.sel(space=xr.DataArray(['IA', 'IL', 'IN'], dims=['new_time']), - time=times) - + da = xr.DataArray( + np.random.rand(4, 3), + [ + ("time", pd.date_range("2000-01-01", periods=4)), + ("space", ["IA", "IL", "IN"]), + ], + ) + times = xr.DataArray( + pd.to_datetime(["2000-01-03", "2000-01-02", "2000-01-01"]), dims="new_time" + ) + da.sel(space=xr.DataArray(["IA", "IL", "IN"], dims=["new_time"]), time=times) .. _align and reindex: @@ -635,12 +658,16 @@ through the :py:attr:`~xarray.DataArray.indexes` attribute. .. ipython:: python - da = xr.DataArray(np.random.rand(4, 3), - [('time', pd.date_range('2000-01-01', periods=4)), - ('space', ['IA', 'IL', 'IN'])]) + da = xr.DataArray( + np.random.rand(4, 3), + [ + ("time", pd.date_range("2000-01-01", periods=4)), + ("space", ["IA", "IL", "IN"]), + ], + ) da da.indexes - da.indexes['time'] + da.indexes["time"] Use :py:meth:`~xarray.DataArray.get_index` to get an index for a dimension, falling back to a default :py:class:`pandas.RangeIndex` if it has no coordinate @@ -694,32 +721,31 @@ pandas: .. ipython:: python - midx = pd.MultiIndex.from_product([list('abc'), [0, 1]], - names=('one', 'two')) - mda = xr.DataArray(np.random.rand(6, 3), - [('x', midx), ('y', range(3))]) - mda - mda.sel(x=(list('ab'), [0])) + + midx = pd.MultiIndex.from_product([list("abc"), [0, 1]], names=("one", "two")) + mda = xr.DataArray(np.random.rand(6, 3), [("x", midx), ("y", range(3))]) + mda + mda.sel(x=(list("ab"), [0])) You can also select multiple elements by providing a list of labels or tuples or a slice of tuples: .. ipython:: python - mda.sel(x=[('a', 0), ('b', 1)]) + mda.sel(x=[('a', 0), ('b', 1)]) Additionally, xarray supports dictionaries: .. ipython:: python - mda.sel(x={'one': 'a', 'two': 0}) + mda.sel(x={'one': 'a', 'two': 0}) For convenience, ``sel`` also accepts multi-index levels directly as keyword arguments: .. ipython:: python - mda.sel(one='a', two=0) + mda.sel(one='a', two=0) Note that using ``sel`` it is not possible to mix a dimension indexer with level indexers for that dimension @@ -731,7 +757,7 @@ multi-index is reduced to a single index. .. ipython:: python - mda.loc[{'one': 'a'}, ...] + mda.loc[{'one': 'a'}, ...] Unlike pandas, xarray does not guess whether you provide index levels or dimensions when using ``loc`` in some ambiguous cases. For example, for From e70138b61033081e3bfab3aaaec5997716cd7109 Mon Sep 17 00:00:00 2001 From: crusaderky Date: Wed, 13 Nov 2019 00:53:26 +0000 Subject: [PATCH 08/22] Recursive tokenization (#3515) * recursive tokenize * black * What's New * Also test Dataset * Also test IndexVariable * Cleanup * tokenize sparse objects --- doc/whats-new.rst | 2 +- xarray/core/dataarray.py | 4 +++- xarray/core/dataset.py | 6 +++++- xarray/core/variable.py | 8 ++++++-- xarray/tests/test_dask.py | 26 ++++++++++++++++++++++++++ xarray/tests/test_sparse.py | 4 ++++ 6 files changed, 45 insertions(+), 5 deletions(-) diff --git a/doc/whats-new.rst b/doc/whats-new.rst index 96f0ba9a4a6..620617c127a 100644 --- a/doc/whats-new.rst +++ b/doc/whats-new.rst @@ -73,7 +73,7 @@ New Features for xarray objects. 
Note that xarray objects with a dask.array backend already used deterministic hashing in previous releases; this change implements it when whole xarray objects are embedded in a dask graph, e.g. when :meth:`DataArray.map` is - invoked. (:issue:`3378`, :pull:`3446`) + invoked. (:issue:`3378`, :pull:`3446`, :pull:`3515`) By `Deepak Cherian `_ and `Guido Imperiale `_. diff --git a/xarray/core/dataarray.py b/xarray/core/dataarray.py index 5e164f420c8..a192fe08cee 100644 --- a/xarray/core/dataarray.py +++ b/xarray/core/dataarray.py @@ -755,7 +755,9 @@ def reset_coords( return dataset def __dask_tokenize__(self): - return (type(self), self._variable, self._coords, self._name) + from dask.base import normalize_token + + return normalize_token((type(self), self._variable, self._coords, self._name)) def __dask_graph__(self): return self._to_temp_dataset().__dask_graph__() diff --git a/xarray/core/dataset.py b/xarray/core/dataset.py index dc5a315e72a..fe8abdc4b95 100644 --- a/xarray/core/dataset.py +++ b/xarray/core/dataset.py @@ -652,7 +652,11 @@ def load(self, **kwargs) -> "Dataset": return self def __dask_tokenize__(self): - return (type(self), self._variables, self._coord_names, self._attrs) + from dask.base import normalize_token + + return normalize_token( + (type(self), self._variables, self._coord_names, self._attrs) + ) def __dask_graph__(self): graphs = {k: v.__dask_graph__() for k, v in self.variables.items()} diff --git a/xarray/core/variable.py b/xarray/core/variable.py index 916df75b3e0..f842a4a9428 100644 --- a/xarray/core/variable.py +++ b/xarray/core/variable.py @@ -393,7 +393,9 @@ def compute(self, **kwargs): def __dask_tokenize__(self): # Use v.data, instead of v._data, in order to cope with the wrappers # around NetCDF and the like - return type(self), self._dims, self.data, self._attrs + from dask.base import normalize_token + + return normalize_token((type(self), self._dims, self.data, self._attrs)) def __dask_graph__(self): if isinstance(self._data, dask_array_type): @@ -1973,8 +1975,10 @@ def __init__(self, dims, data, attrs=None, encoding=None, fastpath=False): self._data = PandasIndexAdapter(self._data) def __dask_tokenize__(self): + from dask.base import normalize_token + # Don't waste time converting pd.Index to np.ndarray - return (type(self), self._dims, self._data.array, self._attrs) + return normalize_token((type(self), self._dims, self._data.array, self._attrs)) def load(self): # data is already loaded into memory for IndexVariable diff --git a/xarray/tests/test_dask.py b/xarray/tests/test_dask.py index fa8ae9991d7..43b788153bc 100644 --- a/xarray/tests/test_dask.py +++ b/xarray/tests/test_dask.py @@ -1283,6 +1283,32 @@ def test_token_identical(obj, transform): ) +def test_recursive_token(): + """Test that tokenization is invoked recursively, and doesn't just rely on the + output of str() + """ + a = np.ones(10000) + b = np.ones(10000) + b[5000] = 2 + assert str(a) == str(b) + assert dask.base.tokenize(a) != dask.base.tokenize(b) + + # Test DataArray and Variable + da_a = DataArray(a) + da_b = DataArray(b) + assert dask.base.tokenize(da_a) != dask.base.tokenize(da_b) + + # Test Dataset + ds_a = da_a.to_dataset(name="x") + ds_b = da_b.to_dataset(name="x") + assert dask.base.tokenize(ds_a) != dask.base.tokenize(ds_b) + + # Test IndexVariable + da_a = DataArray(a, dims=["x"], coords={"x": a}) + da_b = DataArray(a, dims=["x"], coords={"x": b}) + assert dask.base.tokenize(da_a) != dask.base.tokenize(da_b) + + @requires_scipy_or_netCDF4 def 
test_normalize_token_with_backend(map_ds): with create_tmp_file(allow_cleanup_failure=ON_WINDOWS) as tmp_file: diff --git a/xarray/tests/test_sparse.py b/xarray/tests/test_sparse.py index a31da162487..a02fef2faeb 100644 --- a/xarray/tests/test_sparse.py +++ b/xarray/tests/test_sparse.py @@ -856,6 +856,10 @@ def test_dask_token(): import dask s = sparse.COO.from_numpy(np.array([0, 0, 1, 2])) + + # https://github.com/pydata/sparse/issues/300 + s.__dask_tokenize__ = lambda: dask.base.normalize_token(s.__dict__) + a = DataArray(s) t1 = dask.base.tokenize(a) t2 = dask.base.tokenize(a) From 94525bbaf417476dbe9a70b98801ae04aceaebf3 Mon Sep 17 00:00:00 2001 From: Deepak Cherian Date: Wed, 13 Nov 2019 15:48:45 +0000 Subject: [PATCH 09/22] Deprecate allow_lazy (#3435) * Deprecate allow_lazy * add whats-new * test that reductions are lazy * minor whats-new fix. * fix merge wahts=new * fix bad merge. * remove tests that only work with nep-18 * Update doc/whats-new.rst Co-Authored-By: Mathias Hauser * Update xarray/core/variable.py Co-Authored-By: Mathias Hauser * fix whats-new * Fix test that assumed NEP-18 * fix tests. --- doc/whats-new.rst | 3 +++ xarray/core/common.py | 17 ++++------------- xarray/core/dataset.py | 2 +- xarray/core/groupby.py | 4 +--- xarray/core/variable.py | 13 ++++++++++++- xarray/tests/test_dask.py | 18 ++++++++++++++++-- xarray/tests/test_variable.py | 4 ++++ 7 files changed, 41 insertions(+), 20 deletions(-) diff --git a/doc/whats-new.rst b/doc/whats-new.rst index 620617c127a..212e465b368 100644 --- a/doc/whats-new.rst +++ b/doc/whats-new.rst @@ -88,6 +88,9 @@ Bug fixes By `Deepak Cherian `_. - Sync with cftime by removing `dayofwk=-1` for cftime>=1.0.4. By `Anderson Banihirwe `_. +- Rolling reduction operations no longer compute dask arrays by default. (:issue:`3161`). + In addition, the ``allow_lazy`` kwarg to ``reduce`` is deprecated. + By `Deepak Cherian `_. - Fix :py:meth:`xarray.core.groupby.DataArrayGroupBy.reduce` and :py:meth:`xarray.core.groupby.DatasetGroupBy.reduce` when reducing over multiple dimensions. (:issue:`3402`). 
By `Deepak Cherian `_ diff --git a/xarray/core/common.py b/xarray/core/common.py index d372115ea57..2afe4b4c3a7 100644 --- a/xarray/core/common.py +++ b/xarray/core/common.py @@ -43,14 +43,12 @@ def _reduce_method(cls, func: Callable, include_skipna: bool, numeric_only: bool if include_skipna: def wrapped_func(self, dim=None, axis=None, skipna=None, **kwargs): - return self.reduce( - func, dim, axis, skipna=skipna, allow_lazy=True, **kwargs - ) + return self.reduce(func, dim, axis, skipna=skipna, **kwargs) else: def wrapped_func(self, dim=None, axis=None, **kwargs): # type: ignore - return self.reduce(func, dim, axis, allow_lazy=True, **kwargs) + return self.reduce(func, dim, axis, **kwargs) return wrapped_func @@ -83,20 +81,13 @@ def _reduce_method(cls, func: Callable, include_skipna: bool, numeric_only: bool def wrapped_func(self, dim=None, skipna=None, **kwargs): return self.reduce( - func, - dim, - skipna=skipna, - numeric_only=numeric_only, - allow_lazy=True, - **kwargs, + func, dim, skipna=skipna, numeric_only=numeric_only, **kwargs ) else: def wrapped_func(self, dim=None, **kwargs): # type: ignore - return self.reduce( - func, dim, numeric_only=numeric_only, allow_lazy=True, **kwargs - ) + return self.reduce(func, dim, numeric_only=numeric_only, **kwargs) return wrapped_func diff --git a/xarray/core/dataset.py b/xarray/core/dataset.py index fe8abdc4b95..15a7209ab24 100644 --- a/xarray/core/dataset.py +++ b/xarray/core/dataset.py @@ -4031,7 +4031,7 @@ def reduce( keep_attrs: bool = None, keepdims: bool = False, numeric_only: bool = False, - allow_lazy: bool = False, + allow_lazy: bool = None, **kwargs: Any, ) -> "Dataset": """Reduce this dataset by applying `func` along some dimension(s). diff --git a/xarray/core/groupby.py b/xarray/core/groupby.py index 8ae65d9b9df..c73ee3cf7c5 100644 --- a/xarray/core/groupby.py +++ b/xarray/core/groupby.py @@ -585,9 +585,7 @@ def _first_or_last(self, op, skipna, keep_attrs): return self._obj if keep_attrs is None: keep_attrs = _get_keep_attrs(default=True) - return self.reduce( - op, self._group_dim, skipna=skipna, keep_attrs=keep_attrs, allow_lazy=True - ) + return self.reduce(op, self._group_dim, skipna=skipna, keep_attrs=keep_attrs) def first(self, skipna=None, keep_attrs=None): """Return the first element of each group along the group dimension diff --git a/xarray/core/variable.py b/xarray/core/variable.py index f842a4a9428..cf97c997017 100644 --- a/xarray/core/variable.py +++ b/xarray/core/variable.py @@ -1,5 +1,6 @@ import functools import itertools +import warnings from collections import defaultdict from datetime import timedelta from distutils.version import LooseVersion @@ -1427,7 +1428,7 @@ def reduce( axis=None, keep_attrs=None, keepdims=False, - allow_lazy=False, + allow_lazy=None, **kwargs, ): """Reduce this array by applying `func` along some dimension(s). @@ -1468,7 +1469,17 @@ def reduce( if dim is not None: axis = self.get_axis_num(dim) + + if allow_lazy is not None: + warnings.warn( + "allow_lazy is deprecated and will be removed in version 0.16.0. 
It is now True by default.", + DeprecationWarning, + ) + else: + allow_lazy = True + input_data = self.data if allow_lazy else self.values + if axis is not None: data = func(input_data, axis=axis, **kwargs) else: diff --git a/xarray/tests/test_dask.py b/xarray/tests/test_dask.py index 43b788153bc..4c1f317342f 100644 --- a/xarray/tests/test_dask.py +++ b/xarray/tests/test_dask.py @@ -12,6 +12,7 @@ import xarray as xr import xarray.ufuncs as xu from xarray import DataArray, Dataset, Variable +from xarray.core import duck_array_ops from xarray.testing import assert_chunks_equal from xarray.tests import mock @@ -217,6 +218,8 @@ def test_reduce(self): self.assertLazyAndAllClose((u < 1).all("x"), (v < 1).all("x")) with raises_regex(NotImplementedError, "dask"): v.median() + with raise_if_dask_computes(): + v.reduce(duck_array_ops.mean) def test_missing_values(self): values = np.array([0, 1, np.nan, 3]) @@ -488,7 +491,17 @@ def test_groupby(self): v = self.lazy_array expected = u.groupby("x").mean(...) - actual = v.groupby("x").mean(...) + with raise_if_dask_computes(): + actual = v.groupby("x").mean(...) + self.assertLazyAndAllClose(expected, actual) + + def test_rolling(self): + u = self.eager_array + v = self.lazy_array + + expected = u.rolling(x=2).mean() + with raise_if_dask_computes(): + actual = v.rolling(x=2).mean() self.assertLazyAndAllClose(expected, actual) def test_groupby_first(self): @@ -500,7 +513,8 @@ def test_groupby_first(self): with raises_regex(NotImplementedError, "dask"): v.groupby("ab").first() expected = u.groupby("ab").first() - actual = v.groupby("ab").first(skipna=False) + with raise_if_dask_computes(): + actual = v.groupby("ab").first(skipna=False) self.assertLazyAndAllClose(expected, actual) def test_reindex(self): diff --git a/xarray/tests/test_variable.py b/xarray/tests/test_variable.py index 528027ed149..d394919dbdd 100644 --- a/xarray/tests/test_variable.py +++ b/xarray/tests/test_variable.py @@ -1477,6 +1477,10 @@ def test_reduce(self): with raises_regex(ValueError, "cannot supply both"): v.mean(dim="x", axis=0) + with pytest.warns(DeprecationWarning, match="allow_lazy is deprecated"): + v.mean(dim="x", allow_lazy=True) + with pytest.warns(DeprecationWarning, match="allow_lazy is deprecated"): + v.mean(dim="x", allow_lazy=False) def test_quantile(self): v = Variable(["x", "y"], self.d) From 7241aa12ae168f7af6efcf13f8012158a1331cb3 Mon Sep 17 00:00:00 2001 From: Deepak Cherian Date: Wed, 13 Nov 2019 15:53:34 +0000 Subject: [PATCH 10/22] warn if dim is passed to rolling operations. (#3513) * warn if dim is passed to rolling operations. * Update doc/whats-new.rst Co-Authored-By: Maximilian Roos <5635139+max-sixty@users.noreply.github.com> * Update xarray/core/rolling.py Co-Authored-By: Maximilian Roos <5635139+max-sixty@users.noreply.github.com> --- doc/whats-new.rst | 3 +++ xarray/core/rolling.py | 9 +++++++++ xarray/tests/test_dataarray.py | 6 ++++++ 3 files changed, 18 insertions(+) diff --git a/doc/whats-new.rst b/doc/whats-new.rst index 212e465b368..f042f846c39 100644 --- a/doc/whats-new.rst +++ b/doc/whats-new.rst @@ -220,6 +220,9 @@ Bug fixes By `Deepak Cherian `_. - Fix error in concatenating unlabeled dimensions (:pull:`3362`). By `Deepak Cherian `_. +- Warn if the ``dim`` kwarg is passed to rolling operations. This is redundant since a dimension is + specified when the :py:class:`DatasetRolling` or :py:class:`DataArrayRolling` object is created. + (:pull:`3362`). By `Deepak Cherian `_. 
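As an aside, a minimal sketch of the two deprecations described above (illustrative only, not part of the patch; the toy array is made up):

.. code-block:: python

    import numpy as np
    import xarray as xr

    da = xr.DataArray(np.arange(10.0), dims="time")

    # The rolling dimension is fixed when the Rolling object is created,
    # so reductions take no ``dim`` argument:
    smoothed = da.rolling(time=3).mean()

    # Passing ``dim`` anyway emits a DeprecationWarning and will raise an
    # error in xarray 0.16.0:
    da.rolling(time=3).mean(dim="time")

    # ``allow_lazy`` is deprecated the same way; lazy reduction of
    # dask-backed data is now the default:
    v = xr.Variable("time", np.arange(10.0))
    v.mean(dim="time", allow_lazy=True)  # DeprecationWarning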
Documentation ~~~~~~~~~~~~~ diff --git a/xarray/core/rolling.py b/xarray/core/rolling.py index f4e571a8efe..a1864332f4d 100644 --- a/xarray/core/rolling.py +++ b/xarray/core/rolling.py @@ -1,4 +1,5 @@ import functools +import warnings from typing import Callable import numpy as np @@ -351,6 +352,14 @@ def _bottleneck_reduce(self, func, **kwargs): def _numpy_or_bottleneck_reduce( self, array_agg_func, bottleneck_move_func, **kwargs ): + if "dim" in kwargs: + warnings.warn( + f"Reductions will be applied along the rolling dimension '{self.dim}'. Passing the 'dim' kwarg to reduction operations has no effect and will raise an error in xarray 0.16.0.", + DeprecationWarning, + stacklevel=3, + ) + del kwargs["dim"] + if bottleneck_move_func is not None and not isinstance( self.obj.data, dask_array_type ): diff --git a/xarray/tests/test_dataarray.py b/xarray/tests/test_dataarray.py index 42fae2c9dd4..7c6dc1825a1 100644 --- a/xarray/tests/test_dataarray.py +++ b/xarray/tests/test_dataarray.py @@ -4188,6 +4188,9 @@ def test_rolling_wrapped_bottleneck(da, name, center, min_periods): ) assert_array_equal(actual.values, expected) + with pytest.warns(DeprecationWarning, match="Reductions will be applied"): + getattr(rolling_obj, name)(dim="time") + # Test center rolling_obj = da.rolling(time=7, center=center) actual = getattr(rolling_obj, name)()["time"] @@ -4203,6 +4206,9 @@ def test_rolling_wrapped_dask(da_dask, name, center, min_periods, window): # dask version rolling_obj = da_dask.rolling(time=window, min_periods=min_periods, center=center) actual = getattr(rolling_obj, name)().load() + if name != "count": + with pytest.warns(DeprecationWarning, match="Reductions will be applied"): + getattr(rolling_obj, name)(dim="time") # numpy version rolling_obj = da_dask.load().rolling( time=window, min_periods=min_periods, center=center From 40588dc38ddc2d573e3dc8c63b2e6533eb978656 Mon Sep 17 00:00:00 2001 From: Akihiro Matsukawa Date: Wed, 13 Nov 2019 10:55:32 -0500 Subject: [PATCH 11/22] Allow appending datetime & boolean variables to zarr stores (#3504) * Allow appending datetime and boolean data variables to zarr stores. * Run black and flake8 * Update error message --- doc/whats-new.rst | 2 ++ xarray/backends/api.py | 7 +++++-- xarray/tests/test_dataset.py | 14 ++++++++++++++ 3 files changed, 21 insertions(+), 2 deletions(-) diff --git a/doc/whats-new.rst b/doc/whats-new.rst index f042f846c39..ea3b012cc98 100644 --- a/doc/whats-new.rst +++ b/doc/whats-new.rst @@ -94,6 +94,8 @@ Bug fixes - Fix :py:meth:`xarray.core.groupby.DataArrayGroupBy.reduce` and :py:meth:`xarray.core.groupby.DatasetGroupBy.reduce` when reducing over multiple dimensions. (:issue:`3402`). By `Deepak Cherian `_ +- Allow appending datetime and bool data variables to zarr stores. + (:issue:`3480`). By `Akihiro Matsukawa `_. 
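For illustration, a minimal sketch of what this fix enables, assuming the ``zarr`` package is installed; the store path ``example_store.zarr`` and the variable names are made up:

.. code-block:: python

    import numpy as np
    import xarray as xr

    dt1 = np.array(["2019-01-01", "2019-01-02", "2019-01-03"], dtype="datetime64[s]")
    dt2 = np.array(["2019-01-04", "2019-01-05"], dtype="datetime64[s]")

    ds1 = xr.Dataset(
        {"when": ("time", dt1), "flag": ("time", np.array([True, False, True]))}
    )
    ds2 = xr.Dataset(
        {"when": ("time", dt2), "flag": ("time", np.array([False, True]))}
    )

    ds1.to_zarr("example_store.zarr", mode="w")
    # Before this fix, the dtype check rejected datetime64 and bool variables:
    ds2.to_zarr("example_store.zarr", mode="a", append_dim="time")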
Documentation ~~~~~~~~~~~~~ diff --git a/xarray/backends/api.py b/xarray/backends/api.py index d23594fc675..945b3937c43 100644 --- a/xarray/backends/api.py +++ b/xarray/backends/api.py @@ -1234,6 +1234,8 @@ def _validate_datatypes_for_zarr_append(dataset): def check_dtype(var): if ( not np.issubdtype(var.dtype, np.number) + and not np.issubdtype(var.dtype, np.datetime64) + and not np.issubdtype(var.dtype, np.bool) and not coding.strings.is_unicode_dtype(var.dtype) and not var.dtype == object ): @@ -1241,8 +1243,9 @@ def check_dtype(var): raise ValueError( "Invalid dtype for data variable: {} " "dtype must be a subtype of number, " - "a fixed sized string, a fixed size " - "unicode string or an object".format(var) + "datetime, bool, a fixed sized string, " + "a fixed size unicode string or an " + "object".format(var) ) for k in dataset.data_vars.values(): diff --git a/xarray/tests/test_dataset.py b/xarray/tests/test_dataset.py index d001c43da94..67d3b3198dc 100644 --- a/xarray/tests/test_dataset.py +++ b/xarray/tests/test_dataset.py @@ -90,6 +90,14 @@ def create_append_test_data(seed=None): string_var = np.array(["ae", "bc", "df"], dtype=object) string_var_to_append = np.array(["asdf", "asdfg"], dtype=object) unicode_var = ["áó", "áó", "áó"] + datetime_var = np.array( + ["2019-01-01", "2019-01-02", "2019-01-03"], dtype="datetime64[s]" + ) + datetime_var_to_append = np.array( + ["2019-01-04", "2019-01-05"], dtype="datetime64[s]" + ) + bool_var = np.array([True, False, True], dtype=np.bool) + bool_var_to_append = np.array([False, True], dtype=np.bool) ds = xr.Dataset( data_vars={ @@ -102,6 +110,8 @@ def create_append_test_data(seed=None): "unicode_var": xr.DataArray( unicode_var, coords=[time1], dims=["time"] ).astype(np.unicode_), + "datetime_var": xr.DataArray(datetime_var, coords=[time1], dims=["time"]), + "bool_var": xr.DataArray(bool_var, coords=[time1], dims=["time"]), } ) @@ -118,6 +128,10 @@ def create_append_test_data(seed=None): "unicode_var": xr.DataArray( unicode_var[:nt2], coords=[time2], dims=["time"] ).astype(np.unicode_), + "datetime_var": xr.DataArray( + datetime_var_to_append, coords=[time2], dims=["time"] + ), + "bool_var": xr.DataArray(bool_var_to_append, coords=[time2], dims=["time"]), } ) From 810345c4564a2bc15bf1b4c7ba4c4840238f1e82 Mon Sep 17 00:00:00 2001 From: Gina Date: Wed, 13 Nov 2019 14:18:14 -0600 Subject: [PATCH 12/22] FUNDING.yml (#3523) add NumFOCUS github sponsors button (recurring donations only) This feature launched today at GitHub Universe! Also add the custom link to point to the donation form for xarray. 
cc @shoyer --- .github/FUNDING.yml | 2 ++ 1 file changed, 2 insertions(+) create mode 100644 .github/FUNDING.yml diff --git a/.github/FUNDING.yml b/.github/FUNDING.yml new file mode 100644 index 00000000000..30c1e18f33c --- /dev/null +++ b/.github/FUNDING.yml @@ -0,0 +1,2 @@ +github: numfocus +custom: http://numfocus.org/donate-to-xarray From eece07932d5498a8abef6a8fbd30d00066931b18 Mon Sep 17 00:00:00 2001 From: Anderson Banihirwe Date: Wed, 13 Nov 2019 18:22:50 -0700 Subject: [PATCH 13/22] Harmonize `FillValue` and `missing_value` during encoding and decoding steps (#3502) * Replace `equivalent()` with `allclose_or_equiv()` * Ensure _FillValue & missing_value are cast to same dtype as data's * Use Numpy scalar during type casting * Update ValueError message * Formatting only * Update whats-new.rst --- doc/whats-new.rst | 2 ++ xarray/coding/variables.py | 14 ++++++++++---- xarray/tests/test_coding.py | 17 +++++++++++++++++ 3 files changed, 29 insertions(+), 4 deletions(-) diff --git a/doc/whats-new.rst b/doc/whats-new.rst index ea3b012cc98..f840557ab5d 100644 --- a/doc/whats-new.rst +++ b/doc/whats-new.rst @@ -79,6 +79,8 @@ New Features Bug fixes ~~~~~~~~~ +- Harmonize `_FillValue`, `missing_value` during encoding and decoding steps. (:pull:`3502`) + By `Anderson Banihirwe `_. - Fix regression introduced in v0.14.0 that would cause a crash if dask is installed but cloudpickle isn't (:issue:`3401`) by `Rhys Doyle `_ - Fix grouping over variables with NaNs. (:issue:`2383`, :pull:`3406`). diff --git a/xarray/coding/variables.py b/xarray/coding/variables.py index 5f9c8932b6b..2b5f87ab0cd 100644 --- a/xarray/coding/variables.py +++ b/xarray/coding/variables.py @@ -8,7 +8,6 @@ from ..core import dtypes, duck_array_ops, indexing from ..core.pycompat import dask_array_type -from ..core.utils import equivalent from ..core.variable import Variable @@ -152,18 +151,25 @@ def encode(self, variable, name=None): fv = encoding.get("_FillValue") mv = encoding.get("missing_value") - if fv is not None and mv is not None and not equivalent(fv, mv): + if ( + fv is not None + and mv is not None + and not duck_array_ops.allclose_or_equiv(fv, mv) + ): raise ValueError( - "Variable {!r} has multiple fill values {}. " - "Cannot encode data. ".format(name, [fv, mv]) + f"Variable {name!r} has conflicting _FillValue ({fv}) and missing_value ({mv}). Cannot encode data." 
) if fv is not None: + # Ensure _FillValue is cast to same dtype as data's + encoding["_FillValue"] = data.dtype.type(fv) fill_value = pop_to(encoding, attrs, "_FillValue", name=name) if not pd.isnull(fill_value): data = duck_array_ops.fillna(data, fill_value) if mv is not None: + # Ensure missing_value is cast to same dtype as data's + encoding["missing_value"] = data.dtype.type(mv) fill_value = pop_to(encoding, attrs, "missing_value", name=name) if not pd.isnull(fill_value) and fv is None: data = duck_array_ops.fillna(data, fill_value) diff --git a/xarray/tests/test_coding.py b/xarray/tests/test_coding.py index 6cd584daa96..3e0474e7b60 100644 --- a/xarray/tests/test_coding.py +++ b/xarray/tests/test_coding.py @@ -20,6 +20,23 @@ def test_CFMaskCoder_decode(): assert_identical(expected, encoded) +def test_CFMaskCoder_encode_missing_fill_values_conflict(): + original = xr.Variable( + ("x",), + [0.0, -1.0, 1.0], + encoding={"_FillValue": np.float32(1e20), "missing_value": np.float64(1e20)}, + ) + coder = variables.CFMaskCoder() + encoded = coder.encode(original) + + assert encoded.dtype == encoded.attrs["missing_value"].dtype + assert encoded.dtype == encoded.attrs["_FillValue"].dtype + + with pytest.warns(variables.SerializationWarning): + roundtripped = coder.decode(coder.encode(original)) + assert_identical(roundtripped, original) + + def test_CFMaskCoder_missing_value(): expected = xr.DataArray( np.array([[26915, 27755, -9999, 27705], [25595, -9999, 28315, -9999]]), From 4358762d7ccf0d81dfbbc37d9c0665d53fe9c426 Mon Sep 17 00:00:00 2001 From: keewis Date: Thu, 14 Nov 2019 02:24:07 +0100 Subject: [PATCH 14/22] Tests for module-level functions with units (#3493) * add tests for replication functions * add tests for `xarray.dot` * add tests for apply_ufunc * explicitly set the test ids to repr * add tests for align * cover a bit more of align * add tests for broadcast * black changed how tuple unpacking should look * correct the xfail message for full_like tests * add tests for where * add tests for concat * add tests for combine_by_coords * fix a bug in convert_units * convert the align results to the same units * rename the combine_by_coords test * convert the units for expected in combine_by_coords * add tests for combine_nested * add tests for merge with datasets * only use three datasets for merging * add tests for merge with dataarrays * update whats-new.rst --- doc/whats-new.rst | 3 +- xarray/tests/test_units.py | 871 ++++++++++++++++++++++++++++++++++++- 2 files changed, 865 insertions(+), 9 deletions(-) diff --git a/doc/whats-new.rst b/doc/whats-new.rst index f840557ab5d..a7687368884 100644 --- a/doc/whats-new.rst +++ b/doc/whats-new.rst @@ -118,7 +118,8 @@ Internal Changes ~~~~~~~~~~~~~~~~ - Added integration tests against `pint `_. - (:pull:`3238`, :pull:`3447`, :pull:`3508`) by `Justus Magin `_. + (:pull:`3238`, :pull:`3447`, :pull:`3493`, :pull:`3508`) + by `Justus Magin `_. ..
note:: diff --git a/xarray/tests/test_units.py b/xarray/tests/test_units.py index fd9e9b039ac..509a50d23ff 100644 --- a/xarray/tests/test_units.py +++ b/xarray/tests/test_units.py @@ -222,7 +222,9 @@ def convert_units(obj, to): if name != obj.name } - new_obj = xr.DataArray(name=name, data=data, coords=coords, attrs=obj.attrs) + new_obj = xr.DataArray( + name=name, data=data, coords=coords, attrs=obj.attrs, dims=obj.dims + ) elif isinstance(obj, unit_registry.Quantity): units = to.get(None) new_obj = obj.to(units) if units is not None else obj @@ -307,19 +309,689 @@ def __repr__(self): class function: - def __init__(self, name): - self.name = name - self.func = getattr(np, name) + def __init__(self, name_or_function, *args, **kwargs): + if callable(name_or_function): + self.name = name_or_function.__name__ + self.func = name_or_function + else: + self.name = name_or_function + self.func = getattr(np, name_or_function) + if self.func is None: + raise AttributeError( + f"module 'numpy' has no attribute named '{self.name}'" + ) + + self.args = args + self.kwargs = kwargs def __call__(self, *args, **kwargs): - return self.func(*args, **kwargs) + all_args = list(self.args) + list(args) + all_kwargs = {**self.kwargs, **kwargs} + + return self.func(*all_args, **all_kwargs) def __repr__(self): return f"function_{self.name}" +def test_apply_ufunc_dataarray(dtype): + func = function( + xr.apply_ufunc, np.mean, input_core_dims=[["x"]], kwargs={"axis": -1} + ) + + array = np.linspace(0, 10, 20).astype(dtype) * unit_registry.m + x = np.arange(20) * unit_registry.s + data_array = xr.DataArray(data=array, dims="x", coords={"x": x}) + + expected = attach_units(func(strip_units(data_array)), extract_units(data_array)) + result = func(data_array) + + assert_equal_with_units(expected, result) + + +@pytest.mark.xfail( + reason="pint does not implement `np.result_type` and align strips units" +) +@pytest.mark.parametrize( + "unit,error", + ( + pytest.param(1, DimensionalityError, id="no_unit"), + pytest.param( + unit_registry.dimensionless, DimensionalityError, id="dimensionless" + ), + pytest.param(unit_registry.s, DimensionalityError, id="incompatible_unit"), + pytest.param(unit_registry.mm, None, id="compatible_unit"), + pytest.param(unit_registry.m, None, id="identical_unit"), + ), + ids=repr, +) +@pytest.mark.parametrize( + "variant", + ( + "data", + pytest.param("dims", marks=pytest.mark.xfail(reason="indexes strip units")), + "coords", + ), +) +@pytest.mark.parametrize("fill_value", (np.float64(10), np.float64(np.nan))) +def test_align_dataarray(fill_value, variant, unit, error, dtype): + original_unit = unit_registry.m + + variants = { + "data": (unit, original_unit, original_unit), + "dims": (original_unit, unit, original_unit), + "coords": (original_unit, original_unit, unit), + } + data_unit, dim_unit, coord_unit = variants.get(variant) + + array1 = np.linspace(0, 10, 2 * 5).reshape(2, 5).astype(dtype) * original_unit + array2 = np.linspace(0, 8, 2 * 5).reshape(2, 5).astype(dtype) * data_unit + x = np.arange(2) * original_unit + x_a1 = np.array([10, 5]) * original_unit + x_a2 = np.array([10, 5]) * coord_unit + + y1 = np.arange(5) * original_unit + y2 = np.arange(2, 7) * dim_unit + + data_array1 = xr.DataArray( + data=array1, coords={"x": x, "x_a": ("x", x_a1), "y": y1}, dims=("x", "y") + ) + data_array2 = xr.DataArray( + data=array2, coords={"x": x, "x_a": ("x", x_a2), "y": y2}, dims=("x", "y") + ) + + fill_value = fill_value * data_unit + func = function(xr.align, join="outer", 
fill_value=fill_value) + if error is not None: + with pytest.raises(error): + func(data_array1, data_array2) + + return + + stripped_kwargs = { + key: strip_units( + convert_units(value, {None: original_unit}) + if isinstance(value, unit_registry.Quantity) + else value + ) + for key, value in func.kwargs.items() + } + units = extract_units(data_array1) + # FIXME: should the expected_b have the same units as data_array1 + # or data_array2? + expected_a, expected_b = tuple( + attach_units(elem, units) + for elem in func( + strip_units(data_array1), + strip_units(convert_units(data_array2, units)), + **stripped_kwargs, + ) + ) + result_a, result_b = func(data_array1, data_array2) + + assert_equal_with_units(expected_a, result_a) + assert_equal_with_units(expected_b, result_b) + + +@pytest.mark.xfail( + reason="pint does not implement `np.result_type` and align strips units" +) +@pytest.mark.parametrize( + "unit,error", + ( + pytest.param(1, DimensionalityError, id="no_unit"), + pytest.param( + unit_registry.dimensionless, DimensionalityError, id="dimensionless" + ), + pytest.param(unit_registry.s, DimensionalityError, id="incompatible_unit"), + pytest.param(unit_registry.mm, None, id="compatible_unit"), + pytest.param(unit_registry.m, None, id="identical_unit"), + ), + ids=repr, +) +@pytest.mark.parametrize( + "variant", + ( + "data", + pytest.param("dims", marks=pytest.mark.xfail(reason="indexes strip units")), + "coords", + ), +) +@pytest.mark.parametrize("fill_value", (np.float64(10), np.float64(np.nan))) +def test_align_dataset(fill_value, unit, variant, error, dtype): + original_unit = unit_registry.m + + variants = { + "data": (unit, original_unit, original_unit), + "dims": (original_unit, unit, original_unit), + "coords": (original_unit, original_unit, unit), + } + data_unit, dim_unit, coord_unit = variants.get(variant) + + array1 = np.linspace(0, 10, 2 * 5).reshape(2, 5).astype(dtype) * original_unit + array2 = np.linspace(0, 10, 2 * 5).reshape(2, 5).astype(dtype) * data_unit + + x = np.arange(2) * original_unit + x_a1 = np.array([10, 5]) * original_unit + x_a2 = np.array([10, 5]) * coord_unit + + y1 = np.arange(5) * original_unit + y2 = np.arange(2, 7) * dim_unit + + ds1 = xr.Dataset( + data_vars={"a": (("x", "y"), array1)}, + coords={"x": x, "x_a": ("x", x_a1), "y": y1}, + ) + ds2 = xr.Dataset( + data_vars={"a": (("x", "y"), array2)}, + coords={"x": x, "x_a": ("x", x_a2), "y": y2}, + ) + + fill_value = fill_value * data_unit + func = function(xr.align, join="outer", fill_value=fill_value) + if error is not None: + with pytest.raises(error): + func(ds1, ds2) + + return + + stripped_kwargs = { + key: strip_units( + convert_units(value, {None: original_unit}) + if isinstance(value, unit_registry.Quantity) + else value + ) + for key, value in func.kwargs.items() + } + units = extract_units(ds1) + # FIXME: should the expected_b have the same units as ds1 or ds2? 
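+    # note (illustration): `units` above comes from ds1, so both expected_a and
+    # expected_b end up expressed in ds1's units; extract_units(ds2) would be
+    # the other defensible choice for expected_b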
+ expected_a, expected_b = tuple( + attach_units(elem, units) + for elem in func( + strip_units(ds1), strip_units(convert_units(ds2, units)), **stripped_kwargs + ) + ) + result_a, result_b = func(ds1, ds2) + + assert_equal_with_units(expected_a, result_a) + assert_equal_with_units(expected_b, result_b) + + +def test_broadcast_dataarray(dtype): + array1 = np.linspace(0, 10, 2) * unit_registry.Pa + array2 = np.linspace(0, 10, 3) * unit_registry.Pa + + a = xr.DataArray(data=array1, dims="x") + b = xr.DataArray(data=array2, dims="y") + + expected_a, expected_b = tuple( + attach_units(elem, extract_units(a)) + for elem in xr.broadcast(strip_units(a), strip_units(b)) + ) + result_a, result_b = xr.broadcast(a, b) + + assert_equal_with_units(expected_a, result_a) + assert_equal_with_units(expected_b, result_b) + + +def test_broadcast_dataset(dtype): + array1 = np.linspace(0, 10, 2) * unit_registry.Pa + array2 = np.linspace(0, 10, 3) * unit_registry.Pa + + ds = xr.Dataset(data_vars={"a": ("x", array1), "b": ("y", array2)}) + + (expected,) = tuple( + attach_units(elem, extract_units(ds)) for elem in xr.broadcast(strip_units(ds)) + ) + (result,) = xr.broadcast(ds) + + assert_equal_with_units(expected, result) + + +@pytest.mark.xfail(reason="`combine_by_coords` strips units") +@pytest.mark.parametrize( + "unit,error", + ( + pytest.param(1, DimensionalityError, id="no_unit"), + pytest.param( + unit_registry.dimensionless, DimensionalityError, id="dimensionless" + ), + pytest.param(unit_registry.s, DimensionalityError, id="incompatible_unit"), + pytest.param(unit_registry.mm, None, id="compatible_unit"), + pytest.param(unit_registry.m, None, id="identical_unit"), + ), + ids=repr, +) +@pytest.mark.parametrize( + "variant", + ( + "data", + pytest.param("dims", marks=pytest.mark.xfail(reason="indexes strip units")), + "coords", + ), +) +def test_combine_by_coords(variant, unit, error, dtype): + original_unit = unit_registry.m + + variants = { + "data": (unit, original_unit, original_unit), + "dims": (original_unit, unit, original_unit), + "coords": (original_unit, original_unit, unit), + } + data_unit, dim_unit, coord_unit = variants.get(variant) + + array1 = np.zeros(shape=(2, 3), dtype=dtype) * original_unit + array2 = np.zeros(shape=(2, 3), dtype=dtype) * original_unit + x = np.arange(1, 4) * 10 * original_unit + y = np.arange(2) * original_unit + z = np.arange(3) * original_unit + + other_array1 = np.ones_like(array1) * data_unit + other_array2 = np.ones_like(array2) * data_unit + other_x = np.arange(1, 4) * 10 * dim_unit + other_y = np.arange(2, 4) * dim_unit + other_z = np.arange(3, 6) * coord_unit + + ds = xr.Dataset( + data_vars={"a": (("y", "x"), array1), "b": (("y", "x"), array2)}, + coords={"x": x, "y": y, "z": ("x", z)}, + ) + other = xr.Dataset( + data_vars={"a": (("y", "x"), other_array1), "b": (("y", "x"), other_array2)}, + coords={"x": other_x, "y": other_y, "z": ("x", other_z)}, + ) + + if error is not None: + with pytest.raises(error): + xr.combine_by_coords([ds, other]) + + return + + units = extract_units(ds) + expected = attach_units( + xr.combine_by_coords( + [strip_units(ds), strip_units(convert_units(other, units))] + ), + units, + ) + result = xr.combine_by_coords([ds, other]) + + assert_equal_with_units(expected, result) + + +@pytest.mark.xfail(reason="blocked by `where`") +@pytest.mark.parametrize( + "unit,error", + ( + pytest.param(1, DimensionalityError, id="no_unit"), + pytest.param( + unit_registry.dimensionless, DimensionalityError, id="dimensionless" + ), + 
pytest.param(unit_registry.s, DimensionalityError, id="incompatible_unit"), + pytest.param(unit_registry.mm, None, id="compatible_unit"), + pytest.param(unit_registry.m, None, id="identical_unit"), + ), + ids=repr, +) +@pytest.mark.parametrize( + "variant", + ( + "data", + pytest.param("dims", marks=pytest.mark.xfail(reason="indexes strip units")), + "coords", + ), +) +def test_combine_nested(variant, unit, error, dtype): + original_unit = unit_registry.m + + variants = { + "data": (unit, original_unit, original_unit), + "dims": (original_unit, unit, original_unit), + "coords": (original_unit, original_unit, unit), + } + data_unit, dim_unit, coord_unit = variants.get(variant) + + array1 = np.zeros(shape=(2, 3), dtype=dtype) * original_unit + array2 = np.zeros(shape=(2, 3), dtype=dtype) * original_unit + + x = np.arange(1, 4) * 10 * original_unit + y = np.arange(2) * original_unit + z = np.arange(3) * original_unit + + ds1 = xr.Dataset( + data_vars={"a": (("y", "x"), array1), "b": (("y", "x"), array2)}, + coords={"x": x, "y": y, "z": ("x", z)}, + ) + ds2 = xr.Dataset( + data_vars={ + "a": (("y", "x"), np.ones_like(array1) * data_unit), + "b": (("y", "x"), np.ones_like(array2) * data_unit), + }, + coords={ + "x": np.arange(3) * dim_unit, + "y": np.arange(2, 4) * dim_unit, + "z": ("x", np.arange(-3, 0) * coord_unit), + }, + ) + ds3 = xr.Dataset( + data_vars={ + "a": (("y", "x"), np.zeros_like(array1) * np.nan * data_unit), + "b": (("y", "x"), np.zeros_like(array2) * np.nan * data_unit), + }, + coords={ + "x": np.arange(3, 6) * dim_unit, + "y": np.arange(4, 6) * dim_unit, + "z": ("x", np.arange(3, 6) * coord_unit), + }, + ) + ds4 = xr.Dataset( + data_vars={ + "a": (("y", "x"), -1 * np.ones_like(array1) * data_unit), + "b": (("y", "x"), -1 * np.ones_like(array2) * data_unit), + }, + coords={ + "x": np.arange(6, 9) * dim_unit, + "y": np.arange(6, 8) * dim_unit, + "z": ("x", np.arange(6, 9) * coord_unit), + }, + ) + + func = function(xr.combine_nested, concat_dim=["x", "y"]) + if error is not None: + with pytest.raises(error): + func([[ds1, ds2], [ds3, ds4]]) + + return + + units = extract_units(ds1) + convert_and_strip = lambda ds: strip_units(convert_units(ds, units)) + expected = attach_units( + func( + [ + [strip_units(ds1), convert_and_strip(ds2)], + [convert_and_strip(ds3), convert_and_strip(ds4)], + ] + ), + units, + ) + result = func([[ds1, ds2], [ds3, ds4]]) + + assert_equal_with_units(expected, result) + + +@pytest.mark.xfail(reason="`concat` strips units") +@pytest.mark.parametrize( + "unit,error", + ( + pytest.param(1, DimensionalityError, id="no_unit"), + pytest.param( + unit_registry.dimensionless, DimensionalityError, id="dimensionless" + ), + pytest.param(unit_registry.s, DimensionalityError, id="incompatible_unit"), + pytest.param(unit_registry.mm, None, id="compatible_unit"), + pytest.param(unit_registry.m, None, id="identical_unit"), + ), + ids=repr, +) +@pytest.mark.parametrize( + "variant", + ( + "data", + pytest.param("dims", marks=pytest.mark.xfail(reason="indexes strip units")), + ), +) +def test_concat_dataarray(variant, unit, error, dtype): + original_unit = unit_registry.m + + variants = {"data": (unit, original_unit), "dims": (original_unit, unit)} + data_unit, dims_unit = variants.get(variant) + + array1 = np.linspace(0, 5, 10).astype(dtype) * unit_registry.m + array2 = np.linspace(-5, 0, 5).astype(dtype) * data_unit + x1 = np.arange(5, 15) * original_unit + x2 = np.arange(5) * dims_unit + + arr1 = xr.DataArray(data=array1, coords={"x": x1}, dims="x") + arr2 = 
xr.DataArray(data=array2, coords={"x": x2}, dims="x") + + if error is not None: + with pytest.raises(error): + xr.concat([arr1, arr2], dim="x") + + return + + expected = attach_units( + xr.concat([strip_units(arr1), strip_units(arr2)], dim="x"), extract_units(arr1) + ) + result = xr.concat([arr1, arr2], dim="x") + + assert_equal_with_units(expected, result) + + +@pytest.mark.xfail(reason="`concat` strips units") +@pytest.mark.parametrize( + "unit,error", + ( + pytest.param(1, DimensionalityError, id="no_unit"), + pytest.param( + unit_registry.dimensionless, DimensionalityError, id="dimensionless" + ), + pytest.param(unit_registry.s, DimensionalityError, id="incompatible_unit"), + pytest.param(unit_registry.mm, None, id="compatible_unit"), + pytest.param(unit_registry.m, None, id="identical_unit"), + ), + ids=repr, +) +@pytest.mark.parametrize( + "variant", + ( + "data", + pytest.param("dims", marks=pytest.mark.xfail(reason="indexes strip units")), + ), +) +def test_concat_dataset(variant, unit, error, dtype): + original_unit = unit_registry.m + + variants = {"data": (unit, original_unit), "dims": (original_unit, unit)} + data_unit, dims_unit = variants.get(variant) + + array1 = np.linspace(0, 5, 10).astype(dtype) * unit_registry.m + array2 = np.linspace(-5, 0, 5).astype(dtype) * data_unit + x1 = np.arange(5, 15) * original_unit + x2 = np.arange(5) * dims_unit + + ds1 = xr.Dataset(data_vars={"a": ("x", array1)}, coords={"x": x1}) + ds2 = xr.Dataset(data_vars={"a": ("x", array2)}, coords={"x": x2}) + + if error is not None: + with pytest.raises(error): + xr.concat([ds1, ds2], dim="x") + + return + + expected = attach_units( + xr.concat([strip_units(ds1), strip_units(ds2)], dim="x"), extract_units(ds1) + ) + result = xr.concat([ds1, ds2], dim="x") + + assert_equal_with_units(expected, result) + + +@pytest.mark.xfail(reason="blocked by `where`") +@pytest.mark.parametrize( + "unit,error", + ( + pytest.param(1, DimensionalityError, id="no_unit"), + pytest.param( + unit_registry.dimensionless, DimensionalityError, id="dimensionless" + ), + pytest.param(unit_registry.s, DimensionalityError, id="incompatible_unit"), + pytest.param(unit_registry.mm, None, id="compatible_unit"), + pytest.param(unit_registry.m, None, id="identical_unit"), + ), + ids=repr, +) +@pytest.mark.parametrize( + "variant", + ( + "data", + pytest.param("dims", marks=pytest.mark.xfail(reason="indexes strip units")), + "coords", + ), +) +def test_merge_dataarray(variant, unit, error, dtype): + original_unit = unit_registry.m + + variants = { + "data": (unit, original_unit, original_unit), + "dims": (original_unit, unit, original_unit), + "coords": (original_unit, original_unit, unit), + } + data_unit, dim_unit, coord_unit = variants.get(variant) + + array1 = np.linspace(0, 1, 2 * 3).reshape(2, 3).astype(dtype) * original_unit + array2 = np.linspace(1, 2, 2 * 4).reshape(2, 4).astype(dtype) * data_unit + array3 = np.linspace(0, 2, 3 * 4).reshape(3, 4).astype(dtype) * data_unit + + x = np.arange(2) * original_unit + y = np.arange(3) * original_unit + z = np.arange(4) * original_unit + u = np.linspace(10, 20, 2) * original_unit + v = np.linspace(10, 20, 3) * original_unit + w = np.linspace(10, 20, 4) * original_unit + + arr1 = xr.DataArray( + name="a", + data=array1, + coords={"x": x, "y": y, "u": ("x", u), "v": ("y", v)}, + dims=("x", "y"), + ) + arr2 = xr.DataArray( + name="b", + data=array2, + coords={ + "x": np.arange(2, 4) * dim_unit, + "z": z, + "u": ("x", np.linspace(20, 30, 2) * coord_unit), + "w": ("z", w), + }, + dims=("x", 
"z"), + ) + arr3 = xr.DataArray( + name="c", + data=array3, + coords={ + "y": np.arange(3, 6) * dim_unit, + "z": np.arange(4, 8) * dim_unit, + "v": ("y", np.linspace(10, 20, 3) * coord_unit), + "w": ("z", np.linspace(10, 20, 4) * coord_unit), + }, + dims=("y", "z"), + ) + + func = function(xr.merge) + if error is not None: + with pytest.raises(error): + func([arr1, arr2, arr3]) + + return + + units = {name: original_unit for name in list("abcuvwxyz")} + convert_and_strip = lambda arr: strip_units(convert_units(arr, units)) + expected = attach_units( + func([strip_units(arr1), convert_and_strip(arr2), convert_and_strip(arr3)]), + units, + ) + result = func([arr1, arr2, arr3]) + + assert_equal_with_units(expected, result) + + +@pytest.mark.xfail(reason="blocked by `where`") +@pytest.mark.parametrize( + "unit,error", + ( + pytest.param(1, DimensionalityError, id="no_unit"), + pytest.param( + unit_registry.dimensionless, DimensionalityError, id="dimensionless" + ), + pytest.param(unit_registry.s, DimensionalityError, id="incompatible_unit"), + pytest.param(unit_registry.mm, None, id="compatible_unit"), + pytest.param(unit_registry.m, None, id="identical_unit"), + ), + ids=repr, +) +@pytest.mark.parametrize( + "variant", + ( + "data", + pytest.param("dims", marks=pytest.mark.xfail(reason="indexes strip units")), + "coords", + ), +) +def test_merge_dataset(variant, unit, error, dtype): + original_unit = unit_registry.m + + variants = { + "data": (unit, original_unit, original_unit), + "dims": (original_unit, unit, original_unit), + "coords": (original_unit, original_unit, unit), + } + data_unit, dim_unit, coord_unit = variants.get(variant) + + array1 = np.zeros(shape=(2, 3), dtype=dtype) * original_unit + array2 = np.zeros(shape=(2, 3), dtype=dtype) * original_unit + + x = np.arange(11, 14) * original_unit + y = np.arange(2) * original_unit + z = np.arange(3) * original_unit + + ds1 = xr.Dataset( + data_vars={"a": (("y", "x"), array1), "b": (("y", "x"), array2)}, + coords={"x": x, "y": y, "z": ("x", z)}, + ) + ds2 = xr.Dataset( + data_vars={ + "a": (("y", "x"), np.ones_like(array1) * data_unit), + "b": (("y", "x"), np.ones_like(array2) * data_unit), + }, + coords={ + "x": np.arange(3) * dim_unit, + "y": np.arange(2, 4) * dim_unit, + "z": ("x", np.arange(-3, 0) * coord_unit), + }, + ) + ds3 = xr.Dataset( + data_vars={ + "a": (("y", "x"), np.zeros_like(array1) * np.nan * data_unit), + "b": (("y", "x"), np.zeros_like(array2) * np.nan * data_unit), + }, + coords={ + "x": np.arange(3, 6) * dim_unit, + "y": np.arange(4, 6) * dim_unit, + "z": ("x", np.arange(3, 6) * coord_unit), + }, + ) + + func = function(xr.merge) + if error is not None: + with pytest.raises(error): + func([ds1, ds2, ds3]) + + return + + units = extract_units(ds1) + convert_and_strip = lambda ds: strip_units(convert_units(ds, units)) + expected = attach_units( + func([strip_units(ds1), convert_and_strip(ds2), convert_and_strip(ds3)]), units + ) + result = func([ds1, ds2, ds3]) + + assert_equal_with_units(expected, result) + + @pytest.mark.parametrize("func", (xr.zeros_like, xr.ones_like)) -def test_replication(func, dtype): +def test_replication_dataarray(func, dtype): array = np.linspace(0, 10, 20).astype(dtype) * unit_registry.s data_array = xr.DataArray(data=array, dims="x") @@ -330,8 +1002,33 @@ def test_replication(func, dtype): assert_equal_with_units(expected, result) +@pytest.mark.parametrize("func", (xr.zeros_like, xr.ones_like)) +def test_replication_dataset(func, dtype): + array1 = np.linspace(0, 10, 20).astype(dtype) * 
unit_registry.s + array2 = np.linspace(5, 10, 10).astype(dtype) * unit_registry.Pa + x = np.arange(20).astype(dtype) * unit_registry.m + y = np.arange(10).astype(dtype) * unit_registry.m + z = y.to(unit_registry.mm) + + ds = xr.Dataset( + data_vars={"a": ("x", array1), "b": ("y", array2)}, + coords={"x": x, "y": y, "z": ("y", z)}, + ) + + numpy_func = getattr(np, func.__name__) + expected = ds.copy( + data={name: numpy_func(array.data) for name, array in ds.data_vars.items()} + ) + result = func(ds) + + assert_equal_with_units(expected, result) + + @pytest.mark.xfail( - reason="np.full_like on Variable strips the unit and pint does not allow mixed args" + reason=( + "pint is undecided on how `full_like` should work, so incorrect errors " + "may be expected: hgrecco/pint#882" + ) ) @pytest.mark.parametrize( "unit,error", @@ -344,8 +1041,9 @@ def test_replication(func, dtype): pytest.param(unit_registry.ms, None, id="compatible_unit"), pytest.param(unit_registry.s, None, id="identical_unit"), ), + ids=repr, ) -def test_replication_full_like(unit, error, dtype): +def test_replication_full_like_dataarray(unit, error, dtype): array = np.linspace(0, 5, 10) * unit_registry.s data_array = xr.DataArray(data=array, dims="x") @@ -360,6 +1058,163 @@ def test_replication_full_like(unit, error, dtype): assert_equal_with_units(expected, result) +@pytest.mark.xfail( + reason=( + "pint is undecided on how `full_like` should work, so incorrect errors " + "may be expected: hgrecco/pint#882" + ) +) +@pytest.mark.parametrize( + "unit,error", + ( + pytest.param(1, DimensionalityError, id="no_unit"), + pytest.param( + unit_registry.dimensionless, DimensionalityError, id="dimensionless" + ), + pytest.param(unit_registry.m, DimensionalityError, id="incompatible_unit"), + pytest.param(unit_registry.ms, None, id="compatible_unit"), + pytest.param(unit_registry.s, None, id="identical_unit"), + ), + ids=repr, +) +def test_replication_full_like_dataset(unit, error, dtype): + array1 = np.linspace(0, 10, 20).astype(dtype) * unit_registry.s + array2 = np.linspace(5, 10, 10).astype(dtype) * unit_registry.Pa + x = np.arange(20).astype(dtype) * unit_registry.m + y = np.arange(10).astype(dtype) * unit_registry.m + z = y.to(unit_registry.mm) + + ds = xr.Dataset( + data_vars={"a": ("x", array1), "b": ("y", array2)}, + coords={"x": x, "y": y, "z": ("y", z)}, + ) + + fill_value = -1 * unit + if error is not None: + with pytest.raises(error): + xr.full_like(ds, fill_value=fill_value) + + return + + expected = ds.copy( + data={ + name: np.full_like(array, fill_value=fill_value) + for name, array in ds.data_vars.items() + } + ) + result = xr.full_like(ds, fill_value=fill_value) + + assert_equal_with_units(expected, result) + + +@pytest.mark.xfail(reason="`where` strips units") +@pytest.mark.parametrize( + "unit,error", + ( + pytest.param(1, DimensionalityError, id="no_unit"), + pytest.param( + unit_registry.dimensionless, DimensionalityError, id="dimensionless" + ), + pytest.param(unit_registry.s, DimensionalityError, id="incompatible_unit"), + pytest.param(unit_registry.mm, None, id="compatible_unit"), + pytest.param(unit_registry.m, None, id="identical_unit"), + ), + ids=repr, +) +@pytest.mark.parametrize("fill_value", (np.nan, 10.2)) +def test_where_dataarray(fill_value, unit, error, dtype): + array = np.linspace(0, 5, 10).astype(dtype) * unit_registry.m + + x = xr.DataArray(data=array, dims="x") + cond = x < 5 * unit_registry.m + # FIXME: this should work without wrapping in array() + fill_value = np.array(fill_value) * unit + + 
if error is not None: + with pytest.raises(error): + xr.where(cond, x, fill_value) + + return + + fill_value_ = ( + fill_value.to(unit_registry.m) + if isinstance(fill_value, unit_registry.Quantity) + and fill_value.check(unit_registry.m) + else fill_value + ) + expected = attach_units( + xr.where(cond, strip_units(x), strip_units(fill_value_)), extract_units(x) + ) + result = xr.where(cond, x, fill_value) + + assert_equal_with_units(expected, result) + + +@pytest.mark.xfail(reason="`where` strips units") +@pytest.mark.parametrize( + "unit,error", + ( + pytest.param(1, DimensionalityError, id="no_unit"), + pytest.param( + unit_registry.dimensionless, DimensionalityError, id="dimensionless" + ), + pytest.param(unit_registry.s, DimensionalityError, id="incompatible_unit"), + pytest.param(unit_registry.mm, None, id="compatible_unit"), + pytest.param(unit_registry.m, None, id="identical_unit"), + ), + ids=repr, +) +@pytest.mark.parametrize("fill_value", (np.nan, 10.2)) +def test_where_dataset(fill_value, unit, error, dtype): + array1 = np.linspace(0, 5, 10).astype(dtype) * unit_registry.m + array2 = np.linspace(-5, 0, 10).astype(dtype) * unit_registry.m + x = np.arange(10) * unit_registry.s + + ds = xr.Dataset(data_vars={"a": ("x", array1), "b": ("x", array2)}, coords={"x": x}) + cond = ds.x < 5 * unit_registry.s + # FIXME: this should work without wrapping in array() + fill_value = np.array(fill_value) * unit + + if error is not None: + with pytest.raises(error): + xr.where(cond, ds, fill_value) + + return + + fill_value_ = ( + fill_value.to(unit_registry.m) + if isinstance(fill_value, unit_registry.Quantity) + and fill_value.check(unit_registry.m) + else fill_value + ) + expected = attach_units( + xr.where(cond, strip_units(ds), strip_units(fill_value_)), extract_units(ds) + ) + result = xr.where(cond, ds, fill_value) + + assert_equal_with_units(expected, result) + + +@pytest.mark.xfail(reason="pint does not implement `np.einsum`") +def test_dot_dataarray(dtype): + array1 = ( + np.linspace(0, 10, 5 * 10).reshape(5, 10).astype(dtype) + * unit_registry.m + / unit_registry.s + ) + array2 = ( + np.linspace(10, 20, 10 * 20).reshape(10, 20).astype(dtype) * unit_registry.s + ) + + arr1 = xr.DataArray(data=array1, dims=("x", "y")) + arr2 = xr.DataArray(data=array2, dims=("y", "z")) + + expected = array1.dot(array2) + result = xr.dot(arr1, arr2) + + assert_equal_with_units(expected, result) + + class TestDataArray: @pytest.mark.filterwarnings("error:::pint[.*]") @pytest.mark.parametrize( From 8b240376fd91352a80b068af606850e8d57d1090 Mon Sep 17 00:00:00 2001 From: Maximilian Roos <5635139+max-sixty@users.noreply.github.com> Date: Wed, 13 Nov 2019 22:56:59 -0500 Subject: [PATCH 15/22] add Variable._replace (#3528) * add Variable._replace * assertions * whatsnew --- doc/whats-new.rst | 3 +++ xarray/core/variable.py | 19 +++++++++++++++++-- xarray/tests/test_variable.py | 9 +++++++++ 3 files changed, 29 insertions(+), 2 deletions(-) diff --git a/doc/whats-new.rst b/doc/whats-new.rst index a7687368884..b8fb1f8f58e 100644 --- a/doc/whats-new.rst +++ b/doc/whats-new.rst @@ -138,6 +138,9 @@ Internal Changes - Enable type checking on default sentinel values (:pull:`3472`) By `Maximilian Roos `_ +- Add :py:meth:`Variable._replace` for simpler replacing of a subset of attributes (:pull:`3472`) + By `Maximilian Roos `_ +
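  A minimal sketch of how ``_replace`` is used (a private helper; the values here are illustrative)::

      import numpy as np
      from xarray import Variable

      var = Variable(("x",), np.arange(3), attrs={"foo": "bar"})
      # swap in new data; dims, attrs and encoding carry over unchanged
      replaced = var._replace(data=np.zeros(3))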
.. _whats-new.0.14.0: v0.14.0 (14 Oct 2019) diff --git a/xarray/core/variable.py b/xarray/core/variable.py index cf97c997017..e630dc4b457 100644 --- a/xarray/core/variable.py +++ b/xarray/core/variable.py @@ -1,3 +1,4 @@ +import copy import functools import itertools import warnings @@ -24,10 +25,11 @@ from .pycompat import dask_array_type, integer_types from .utils import ( OrderedSet, + _default, decode_numpy_dict_values, either_dict_or_kwargs, - infix_dims, ensure_us_time_resolution, + infix_dims, ) try: @@ -887,7 +889,20 @@ def copy(self, deep=True, data=None): # note: # dims is already an immutable tuple # attributes and encoding will be copied when the new Array is created - return type(self)(self.dims, data, self._attrs, self._encoding, fastpath=True) + return self._replace(data=data) + + def _replace( + self, dims=_default, data=_default, attrs=_default, encoding=_default + ) -> "Variable": + if dims is _default: + dims = copy.copy(self._dims) + if data is _default: + data = copy.copy(self.data) + if attrs is _default: + attrs = copy.copy(self._attrs) + if encoding is _default: + encoding = copy.copy(self._encoding) + return type(self)(dims, data, attrs, encoding, fastpath=True) def __copy__(self): return self.copy(deep=False) diff --git a/xarray/tests/test_variable.py b/xarray/tests/test_variable.py index d394919dbdd..d92a68729b5 100644 --- a/xarray/tests/test_variable.py +++ b/xarray/tests/test_variable.py @@ -542,6 +542,15 @@ def test_copy_index_with_data_errors(self): with raises_regex(ValueError, "must match shape of object"): orig.copy(data=new_data) + def test_replace(self): + var = Variable(("x", "y"), [[1.5, 2.0], [3.1, 4.3]], {"foo": "bar"}) + result = var._replace() + assert_identical(result, var) + + new_data = np.arange(4).reshape(2, 2) + result = var._replace(data=new_data) + assert_array_equal(result.data, new_data) + def test_real_and_imag(self): v = self.cls("x", np.arange(3) - 1j * np.arange(3), {"foo": "bar"}) expected_re = self.cls("x", np.arange(3), {"foo": "bar"}) From c0ef2f616e87e9f924425bcd373ac265f14203cb Mon Sep 17 00:00:00 2001 From: Keisuke Fujii Date: Thu, 14 Nov 2019 20:56:17 +0900 Subject: [PATCH 16/22] Fix set_index when an existing dimension becomes a level (#3520) * Added a test * Fix set_index * lint * black / mypy * Use _replace method * whats new --- doc/whats-new.rst | 2 ++ xarray/core/dataarray.py | 10 +++++----- xarray/core/dataset.py | 12 ++++++++++-- xarray/tests/test_dataarray.py | 10 ++++++++++ 4 files changed, 27 insertions(+), 7 deletions(-) diff --git a/doc/whats-new.rst b/doc/whats-new.rst index b8fb1f8f58e..abd94779435 100644 --- a/doc/whats-new.rst +++ b/doc/whats-new.rst @@ -79,6 +79,8 @@ New Features Bug fixes ~~~~~~~~~ +- Fix a bug in `set_index` in case that an existing dimension becomes a level variable of MultiIndex. (:pull:`3520`) + By `Keisuke Fujii `_. - Harmonize `_FillValue`, `missing_value` during encoding and decoding steps. (:pull:`3502`) By `Anderson Banihirwe `_.
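  For example, a variable whose encoding carries conflicting values for the two attributes now fails loudly at encode time (an illustrative sketch; the conflicting values are invented)::

      import numpy as np
      import xarray as xr
      from xarray.coding import variables

      var = xr.Variable(
          ("x",),
          [0.0, -1.0, 1.0],
          encoding={"_FillValue": np.float32(1e20), "missing_value": np.float64(9e19)},
      )
      variables.CFMaskCoder().encode(var)  # raises ValueError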
- Fix regression introduced in v0.14.0 that would cause a crash if dask is installed diff --git a/xarray/core/dataarray.py b/xarray/core/dataarray.py index a192fe08cee..55e73478260 100644 --- a/xarray/core/dataarray.py +++ b/xarray/core/dataarray.py @@ -48,7 +48,7 @@ assert_coordinate_consistent, remap_label_indexers, ) -from .dataset import Dataset, merge_indexes, split_indexes +from .dataset import Dataset, split_indexes from .formatting import format_item from .indexes import Indexes, default_indexes from .merge import PANDAS_TYPES @@ -1601,10 +1601,10 @@ def set_index( -------- DataArray.reset_index """ - _check_inplace(inplace) - indexes = either_dict_or_kwargs(indexes, indexes_kwargs, "set_index") - coords, _ = merge_indexes(indexes, self._coords, set(), append=append) - return self._replace(coords=coords) + ds = self._to_temp_dataset().set_index( + indexes, append=append, inplace=inplace, **indexes_kwargs + ) + return self._from_temp_dataset(ds) def reset_index( self, diff --git a/xarray/core/dataset.py b/xarray/core/dataset.py index 15a7209ab24..de713b830f2 100644 --- a/xarray/core/dataset.py +++ b/xarray/core/dataset.py @@ -204,6 +204,7 @@ def merge_indexes( """ vars_to_replace: Dict[Hashable, Variable] = {} vars_to_remove: List[Hashable] = [] + dims_to_replace: Dict[Hashable, Hashable] = {} error_msg = "{} is not the name of an existing variable." for dim, var_names in indexes.items(): @@ -244,7 +245,7 @@ def merge_indexes( if not len(names) and len(var_names) == 1: idx = pd.Index(variables[var_names[0]].values) - else: + else: # MultiIndex for n in var_names: try: var = variables[n] @@ -256,15 +257,22 @@ def merge_indexes( levels.append(cat.categories) idx = pd.MultiIndex(levels, codes, names=names) + for n in names: + dims_to_replace[n] = dim vars_to_replace[dim] = IndexVariable(dim, idx) vars_to_remove.extend(var_names) new_variables = {k: v for k, v in variables.items() if k not in vars_to_remove} new_variables.update(vars_to_replace) + + # update dimensions if necessary GH: 3512 + for k, v in new_variables.items(): + if any(d in dims_to_replace for d in v.dims): + new_dims = [dims_to_replace.get(d, d) for d in v.dims] + new_variables[k] = v._replace(dims=new_dims) new_coord_names = coord_names | set(vars_to_replace) new_coord_names -= set(vars_to_remove) - return new_variables, new_coord_names diff --git a/xarray/tests/test_dataarray.py b/xarray/tests/test_dataarray.py index 7c6dc1825a1..4c3553c867e 100644 --- a/xarray/tests/test_dataarray.py +++ b/xarray/tests/test_dataarray.py @@ -1182,6 +1182,16 @@ def test_selection_multiindex_remove_unused(self): expected = expected.set_index(xy=["x", "y"]).unstack() assert_identical(expected, actual) + def test_selection_multiindex_from_level(self): + # GH: 3512 + da = DataArray([0, 1], dims=["x"], coords={"x": [0, 1], "y": "a"}) + db = DataArray([2, 3], dims=["x"], coords={"x": [0, 1], "y": "b"}) + data = xr.concat([da, db], dim="x").set_index(xy=["x", "y"]) + assert data.dims == ("xy",) + actual = data.sel(y="a") + expected = data.isel(xy=[0, 1]).unstack("xy").squeeze("y").drop("y") + assert_equal(actual, expected) + def test_virtual_default_coords(self): array = DataArray(np.zeros((5,)), dims="x") expected = DataArray(range(5), dims="x", name="x") From 7b4a286f59bc7d60d4e4d03be65562ff63f9b111 Mon Sep 17 00:00:00 2001 From: Maximilian Roos <5635139+max-sixty@users.noreply.github.com> Date: Thu, 14 Nov 2019 11:56:49 -0500 Subject: [PATCH 17/22] units & deprecation merge (#3530) --- xarray/tests/test_units.py | 6 +++--- 1 file changed, 
3 insertions(+), 3 deletions(-) diff --git a/xarray/tests/test_units.py b/xarray/tests/test_units.py index 509a50d23ff..0be6f8af464 100644 --- a/xarray/tests/test_units.py +++ b/xarray/tests/test_units.py @@ -1969,7 +1969,7 @@ def test_broadcast_equals(self, unit, dtype): dim={"z": np.linspace(10, 20, 12) * unit_registry.s}, axis=1, ), - method("drop", labels="x"), + method("drop_sel", labels="x"), method("reset_coords", names="x2"), method("copy"), pytest.param( @@ -4045,7 +4045,7 @@ def test_reindex_like(self, unit, error, dtype): marks=pytest.mark.xfail(reason="strips units"), ), pytest.param( - method("apply", np.fabs), + method("map", np.fabs), marks=pytest.mark.xfail(reason="fabs strips units"), ), ), @@ -4220,7 +4220,7 @@ def test_grouped_operations(self, func, dtype): method("rename_dims", x="offset_x"), method("swap_dims", {"x": "x2"}), method("expand_dims", v=np.linspace(10, 20, 12) * unit_registry.s, axis=1), - method("drop", labels="x"), + method("drop_sel", labels="x"), method("drop_dims", "z"), method("set_coords", names="c"), method("reset_coords", names="x2"), From ee9da17ef04035cf318b6f1a4bb413f3d10ae614 Mon Sep 17 00:00:00 2001 From: Deepak Cherian Date: Fri, 15 Nov 2019 14:53:16 +0000 Subject: [PATCH 18/22] interpolate_na: Add max_gap support. (#3302) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * interpolate_na: Add maxgap support. * Add docs. * Add requires_bottleneck to test. * Review comments. * Update xarray/core/dataarray.py Co-Authored-By: Maximilian Roos <5635139+max-sixty@users.noreply.github.com> * Update xarray/core/dataset.py Co-Authored-By: Maximilian Roos <5635139+max-sixty@users.noreply.github.com> * maxgap → max_gap * update whats-new * update computation.rst * Better support uniformly spaced coordinates. Split legnths, interp test * Raise error for max_gap and irregularly spaced coordinates + test * rework. * Use pandas checks for index duplication and monotonicity. * Progress + add datetime. * nicer error message * A few fstrings. * finish up timedelta max_gap. * fix whats-new * small fixes. * fix dan's test. * remove redundant test. * nicer error message. * Add xfailed cftime tests * better error checking and tests. * typing. * update docstrings * scipy intersphinx * fix tests * add bottleneck testing decorator. --- doc/computation.rst | 3 + doc/conf.py | 11 +-- doc/whats-new.rst | 4 ++ xarray/core/dataarray.py | 58 +++++++++++----- xarray/core/dataset.py | 60 +++++++++++----- xarray/core/missing.py | 110 +++++++++++++++++++++++++---- xarray/tests/test_missing.py | 130 ++++++++++++++++++++++++++++++++++- 7 files changed, 322 insertions(+), 54 deletions(-) diff --git a/doc/computation.rst b/doc/computation.rst index 663c546be20..240a1e5704b 100644 --- a/doc/computation.rst +++ b/doc/computation.rst @@ -95,6 +95,9 @@ for filling missing values via 1D interpolation. Note that xarray slightly diverges from the pandas ``interpolate`` syntax by providing the ``use_coordinate`` keyword which facilitates a clear specification of which values to use as the index in the interpolation. +xarray also provides the ``max_gap`` keyword argument to limit the interpolation to +data gaps of length ``max_gap`` or smaller. See :py:meth:`~xarray.DataArray.interpolate_na` +for more. Aggregation =========== diff --git a/doc/conf.py b/doc/conf.py index 7c1557a1e66..0e04f8ccde8 100644 --- a/doc/conf.py +++ b/doc/conf.py @@ -340,9 +340,10 @@ # Example configuration for intersphinx: refer to the Python standard library. 
intersphinx_mapping = { "python": ("https://docs.python.org/3/", None), - "pandas": ("https://pandas.pydata.org/pandas-docs/stable/", None), - "iris": ("http://scitools.org.uk/iris/docs/latest/", None), - "numpy": ("https://docs.scipy.org/doc/numpy/", None), - "numba": ("https://numba.pydata.org/numba-doc/latest/", None), - "matplotlib": ("https://matplotlib.org/", None), + "pandas": ("https://pandas.pydata.org/pandas-docs/stable", None), + "iris": ("https://scitools.org.uk/iris/docs/latest", None), + "numpy": ("https://docs.scipy.org/doc/numpy", None), + "scipy": ("https://docs.scipy.org/doc/scipy/reference", None), + "numba": ("https://numba.pydata.org/numba-doc/latest", None), + "matplotlib": ("https://matplotlib.org", None), } diff --git a/doc/whats-new.rst b/doc/whats-new.rst index abd94779435..053f785bc05 100644 --- a/doc/whats-new.rst +++ b/doc/whats-new.rst @@ -38,6 +38,10 @@ Breaking changes New Features ~~~~~~~~~~~~ + +- Added the ``max_gap`` kwarg to :py:meth:`~xarray.DataArray.interpolate_na` and + :py:meth:`~xarray.Dataset.interpolate_na`. This controls the maximum size of the data + gap that will be filled by interpolation. By `Deepak Cherian `_. - :py:meth:`Dataset.drop_sel` & :py:meth:`DataArray.drop_sel` have been added for dropping labels. :py:meth:`Dataset.drop_vars` & :py:meth:`DataArray.drop_vars` have been added for dropping variables (including coordinates). The existing ``drop`` methods remain as a backward compatible diff --git a/xarray/core/dataarray.py b/xarray/core/dataarray.py index 55e73478260..7ce775b49cd 100644 --- a/xarray/core/dataarray.py +++ b/xarray/core/dataarray.py @@ -2018,44 +2018,69 @@ def fillna(self, value: Any) -> "DataArray": def interpolate_na( self, - dim=None, + dim: Hashable = None, method: str = "linear", limit: int = None, use_coordinate: Union[bool, str] = True, + max_gap: Union[int, float, str, pd.Timedelta, np.timedelta64] = None, **kwargs: Any, ) -> "DataArray": - """Interpolate values according to different methods. + """Fill in NaNs by interpolating according to different methods. Parameters ---------- dim : str Specifies the dimension along which to interpolate. - method : {'linear', 'nearest', 'zero', 'slinear', 'quadratic', 'cubic', - 'polynomial', 'barycentric', 'krog', 'pchip', - 'spline', 'akima'}, optional + method : str, optional String indicating which method to use for interpolation: - 'linear': linear interpolation (Default). Additional keyword - arguments are passed to ``numpy.interp`` - - 'nearest', 'zero', 'slinear', 'quadratic', 'cubic', - 'polynomial': are passed to ``scipy.interpolate.interp1d``. If - method=='polynomial', the ``order`` keyword argument must also be + arguments are passed to :py:func:`numpy.interp` + - 'nearest', 'zero', 'slinear', 'quadratic', 'cubic', 'polynomial': + are passed to :py:func:`scipy.interpolate.interp1d`. If + ``method='polynomial'``, the ``order`` keyword argument must also be provided. - - 'barycentric', 'krog', 'pchip', 'spline', and `akima`: use their - respective``scipy.interpolate`` classes. - use_coordinate : boolean or str, default True + - 'barycentric', 'krog', 'pchip', 'spline', 'akima': use their + respective :py:class:`scipy.interpolate` classes. + use_coordinate : bool, str, default True Specifies which index to use as the x values in the interpolation formulated as `y = f(x)`. If False, values are treated as if - eqaully-spaced along `dim`. If True, the IndexVariable `dim` is - used. 
If use_coordinate is a string, it specifies the name of a + equally-spaced along ``dim``. If True, the IndexVariable `dim` is + used. If ``use_coordinate`` is a string, it specifies the name of a + coordinate variable to use as the index. limit : int, default None Maximum number of consecutive NaNs to fill. Must be greater than 0 - or None for no limit. + or None for no limit. This filling is done regardless of the size of + the gap in the data. To only interpolate over gaps less than a given length, + see ``max_gap``. + max_gap: int, float, str, pandas.Timedelta, numpy.timedelta64, default None. + Maximum size of gap, a continuous sequence of NaNs, that will be filled. + Use None for no limit. When interpolating along a datetime64 dimension + and ``use_coordinate=True``, ``max_gap`` can be one of the following: + + - a string that is valid input for pandas.to_timedelta + - a :py:class:`numpy.timedelta64` object + - a :py:class:`pandas.Timedelta` object + Otherwise, ``max_gap`` must be an int or a float. Use of ``max_gap`` with unlabeled + dimensions has not been implemented yet. Gap length is defined as the difference + between coordinate values at the first data point after a gap and the last value + before a gap. For gaps at the beginning (end), gap length is defined as the difference + between coordinate values at the first (last) valid data point and the first (last) NaN. + For example, consider:: + + <xarray.DataArray (x: 9)> + array([nan, nan, nan, 1., nan, nan, 4., nan, nan]) Coordinates: * x (x) int64 0 1 2 3 4 5 6 7 8 + + The gap lengths are 3-0 = 3; 6-3 = 3; and 8-6 = 2 respectively + kwargs : dict, optional + parameters passed verbatim to the underlying interpolation function Returns ------- - DataArray + interpolated: DataArray + Filled in DataArray. See also -------- @@ -2070,6 +2095,7 @@ def interpolate_na( method=method, limit=limit, use_coordinate=use_coordinate, + max_gap=max_gap, **kwargs, ) diff --git a/xarray/core/dataset.py b/xarray/core/dataset.py index de713b830f2..913842c4eba 100644 --- a/xarray/core/dataset.py +++ b/xarray/core/dataset.py @@ -3908,42 +3908,65 @@ def interpolate_na( method: str = "linear", limit: int = None, use_coordinate: Union[bool, Hashable] = True, + max_gap: Union[int, float, str, pd.Timedelta, np.timedelta64] = None, **kwargs: Any, ) -> "Dataset": - """Interpolate values according to different methods. + """Fill in NaNs by interpolating according to different methods. Parameters ---------- - dim : Hashable + dim : str Specifies the dimension along which to interpolate. - method : {'linear', 'nearest', 'zero', 'slinear', 'quadratic', 'cubic', - 'polynomial', 'barycentric', 'krog', 'pchip', - 'spline'}, optional + method : str, optional String indicating which method to use for interpolation: - 'linear': linear interpolation (Default). Additional keyword - arguments are passed to ``numpy.interp`` - - 'nearest', 'zero', 'slinear', 'quadratic', 'cubic', - 'polynomial': are passed to ``scipy.interpolate.interp1d``. If - method=='polynomial', the ``order`` keyword argument must also be + arguments are passed to :py:func:`numpy.interp` + - 'nearest', 'zero', 'slinear', 'quadratic', 'cubic', 'polynomial': + are passed to :py:func:`scipy.interpolate.interp1d`. If + ``method='polynomial'``, the ``order`` keyword argument must also be provided. - - 'barycentric', 'krog', 'pchip', 'spline': use their respective - ``scipy.interpolate`` classes.
- use_coordinate : boolean or str, default True + - 'barycentric', 'krog', 'pchip', 'spline', 'akima': use their + respective :py:class:`scipy.interpolate` classes. + use_coordinate : bool, str, default True Specifies which index to use as the x values in the interpolation formulated as `y = f(x)`. If False, values are treated as if - eqaully-spaced along `dim`. If True, the IndexVariable `dim` is - used. If use_coordinate is a string, it specifies the name of a + equally-spaced along ``dim``. If True, the IndexVariable `dim` is + used. If ``use_coordinate`` is a string, it specifies the name of a coordinate variable to use as the index. limit : int, default None Maximum number of consecutive NaNs to fill. Must be greater than 0 - or None for no limit. - kwargs : any - parameters passed verbatim to the underlying interplation function + or None for no limit. This filling is done regardless of the size of + the gap in the data. To only interpolate over gaps less than a given length, + see ``max_gap``. + max_gap: int, float, str, pandas.Timedelta, numpy.timedelta64, default None. + Maximum size of gap, a continuous sequence of NaNs, that will be filled. + Use None for no limit. When interpolating along a datetime64 dimension + and ``use_coordinate=True``, ``max_gap`` can be one of the following: + + - a string that is valid input for pandas.to_timedelta + - a :py:class:`numpy.timedelta64` object + - a :py:class:`pandas.Timedelta` object + Otherwise, ``max_gap`` must be an int or a float. Use of ``max_gap`` with unlabeled + dimensions has not been implemented yet. Gap length is defined as the difference + between coordinate values at the first data point after a gap and the last value + before a gap. For gaps at the beginning (end), gap length is defined as the difference + between coordinate values at the first (last) valid data point and the first (last) NaN. + For example, consider:: + + <xarray.DataArray (x: 9)> + array([nan, nan, nan, 1., nan, nan, 4., nan, nan]) Coordinates: * x (x) int64 0 1 2 3 4 5 6 7 8 + + The gap lengths are 3-0 = 3; 6-3 = 3; and 8-6 = 2 respectively + kwargs : dict, optional + parameters passed verbatim to the underlying interpolation function Returns ------- - Dataset + interpolated: Dataset + Filled in Dataset. See also -------- @@ -3959,6 +3982,7 @@ def interpolate_na( method=method, limit=limit, use_coordinate=use_coordinate, + max_gap=max_gap, **kwargs, ) return new diff --git a/xarray/core/missing.py b/xarray/core/missing.py index 77dde66484e..117fcaf8f81 100644 --- a/xarray/core/missing.py +++ b/xarray/core/missing.py @@ -1,18 +1,46 @@ import warnings from functools import partial -from typing import Any, Callable, Dict, Sequence +from numbers import Number +from typing import Any, Callable, Dict, Hashable, Sequence, Union import numpy as np import pandas as pd from . import utils -from .common import _contains_datetime_like_objects +from .common import _contains_datetime_like_objects, ones_like from .computation import apply_ufunc from .duck_array_ops import dask_array_type from .utils import OrderedSet, is_scalar from .variable import Variable, broadcast_variables +def _get_nan_block_lengths(obj, dim: Hashable, index: Variable): + """ + Return an object where each NaN element in 'obj' is replaced by the + length of the gap the element is in.
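+
+    For example, with a unit-spaced index (an illustrative sketch; these values
+    mirror the test suite)::
+
+        # index:    0    1    2   3    4    5   6    7    8
+        # obj:     nan  nan  nan  1.  nan  nan  4.  nan  nan
+        # result:   3    3    3   0    3    3   0    2    2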
+ """ + + # make variable so that we get broadcasting for free + index = Variable([dim], index) + + # algorithm from https://github.com/pydata/xarray/pull/3302#discussion_r324707072 + arange = ones_like(obj) * index + valid = obj.notnull() + valid_arange = arange.where(valid) + cumulative_nans = valid_arange.ffill(dim=dim).fillna(index[0]) + + nan_block_lengths = ( + cumulative_nans.diff(dim=dim, label="upper") + .reindex({dim: obj[dim]}) + .where(valid) + .bfill(dim=dim) + .where(~valid, 0) + .fillna(index[-1] - valid_arange.max()) + ) + + return nan_block_lengths + + class BaseInterpolator: """Generic interpolator class for normalizing interpolation methods """ @@ -178,7 +206,7 @@ def _apply_over_vars_with_dim(func, self, dim=None, **kwargs): return ds -def get_clean_interp_index(arr, dim, use_coordinate=True): +def get_clean_interp_index(arr, dim: Hashable, use_coordinate: Union[str, bool] = True): """get index to use for x values in interpolation. If use_coordinate is True, the coordinate that shares the name of the @@ -195,23 +223,33 @@ def get_clean_interp_index(arr, dim, use_coordinate=True): index = arr.coords[use_coordinate] if index.ndim != 1: raise ValueError( - "Coordinates used for interpolation must be 1D, " - "%s is %dD." % (use_coordinate, index.ndim) + f"Coordinates used for interpolation must be 1D, " + f"{use_coordinate} is {index.ndim}D." ) + index = index.to_index() + + # TODO: index.name is None for multiindexes + # set name for nice error messages below + if isinstance(index, pd.MultiIndex): + index.name = dim + + if not index.is_monotonic: + raise ValueError(f"Index {index.name!r} must be monotonically increasing") + + if not index.is_unique: + raise ValueError(f"Index {index.name!r} has duplicate values") # raise if index cannot be cast to a float (e.g. MultiIndex) try: index = index.values.astype(np.float64) except (TypeError, ValueError): # pandas raises a TypeError - # xarray/nuppy raise a ValueError + # xarray/numpy raise a ValueError raise TypeError( - "Index must be castable to float64 to support" - "interpolation, got: %s" % type(index) + f"Index {index.name!r} must be castable to float64 to support " + f"interpolation, got {type(index).__name__}." ) - # check index sorting now so we can skip it later - if not (np.diff(index) > 0).all(): - raise ValueError("Index must be monotonicly increasing") + else: axis = arr.get_axis_num(dim) index = np.arange(arr.shape[axis], dtype=np.float64) @@ -220,7 +258,13 @@ def get_clean_interp_index(arr, dim, use_coordinate=True): def interp_na( - self, dim=None, use_coordinate=True, method="linear", limit=None, **kwargs + self, + dim: Hashable = None, + use_coordinate: Union[bool, str] = True, + method: str = "linear", + limit: int = None, + max_gap: Union[int, float, str, pd.Timedelta, np.timedelta64] = None, + **kwargs, ): """Interpolate values according to different methods. """ @@ -230,6 +274,40 @@ def interp_na( if limit is not None: valids = _get_valid_fill_mask(self, dim, limit) + if max_gap is not None: + max_type = type(max_gap).__name__ + if not is_scalar(max_gap): + raise ValueError("max_gap must be a scalar.") + + if ( + dim in self.indexes + and isinstance(self.indexes[dim], pd.DatetimeIndex) + and use_coordinate + ): + if not isinstance(max_gap, (np.timedelta64, pd.Timedelta, str)): + raise TypeError( + f"Underlying index is DatetimeIndex. 
Expected max_gap of type str, pandas.Timedelta or numpy.timedelta64 but received {max_type}" + ) + + if isinstance(max_gap, str): + try: + max_gap = pd.to_timedelta(max_gap) + except ValueError: + raise ValueError( + f"Could not convert {max_gap!r} to timedelta64 using pandas.to_timedelta" + ) + + if isinstance(max_gap, pd.Timedelta): + max_gap = np.timedelta64(max_gap.value, "ns") + + max_gap = np.timedelta64(max_gap, "ns").astype(np.float64) + + if not use_coordinate: + if not isinstance(max_gap, (Number, np.number)): + raise TypeError( + f"Expected integer or floating point max_gap since use_coordinate=False. Received {max_type}." + ) + # method index = get_clean_interp_index(self, dim, use_coordinate=use_coordinate) interp_class, kwargs = _get_interpolator(method, **kwargs) @@ -253,6 +331,14 @@ def interp_na( if limit is not None: arr = arr.where(valids) + if max_gap is not None: + if dim not in self.coords: + raise NotImplementedError( + "max_gap not implemented for unlabeled coordinates yet." + ) + nan_block_lengths = _get_nan_block_lengths(self, dim, index) + arr = arr.where(nan_block_lengths <= max_gap) + return arr diff --git a/xarray/tests/test_missing.py b/xarray/tests/test_missing.py index cfce5d6f645..0b410383a34 100644 --- a/xarray/tests/test_missing.py +++ b/xarray/tests/test_missing.py @@ -5,7 +5,13 @@ import pytest import xarray as xr -from xarray.core.missing import NumpyInterpolator, ScipyInterpolator, SplineInterpolator +from xarray.core.missing import ( + NumpyInterpolator, + ScipyInterpolator, + SplineInterpolator, + get_clean_interp_index, + _get_nan_block_lengths, +) from xarray.core.pycompat import dask_array_type from xarray.tests import ( assert_array_equal, @@ -153,7 +159,7 @@ def test_interpolate_pd_compat_polynomial(): def test_interpolate_unsorted_index_raises(): vals = np.array([1, 2, 3], dtype=np.float64) expected = xr.DataArray(vals, dims="x", coords={"x": [2, 1, 3]}) - with raises_regex(ValueError, "Index must be monotonicly increasing"): + with raises_regex(ValueError, "Index 'x' must be monotonically increasing"): expected.interpolate_na(dim="x", method="index") @@ -169,12 +175,19 @@ def test_interpolate_invalid_interpolator_raises(): da.interpolate_na(dim="x", method="foo") +def test_interpolate_duplicate_values_raises(): + data = np.random.randn(2, 3) + da = xr.DataArray(data, coords=[("x", ["a", "a"]), ("y", [0, 1, 2])]) + with raises_regex(ValueError, "Index 'x' has duplicate values"): + da.interpolate_na(dim="x", method="foo") + + def test_interpolate_multiindex_raises(): data = np.random.randn(2, 3) data[1, 1] = np.nan da = xr.DataArray(data, coords=[("x", ["a", "b"]), ("y", [0, 1, 2])]) das = da.stack(z=("x", "y")) - with raises_regex(TypeError, "Index must be castable to float64"): + with raises_regex(TypeError, "Index 'z' must be castable to float64"): das.interpolate_na(dim="z") @@ -439,3 +452,114 @@ def test_ffill_dataset(ds): @requires_bottleneck def test_bfill_dataset(ds): ds.ffill(dim="time") + + +@requires_bottleneck +@pytest.mark.parametrize( + "y, lengths", + [ + [np.arange(9), [[3, 3, 3, 0, 3, 3, 0, 2, 2]]], + [np.arange(9) * 3, [[9, 9, 9, 0, 9, 9, 0, 6, 6]]], + [[0, 2, 5, 6, 7, 8, 10, 12, 14], [[6, 6, 6, 0, 4, 4, 0, 4, 4]]], + ], +) +def test_interpolate_na_nan_block_lengths(y, lengths): + arr = [[np.nan, np.nan, np.nan, 1, np.nan, np.nan, 4, np.nan, np.nan]] + da = xr.DataArray(arr * 2, dims=["x", "y"], coords={"x": [0, 1], "y": y}) + index = get_clean_interp_index(da, dim="y", use_coordinate=True) + actual = 
diff --git a/xarray/tests/test_missing.py b/xarray/tests/test_missing.py
index cfce5d6f645..0b410383a34 100644
--- a/xarray/tests/test_missing.py
+++ b/xarray/tests/test_missing.py
@@ -5,7 +5,13 @@
 import pytest
 
 import xarray as xr
-from xarray.core.missing import NumpyInterpolator, ScipyInterpolator, SplineInterpolator
+from xarray.core.missing import (
+    NumpyInterpolator,
+    ScipyInterpolator,
+    SplineInterpolator,
+    get_clean_interp_index,
+    _get_nan_block_lengths,
+)
 from xarray.core.pycompat import dask_array_type
 from xarray.tests import (
     assert_array_equal,
@@ -153,7 +159,7 @@ def test_interpolate_pd_compat_polynomial():
 def test_interpolate_unsorted_index_raises():
     vals = np.array([1, 2, 3], dtype=np.float64)
     expected = xr.DataArray(vals, dims="x", coords={"x": [2, 1, 3]})
-    with raises_regex(ValueError, "Index must be monotonicly increasing"):
+    with raises_regex(ValueError, "Index 'x' must be monotonically increasing"):
         expected.interpolate_na(dim="x", method="index")
 
 
@@ -169,12 +175,19 @@ def test_interpolate_invalid_interpolator_raises():
         da.interpolate_na(dim="x", method="foo")
 
 
+def test_interpolate_duplicate_values_raises():
+    data = np.random.randn(2, 3)
+    da = xr.DataArray(data, coords=[("x", ["a", "a"]), ("y", [0, 1, 2])])
+    with raises_regex(ValueError, "Index 'x' has duplicate values"):
+        da.interpolate_na(dim="x", method="foo")
+
+
 def test_interpolate_multiindex_raises():
     data = np.random.randn(2, 3)
     data[1, 1] = np.nan
     da = xr.DataArray(data, coords=[("x", ["a", "b"]), ("y", [0, 1, 2])])
     das = da.stack(z=("x", "y"))
-    with raises_regex(TypeError, "Index must be castable to float64"):
+    with raises_regex(TypeError, "Index 'z' must be castable to float64"):
         das.interpolate_na(dim="z")
 
 
@@ -439,3 +452,114 @@ def test_ffill_dataset(ds):
 @requires_bottleneck
 def test_bfill_dataset(ds):
     ds.ffill(dim="time")
+
+
+@requires_bottleneck
+@pytest.mark.parametrize(
+    "y, lengths",
+    [
+        [np.arange(9), [[3, 3, 3, 0, 3, 3, 0, 2, 2]]],
+        [np.arange(9) * 3, [[9, 9, 9, 0, 9, 9, 0, 6, 6]]],
+        [[0, 2, 5, 6, 7, 8, 10, 12, 14], [[6, 6, 6, 0, 4, 4, 0, 4, 4]]],
+    ],
+)
+def test_interpolate_na_nan_block_lengths(y, lengths):
+    arr = [[np.nan, np.nan, np.nan, 1, np.nan, np.nan, 4, np.nan, np.nan]]
+    da = xr.DataArray(arr * 2, dims=["x", "y"], coords={"x": [0, 1], "y": y})
+    index = get_clean_interp_index(da, dim="y", use_coordinate=True)
+    actual = _get_nan_block_lengths(da, dim="y", index=index)
+    expected = da.copy(data=lengths * 2)
+    assert_equal(actual, expected)
+
+
+@pytest.fixture
+def da_time():
+    return xr.DataArray(
+        [np.nan, 1, 2, np.nan, np.nan, 5, np.nan, np.nan, np.nan, np.nan, 10],
+        dims=["t"],
+    )
+
+
+def test_interpolate_na_max_gap_errors(da_time):
+    with raises_regex(
+        NotImplementedError, "max_gap not implemented for unlabeled coordinates"
+    ):
+        da_time.interpolate_na("t", max_gap=1)
+
+    with raises_regex(ValueError, "max_gap must be a scalar."):
+        da_time.interpolate_na("t", max_gap=(1,))
+
+    da_time["t"] = pd.date_range("2001-01-01", freq="H", periods=11)
+    with raises_regex(TypeError, "Underlying index is"):
+        da_time.interpolate_na("t", max_gap=1)
+
+    with raises_regex(TypeError, "Expected integer or floating point"):
+        da_time.interpolate_na("t", max_gap="1H", use_coordinate=False)
+
+    with raises_regex(ValueError, "Could not convert 'huh' to timedelta64"):
+        da_time.interpolate_na("t", max_gap="huh")
+
+
+@requires_bottleneck
+@pytest.mark.parametrize(
+    "time_range_func",
+    [pd.date_range, pytest.param(xr.cftime_range, marks=pytest.mark.xfail)],
+)
+@pytest.mark.parametrize("transform", [lambda x: x, lambda x: x.to_dataset(name="a")])
+@pytest.mark.parametrize(
+    "max_gap", ["3H", np.timedelta64(3, "h"), pd.to_timedelta("3H")]
+)
+def test_interpolate_na_max_gap_time_specifier(
+    da_time, max_gap, transform, time_range_func
+):
+    da_time["t"] = time_range_func("2001-01-01", freq="H", periods=11)
+    expected = transform(
+        da_time.copy(data=[np.nan, 1, 2, 3, 4, 5, np.nan, np.nan, np.nan, np.nan, 10])
+    )
+    actual = transform(da_time).interpolate_na("t", max_gap=max_gap)
+    assert_equal(actual, expected)
+
+
+@requires_bottleneck
+@pytest.mark.parametrize(
+    "coords",
+    [
+        pytest.param(None, marks=pytest.mark.xfail()),
+        {"x": np.arange(4), "y": np.arange(11)},
+    ],
+)
+def test_interpolate_na_2d(coords):
+    da = xr.DataArray(
+        [
+            [1, 2, 3, 4, np.nan, 6, 7, np.nan, np.nan, np.nan, 11],
+            [1, 2, 3, np.nan, np.nan, 6, 7, np.nan, np.nan, np.nan, 11],
+            [1, 2, 3, np.nan, np.nan, 6, 7, np.nan, np.nan, np.nan, 11],
+            [1, 2, 3, 4, np.nan, 6, 7, np.nan, np.nan, np.nan, 11],
+        ],
+        dims=["x", "y"],
+        coords=coords,
+    )
+
+    actual = da.interpolate_na("y", max_gap=2)
+    expected_y = da.copy(
+        data=[
+            [1, 2, 3, 4, 5, 6, 7, np.nan, np.nan, np.nan, 11],
+            [1, 2, 3, np.nan, np.nan, 6, 7, np.nan, np.nan, np.nan, 11],
+            [1, 2, 3, np.nan, np.nan, 6, 7, np.nan, np.nan, np.nan, 11],
+            [1, 2, 3, 4, 5, 6, 7, np.nan, np.nan, np.nan, 11],
+        ]
+    )
+    assert_equal(actual, expected_y)
+
+    actual = da.interpolate_na("x", max_gap=3)
+    expected_x = xr.DataArray(
+        [
+            [1, 2, 3, 4, np.nan, 6, 7, np.nan, np.nan, np.nan, 11],
+            [1, 2, 3, 4, np.nan, 6, 7, np.nan, np.nan, np.nan, 11],
+            [1, 2, 3, 4, np.nan, 6, 7, np.nan, np.nan, np.nan, 11],
+            [1, 2, 3, 4, np.nan, 6, 7, np.nan, np.nan, np.nan, 11],
+        ],
+        dims=["x", "y"],
+        coords=coords,
+    )
+    assert_equal(actual, expected_x)
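Putting the pieces of this patch together, a minimal usage sketch of the new keyword (assuming bottleneck is installed, since the gap-length computation relies on ffill/bfill; data and expected result mirror ``test_interpolate_na_max_gap_time_specifier`` above):

    import numpy as np
    import pandas as pd
    import xarray as xr

    da = xr.DataArray(
        [np.nan, 1, 2, np.nan, np.nan, 5, np.nan, np.nan, np.nan, np.nan, 10],
        dims="t",
        coords={"t": pd.date_range("2001-01-01", freq="H", periods=11)},
    )
    # gaps of up to 3 hours are interpolated; the 4-NaN gap stays untouched,
    # and the leading NaN also stays (linear interpolation does not extrapolate)
    filled = da.interpolate_na(dim="t", max_gap="3H")
    # -> [nan, 1, 2, 3, 4, 5, nan, nan, nan, nan, 10]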
From aa876cfd6b3b97ee5028d089ec741d057e3ed688 Mon Sep 17 00:00:00 2001
From: crusaderky
Date: Fri, 15 Nov 2019 17:43:53 +0000
Subject: [PATCH 19/22] Leave empty slot when not using accessors

---
 xarray/core/dataarray.py  |  5 ++---
 xarray/core/dataset.py    |  6 ++----
 xarray/core/extensions.py | 13 +++++++++----
 3 files changed, 13 insertions(+), 11 deletions(-)

diff --git a/xarray/core/dataarray.py b/xarray/core/dataarray.py
index 7ce775b49cd..b27a61d530b 100644
--- a/xarray/core/dataarray.py
+++ b/xarray/core/dataarray.py
@@ -249,14 +249,14 @@ class DataArray(AbstractArray, DataWithCoords):
         Dictionary for holding arbitrary metadata.
     """
 
-    _accessors: Optional[Dict[str, Any]]  # noqa
+    _cache: Dict[str, Any]
     _coords: Dict[Any, Variable]
     _indexes: Optional[Dict[Hashable, pd.Index]]
     _name: Optional[Hashable]
     _variable: Variable
 
     __slots__ = (
-        "_accessors",
+        "_cache",
         "_coords",
         "_file_obj",
         "_indexes",
@@ -373,7 +373,6 @@ def __init__(
         assert isinstance(coords, dict)
         self._coords = coords
         self._name = name
-        self._accessors = None
 
         # TODO(shoyer): document this argument, once it becomes part of the
         # public interface.
diff --git a/xarray/core/dataset.py b/xarray/core/dataset.py
index 913842c4eba..ea310dd164b 100644
--- a/xarray/core/dataset.py
+++ b/xarray/core/dataset.py
@@ -419,8 +419,8 @@ class Dataset(Mapping, ImplementsDatasetReduce, DataWithCoords):
     coordinates used for label based indexing.
     """
 
-    _accessors: Optional[Dict[str, Any]]
     _attrs: Optional[Dict[Hashable, Any]]
+    _cache: Dict[str, Any]
     _coord_names: Set[Hashable]
     _dims: Dict[Hashable, int]
     _encoding: Optional[Dict[Hashable, Any]]
@@ -428,8 +428,8 @@ class Dataset(Mapping, ImplementsDatasetReduce, DataWithCoords):
     _variables: Dict[Hashable, Variable]
 
     __slots__ = (
-        "_accessors",
         "_attrs",
+        "_cache",
         "_coord_names",
         "_dims",
         "_encoding",
@@ -535,7 +535,6 @@ def __init__(
             data_vars, coords, compat=compat
         )
 
-        self._accessors = None
         self._attrs = dict(attrs) if attrs is not None else None
         self._file_obj = None
         self._encoding = None
@@ -870,7 +869,6 @@ def _construct_direct(
         obj._attrs = attrs
         obj._file_obj = file_obj
         obj._encoding = encoding
-        obj._accessors = None
         return obj
 
     @classmethod
diff --git a/xarray/core/extensions.py b/xarray/core/extensions.py
index f473eaa497d..79abbccea39 100644
--- a/xarray/core/extensions.py
+++ b/xarray/core/extensions.py
@@ -20,10 +20,15 @@ def __get__(self, obj, cls):
             # we're accessing the attribute of the class, i.e., Dataset.geo
             return self._accessor
 
+        # Use the same dict as @pandas.util.cache_readonly.
+        # It must be explicitly declared in obj.__slots__.
         try:
-            return obj._accessors[self._name]
-        except TypeError:
-            obj._accessors = {}
+            cache = obj._cache
+        except AttributeError:
+            cache = obj._cache = {}
+
+        try:
+            return cache[self._name]
         except KeyError:
             pass
 
@@ -35,7 +40,7 @@ def __get__(self, obj, cls):
             # something else (GH933):
             raise RuntimeError("error initializing %r accessor." % self._name)
 
-        obj._accessors[self._name] = accessor_obj
+        cache[self._name] = accessor_obj
         return accessor_obj
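The effect of the shared ``_cache`` slot is easiest to see from user code. A sketch for illustration (the ``geo`` accessor name, its ``center`` property, and the lon/lat coordinates are made up; only ``register_dataset_accessor`` itself is xarray API): the accessor instance is built once per object and afterwards served from ``obj._cache``.

    import xarray as xr

    @xr.register_dataset_accessor("geo")
    class GeoAccessor:
        def __init__(self, ds):
            print("initializing")  # should happen once per Dataset instance
            self._ds = ds

        @property
        def center(self):
            # hypothetical helper: mean of the lon/lat coordinates
            return float(self._ds.lon.mean()), float(self._ds.lat.mean())

    ds = xr.Dataset(coords={"lon": [0.0, 10.0], "lat": [45.0, 55.0]})
    ds.geo.center  # prints "initializing" and returns (5.0, 50.0)
    ds.geo.center  # served from ds._cache, no second "initializing"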
From 68b004fe5033f4a991d152190864ee1180845806 Mon Sep 17 00:00:00 2001
From: Mathias Hauser
Date: Fri, 15 Nov 2019 20:49:29 +0100
Subject: [PATCH 20/22] ensure rename does not change index type (#3532)

* ensure rename does not change index type

* test requires cftime

* test orig.indexes[time].name is conserved

* use index.rename()

---
 doc/whats-new.rst            |  4 +++
 xarray/core/dataset.py       |  2 +-
 xarray/tests/test_dataset.py | 49 ++++++++++++++++++++++++++++++++++++
 3 files changed, 54 insertions(+), 1 deletion(-)

diff --git a/doc/whats-new.rst b/doc/whats-new.rst
index 053f785bc05..3c3bf127a3f 100644
--- a/doc/whats-new.rst
+++ b/doc/whats-new.rst
@@ -83,6 +83,10 @@ New Features
 
 Bug fixes
 ~~~~~~~~~
+- Ensure an index of type ``CFTimeIndex`` is not converted to a ``DatetimeIndex`` when
+  calling :py:meth:`Dataset.rename` (also :py:meth:`Dataset.rename_dims`
+  and :py:meth:`xr.Dataset.rename_vars`). By `Mathias Hauser `_
+  (:issue:`3522`).
 - Fix a bug in `set_index` in case that an existing dimension becomes a level
   variable of MultiIndex. (:pull:`3520`)
   By `Keisuke Fujii `_.
 - Harmonize `_FillValue`, `missing_value` during encoding and decoding steps. (:pull:`3502`)
diff --git a/xarray/core/dataset.py b/xarray/core/dataset.py
index ea310dd164b..3a83b477681 100644
--- a/xarray/core/dataset.py
+++ b/xarray/core/dataset.py
@@ -2665,7 +2665,7 @@ def _rename_indexes(self, name_dict, dims_set):
                     verify_integrity=False,
                 )
             else:
-                index = pd.Index(v, name=new_name)
+                index = v.rename(new_name)
             indexes[new_name] = index
         return indexes
 
diff --git a/xarray/tests/test_dataset.py b/xarray/tests/test_dataset.py
index 67d3b3198dc..780843f2e61 100644
--- a/xarray/tests/test_dataset.py
+++ b/xarray/tests/test_dataset.py
@@ -8,6 +8,7 @@
 import numpy as np
 import pandas as pd
 import pytest
+from pandas.core.indexes.datetimes import DatetimeIndex
 
 import xarray as xr
 from xarray import (
@@ -22,6 +23,7 @@
     open_dataset,
     set_options,
 )
+from xarray.coding.cftimeindex import CFTimeIndex
 from xarray.core import dtypes, indexing, utils
 from xarray.core.common import duck_array_ops, full_like
 from xarray.core.npcompat import IS_NEP18_ACTIVE
@@ -2458,6 +2460,53 @@ def test_rename_vars(self):
         with pytest.raises(ValueError):
             original.rename_vars(names_dict_bad)
 
+    @requires_cftime
+    def test_rename_does_not_change_CFTimeIndex_type(self):
+        # make sure CFTimeIndex is not converted to DatetimeIndex #3522
+
+        time = xr.cftime_range(start="2000", periods=6, freq="2MS", calendar="noleap")
+        orig = Dataset(coords={"time": time})
+
+        renamed = orig.rename(time="time_new")
+        assert "time_new" in renamed.indexes
+        assert isinstance(renamed.indexes["time_new"], CFTimeIndex)
+        assert renamed.indexes["time_new"].name == "time_new"
+
+        # check original has not changed
+        assert "time" in orig.indexes
+        assert isinstance(orig.indexes["time"], CFTimeIndex)
+        assert orig.indexes["time"].name == "time"
+
+        # note: rename_dims(time="time_new") drops "ds.indexes"
+        renamed = orig.rename_dims()
+        assert isinstance(renamed.indexes["time"], CFTimeIndex)
+
+        renamed = orig.rename_vars()
+        assert isinstance(renamed.indexes["time"], CFTimeIndex)
+
+    def test_rename_does_not_change_DatetimeIndex_type(self):
+        # make sure DatetimeIndex is conserved on rename
+
+        time = pd.date_range(start="2000", periods=6, freq="2MS")
+        orig = Dataset(coords={"time": time})
+
+        renamed = orig.rename(time="time_new")
+        assert "time_new" in renamed.indexes
+        assert isinstance(renamed.indexes["time_new"], DatetimeIndex)
+        assert renamed.indexes["time_new"].name == "time_new"
+
+        # check original has not changed
+        assert "time" in orig.indexes
+        assert isinstance(orig.indexes["time"], DatetimeIndex)
+        assert orig.indexes["time"].name == "time"
+
+        # note: rename_dims(time="time_new") drops "ds.indexes"
+        renamed = orig.rename_dims()
+        assert isinstance(renamed.indexes["time"], DatetimeIndex)
+
+        renamed = orig.rename_vars()
+        assert isinstance(renamed.indexes["time"], DatetimeIndex)
+
     def test_swap_dims(self):
         original = Dataset({"x": [1, 2, 3], "y": ("x", list("abc")), "z": 42})
         expected = Dataset({"z": 42}, {"x": ("y", [1, 2, 3]), "y": list("abc")})
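A sketch of the behaviour this patch fixes (requires the cftime package; it mirrors the test added above). Previously the rename rebuilt the index with ``pd.Index``, losing the ``CFTimeIndex`` subclass; with ``index.rename()`` the index class survives:

    import xarray as xr
    from xarray.coding.cftimeindex import CFTimeIndex

    times = xr.cftime_range(start="2000", periods=6, freq="2MS", calendar="noleap")
    ds = xr.Dataset(coords={"time": times})

    renamed = ds.rename(time="time_new")
    assert isinstance(renamed.indexes["time_new"], CFTimeIndex)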
From 52d48450f6291716a90f4f7e93e15847942e0da0 Mon Sep 17 00:00:00 2001
From: keewis
Date: Fri, 15 Nov 2019 20:58:01 +0100
Subject: [PATCH 21/22] Add DatasetGroupBy.quantile (#3527)

* move the implementation of DataArrayGroupBy.quantile to GroupBy

* add tests for DatasetGroupBy

* update whats-new.rst

* move the item in whats-new.rst into New Features

* don't drop scalar quantile coords

---
 doc/whats-new.rst            |   2 +
 xarray/core/groupby.py       | 107 +++++++++++++-------
 xarray/tests/test_groupby.py | 143 +++++++++++++++++++++++++++++----
 3 files changed, 184 insertions(+), 68 deletions(-)

diff --git a/doc/whats-new.rst b/doc/whats-new.rst
index 3c3bf127a3f..c835fbeff45 100644
--- a/doc/whats-new.rst
+++ b/doc/whats-new.rst
@@ -80,6 +80,8 @@ New Features
   invoked. (:issue:`3378`, :pull:`3446`, :pull:`3515`)
   By `Deepak Cherian `_ and `Guido Imperiale `_.
+- Add the documented-but-missing :py:meth:`xarray.core.groupby.DatasetGroupBy.quantile`.
+  (:issue:`3525`, :pull:`3527`). By `Justus Magin `_.
 
 Bug fixes
 ~~~~~~~~~
diff --git a/xarray/core/groupby.py b/xarray/core/groupby.py
index c73ee3cf7c5..38ecc04534a 100644
--- a/xarray/core/groupby.py
+++ b/xarray/core/groupby.py
@@ -557,6 +557,59 @@ def fillna(self, value):
         out = ops.fillna(self, value)
         return out
 
+    def quantile(self, q, dim=None, interpolation="linear", keep_attrs=None):
+        """Compute the qth quantile over each array in the groups and
+        concatenate them together into a new array.
+
+        Parameters
+        ----------
+        q : float in range of [0,1] (or sequence of floats)
+            Quantile to compute, which must be between 0 and 1
+            inclusive.
+        dim : `...`, str or sequence of str, optional
+            Dimension(s) over which to apply quantile.
+            Defaults to the grouped dimension.
+        interpolation : {'linear', 'lower', 'higher', 'midpoint', 'nearest'}
+            This optional parameter specifies the interpolation method to
+            use when the desired quantile lies between two data points
+            ``i < j``:
+                * linear: ``i + (j - i) * fraction``, where ``fraction`` is
+                  the fractional part of the index surrounded by ``i`` and
+                  ``j``.
+                * lower: ``i``.
+                * higher: ``j``.
+                * nearest: ``i`` or ``j``, whichever is nearest.
+                * midpoint: ``(i + j) / 2``.
+
+        Returns
+        -------
+        quantiles : Variable
+            If `q` is a single quantile, then the result is a
+            scalar. If multiple percentiles are given, first axis of
+            the result corresponds to the quantile. In either case a
+            quantile dimension is added to the return array. The other
+            dimensions are the dimensions that remain after the
+            reduction of the array.
+
+        See Also
+        --------
+        numpy.nanpercentile, pandas.Series.quantile, Dataset.quantile,
+        DataArray.quantile
+        """
+        if dim is None:
+            dim = self._group_dim
+
+        out = self.map(
+            self._obj.__class__.quantile,
+            shortcut=False,
+            q=q,
+            dim=dim,
+            interpolation=interpolation,
+            keep_attrs=keep_attrs,
+        )
+
+        return out
+
     def where(self, cond, other=dtypes.NA):
         """Return elements from `self` or `other` depending on `cond`.
 
@@ -737,60 +790,6 @@ def _combine(self, applied, restore_coord_dims=False, shortcut=False):
             combined = self._maybe_unstack(combined)
         return combined
 
-    def quantile(self, q, dim=None, interpolation="linear", keep_attrs=None):
-        """Compute the qth quantile over each array in the groups and
-        concatenate them together into a new array.
-
-        Parameters
-        ----------
-        q : float in range of [0,1] (or sequence of floats)
-            Quantile to compute, which must be between 0 and 1
-            inclusive.
-        dim : `...`, str or sequence of str, optional
-            Dimension(s) over which to apply quantile.
-            Defaults to the grouped dimension.
-        interpolation : {'linear', 'lower', 'higher', 'midpoint', 'nearest'}
-            This optional parameter specifies the interpolation method to
-            use when the desired quantile lies between two data points
-            ``i < j``:
-                * linear: ``i + (j - i) * fraction``, where ``fraction`` is
-                  the fractional part of the index surrounded by ``i`` and
-                  ``j``.
-                * lower: ``i``.
-                * higher: ``j``.
-                * nearest: ``i`` or ``j``, whichever is nearest.
-                * midpoint: ``(i + j) / 2``.
-
-        Returns
-        -------
-        quantiles : Variable
-            If `q` is a single quantile, then the result
-            is a scalar. If multiple percentiles are given, first axis of
-            the result corresponds to the quantile and a quantile dimension
-            is added to the return array. The other dimensions are the
-            dimensions that remain after the reduction of the array.
-
-        See Also
-        --------
-        numpy.nanpercentile, pandas.Series.quantile, Dataset.quantile,
-        DataArray.quantile
-        """
-        if dim is None:
-            dim = self._group_dim
-
-        out = self.map(
-            self._obj.__class__.quantile,
-            shortcut=False,
-            q=q,
-            dim=dim,
-            interpolation=interpolation,
-            keep_attrs=keep_attrs,
-        )
-
-        if np.asarray(q, dtype=np.float64).ndim == 0:
-            out = out.drop_vars("quantile")
-        return out
-
     def reduce(
         self, func, dim=None, axis=None, keep_attrs=None, shortcut=True, **kwargs
     ):
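With ``quantile`` hoisted to the shared ``GroupBy`` base class, the Dataset variant now exists as well; a minimal sketch, mirroring the new tests below (note the scalar ``quantile`` coordinate, which this patch no longer drops):

    import xarray as xr

    ds = xr.Dataset(
        data_vars={"a": ("x", [1, 2, 3, 4, 5, 6])}, coords={"x": [1, 1, 1, 2, 2, 2]}
    )
    medians = ds.groupby("x").quantile(0.5)
    # a == [2, 5] over x = [1, 2], plus a scalar coordinate quantile = 0.5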
diff --git a/xarray/tests/test_groupby.py b/xarray/tests/test_groupby.py
index 581affa3471..97bd31ae050 100644
--- a/xarray/tests/test_groupby.py
+++ b/xarray/tests/test_groupby.py
@@ -137,42 +137,58 @@ def test_da_groupby_empty():
 
 
 def test_da_groupby_quantile():
-    array = xr.DataArray([1, 2, 3, 4, 5, 6], [("x", [1, 1, 1, 2, 2, 2])])
+    array = xr.DataArray(
+        data=[1, 2, 3, 4, 5, 6], coords={"x": [1, 1, 1, 2, 2, 2]}, dims="x"
+    )
 
     # Scalar quantile
-    expected = xr.DataArray([2, 5], [("x", [1, 2])])
+    expected = xr.DataArray(
+        data=[2, 5], coords={"x": [1, 2], "quantile": 0.5}, dims="x"
+    )
     actual = array.groupby("x").quantile(0.5)
     assert_identical(expected, actual)
 
     # Vector quantile
-    expected = xr.DataArray([[1, 3], [4, 6]], [("x", [1, 2]), ("quantile", [0, 1])])
+    expected = xr.DataArray(
+        data=[[1, 3], [4, 6]],
+        coords={"x": [1, 2], "quantile": [0, 1]},
+        dims=("x", "quantile"),
+    )
     actual = array.groupby("x").quantile([0, 1])
     assert_identical(expected, actual)
 
     # Multiple dimensions
     array = xr.DataArray(
-        [[1, 11, 26], [2, 12, 22], [3, 13, 23], [4, 16, 24], [5, 15, 25]],
-        [("x", [1, 1, 1, 2, 2]), ("y", [0, 0, 1])],
+        data=[[1, 11, 26], [2, 12, 22], [3, 13, 23], [4, 16, 24], [5, 15, 25]],
+        coords={"x": [1, 1, 1, 2, 2], "y": [0, 0, 1]},
+        dims=("x", "y"),
     )
 
     actual_x = array.groupby("x").quantile(0, dim=...)
-    expected_x = xr.DataArray([1, 4], [("x", [1, 2])])
+    expected_x = xr.DataArray(
+        data=[1, 4], coords={"x": [1, 2], "quantile": 0}, dims="x"
+    )
     assert_identical(expected_x, actual_x)
 
     actual_y = array.groupby("y").quantile(0, dim=...)
-    expected_y = xr.DataArray([1, 22], [("y", [0, 1])])
+    expected_y = xr.DataArray(
+        data=[1, 22], coords={"y": [0, 1], "quantile": 0}, dims="y"
+    )
     assert_identical(expected_y, actual_y)
 
     actual_xx = array.groupby("x").quantile(0)
     expected_xx = xr.DataArray(
-        [[1, 11, 22], [4, 15, 24]], [("x", [1, 2]), ("y", [0, 0, 1])]
+        data=[[1, 11, 22], [4, 15, 24]],
+        coords={"x": [1, 2], "y": [0, 0, 1], "quantile": 0},
+        dims=("x", "y"),
     )
     assert_identical(expected_xx, actual_xx)
 
     actual_yy = array.groupby("y").quantile(0)
     expected_yy = xr.DataArray(
-        [[1, 26], [2, 22], [3, 23], [4, 24], [5, 25]],
-        [("x", [1, 1, 1, 2, 2]), ("y", [0, 1])],
+        data=[[1, 26], [2, 22], [3, 23], [4, 24], [5, 25]],
+        coords={"x": [1, 1, 1, 2, 2], "y": [0, 1], "quantile": 0},
+        dims=("x", "y"),
     )
     assert_identical(expected_yy, actual_yy)
 
@@ -180,14 +196,14 @@ def test_da_groupby_quantile():
     x = [0, 1]
     foo = xr.DataArray(
         np.reshape(np.arange(365 * 2), (365, 2)),
-        coords=dict(time=times, x=x),
+        coords={"time": times, "x": x},
         dims=("time", "x"),
     )
     g = foo.groupby(foo.time.dt.month)
 
     actual = g.quantile(0, dim=...)
     expected = xr.DataArray(
-        [
+        data=[
             0.0,
             62.0,
             120.0,
@@ -201,12 +217,111 @@ def test_da_groupby_quantile():
             610.0,
             670.0,
         ],
-        [("month", np.arange(1, 13))],
+        coords={"month": np.arange(1, 13), "quantile": 0},
+        dims="month",
     )
     assert_identical(expected, actual)
 
     actual = g.quantile(0, dim="time")[:2]
-    expected = xr.DataArray([[0.0, 1], [62.0, 63]], [("month", [1, 2]), ("x", [0, 1])])
+    expected = xr.DataArray(
+        data=[[0.0, 1], [62.0, 63]],
+        coords={"month": [1, 2], "x": [0, 1], "quantile": 0},
+        dims=("month", "x"),
+    )
+    assert_identical(expected, actual)
+
+
+def test_ds_groupby_quantile():
+    ds = xr.Dataset(
+        data_vars={"a": ("x", [1, 2, 3, 4, 5, 6])}, coords={"x": [1, 1, 1, 2, 2, 2]}
+    )
+
+    # Scalar quantile
+    expected = xr.Dataset(
+        data_vars={"a": ("x", [2, 5])}, coords={"quantile": 0.5, "x": [1, 2]}
+    )
+    actual = ds.groupby("x").quantile(0.5)
+    assert_identical(expected, actual)
+
+    # Vector quantile
+    expected = xr.Dataset(
+        data_vars={"a": (("x", "quantile"), [[1, 3], [4, 6]])},
+        coords={"x": [1, 2], "quantile": [0, 1]},
+    )
+    actual = ds.groupby("x").quantile([0, 1])
+    assert_identical(expected, actual)
+
+    # Multiple dimensions
+    ds = xr.Dataset(
+        data_vars={
+            "a": (
+                ("x", "y"),
+                [[1, 11, 26], [2, 12, 22], [3, 13, 23], [4, 16, 24], [5, 15, 25]],
+            )
+        },
+        coords={"x": [1, 1, 1, 2, 2], "y": [0, 0, 1]},
+    )
+
+    actual_x = ds.groupby("x").quantile(0, dim=...)
+    expected_x = xr.Dataset({"a": ("x", [1, 4])}, coords={"x": [1, 2], "quantile": 0})
+    assert_identical(expected_x, actual_x)
+
+    actual_y = ds.groupby("y").quantile(0, dim=...)
+    expected_y = xr.Dataset({"a": ("y", [1, 22])}, coords={"y": [0, 1], "quantile": 0})
+    assert_identical(expected_y, actual_y)
+
+    actual_xx = ds.groupby("x").quantile(0)
+    expected_xx = xr.Dataset(
+        {"a": (("x", "y"), [[1, 11, 22], [4, 15, 24]])},
+        coords={"x": [1, 2], "y": [0, 0, 1], "quantile": 0},
+    )
+    assert_identical(expected_xx, actual_xx)
+
+    actual_yy = ds.groupby("y").quantile(0)
+    expected_yy = xr.Dataset(
+        {"a": (("x", "y"), [[1, 26], [2, 22], [3, 23], [4, 24], [5, 25]])},
+        coords={"x": [1, 1, 1, 2, 2], "y": [0, 1], "quantile": 0},
+    ).transpose()
+    assert_identical(expected_yy, actual_yy)
+
+    times = pd.date_range("2000-01-01", periods=365)
+    x = [0, 1]
+    foo = xr.Dataset(
+        {"a": (("time", "x"), np.reshape(np.arange(365 * 2), (365, 2)))},
+        coords=dict(time=times, x=x),
+    )
+    g = foo.groupby(foo.time.dt.month)
+
+    actual = g.quantile(0, dim=...)
+    expected = xr.Dataset(
+        {
+            "a": (
+                "month",
+                [
+                    0.0,
+                    62.0,
+                    120.0,
+                    182.0,
+                    242.0,
+                    304.0,
+                    364.0,
+                    426.0,
+                    488.0,
+                    548.0,
+                    610.0,
+                    670.0,
+                ],
+            )
+        },
+        coords={"month": np.arange(1, 13), "quantile": 0},
+    )
+    assert_identical(expected, actual)
+
+    actual = g.quantile(0, dim="time").isel(month=slice(None, 2))
+    expected = xr.Dataset(
+        data_vars={"a": (("month", "x"), [[0.0, 1], [62.0, 63]])},
+        coords={"month": [1, 2], "x": [0, 1], "quantile": 0},
+    )
     assert_identical(expected, actual)
From 56c16e4bf45a3771fd9acba76d802c0199c14519 Mon Sep 17 00:00:00 2001
From: Keisuke Fujii
Date: Sat, 16 Nov 2019 23:36:43 +0900
Subject: [PATCH 22/22] Added fill_value for unstack (#3541)

* Added fill_value for unstack

* remove sparse option and fix unintended changes

* a bug fix

* using assert_equal

* assert_equals -> assert_equal

---
 doc/whats-new.rst            |  3 +++
 xarray/core/dataarray.py     |  7 +++++--
 xarray/core/dataset.py       | 13 +++++++++----
 xarray/tests/test_dataset.py | 17 +++++++++++++++++
 4 files changed, 34 insertions(+), 6 deletions(-)

diff --git a/doc/whats-new.rst b/doc/whats-new.rst
index c835fbeff45..6bf495713fe 100644
--- a/doc/whats-new.rst
+++ b/doc/whats-new.rst
@@ -39,6 +39,9 @@ Breaking changes
 
 New Features
 ~~~~~~~~~~~~
+- Added the ``fill_value`` option to :py:meth:`~xarray.DataArray.unstack` and
+  :py:meth:`~xarray.Dataset.unstack` (:issue:`3518`).
+  By `Keisuke Fujii `_.
 - Added the ``max_gap`` kwarg to :py:meth:`~xarray.DataArray.interpolate_na` and
   :py:meth:`~xarray.Dataset.interpolate_na`. This controls the maximum size of the data
   gap that will be filled by interpolation. By `Deepak Cherian `_.
diff --git a/xarray/core/dataarray.py b/xarray/core/dataarray.py
index b27a61d530b..23342fc5e0d 100644
--- a/xarray/core/dataarray.py
+++ b/xarray/core/dataarray.py
@@ -1726,7 +1726,9 @@ def stack(
         return self._from_temp_dataset(ds)
 
     def unstack(
-        self, dim: Union[Hashable, Sequence[Hashable], None] = None
+        self,
+        dim: Union[Hashable, Sequence[Hashable], None] = None,
+        fill_value: Any = dtypes.NA,
     ) -> "DataArray":
         """
         Unstack existing dimensions corresponding to MultiIndexes into
@@ -1739,6 +1741,7 @@ def unstack(
         dim : hashable or sequence of hashable, optional
             Dimension(s) over which to unstack. By default unstacks all
             MultiIndexes.
+        fill_value: value to use for missing values. By default, np.nan
 
         Returns
         -------
@@ -1770,7 +1773,7 @@ def unstack(
         --------
         DataArray.stack
         """
-        ds = self._to_temp_dataset().unstack(dim)
+        ds = self._to_temp_dataset().unstack(dim, fill_value)
         return self._from_temp_dataset(ds)
 
     def to_unstacked_dataset(self, dim, level=0):
diff --git a/xarray/core/dataset.py b/xarray/core/dataset.py
index 3a83b477681..371e0d6bf26 100644
--- a/xarray/core/dataset.py
+++ b/xarray/core/dataset.py
@@ -3333,7 +3333,7 @@ def ensure_stackable(val):
 
         return data_array
 
-    def _unstack_once(self, dim: Hashable) -> "Dataset":
+    def _unstack_once(self, dim: Hashable, fill_value) -> "Dataset":
         index = self.get_index(dim)
         index = index.remove_unused_levels()
         full_idx = pd.MultiIndex.from_product(index.levels, names=index.names)
@@ -3342,7 +3342,7 @@ def _unstack_once(self, dim: Hashable) -> "Dataset":
         if index.equals(full_idx):
             obj = self
         else:
-            obj = self.reindex({dim: full_idx}, copy=False)
+            obj = self.reindex({dim: full_idx}, copy=False, fill_value=fill_value)
 
         new_dim_names = index.names
         new_dim_sizes = [lev.size for lev in index.levels]
@@ -3368,7 +3368,11 @@ def _unstack_once(self, dim: Hashable) -> "Dataset":
             variables, coord_names=coord_names, indexes=indexes
         )
 
-    def unstack(self, dim: Union[Hashable, Iterable[Hashable]] = None) -> "Dataset":
+    def unstack(
+        self,
+        dim: Union[Hashable, Iterable[Hashable]] = None,
+        fill_value: Any = dtypes.NA,
+    ) -> "Dataset":
         """
         Unstack existing dimensions corresponding to MultiIndexes into
         multiple new dimensions.
@@ -3380,6 +3384,7 @@ def unstack(self, dim: Union[Hashable, Iterable[Hashable]] = None) -> "Dataset":
         dim : Hashable or iterable of Hashable, optional
             Dimension(s) over which to unstack. By default unstacks all
             MultiIndexes.
+        fill_value: value to use for missing values. By default, np.nan
 
         Returns
         -------
@@ -3417,7 +3422,7 @@ def unstack(self, dim: Union[Hashable, Iterable[Hashable]] = None) -> "Dataset":
 
         result = self.copy(deep=False)
         for dim in dims:
-            result = result._unstack_once(dim)
+            result = result._unstack_once(dim, fill_value)
         return result
 
     def update(self, other: "CoercibleMapping", inplace: bool = None) -> "Dataset":
diff --git a/xarray/tests/test_dataset.py b/xarray/tests/test_dataset.py
index 780843f2e61..be40ce7c6e8 100644
--- a/xarray/tests/test_dataset.py
+++ b/xarray/tests/test_dataset.py
@@ -2794,6 +2794,23 @@ def test_unstack_errors(self):
         with raises_regex(ValueError, "do not have a MultiIndex"):
             ds.unstack("x")
 
+    def test_unstack_fill_value(self):
+        ds = xr.Dataset(
+            {"var": (("x",), np.arange(6))},
+            coords={"x": [0, 1, 2] * 2, "y": (("x",), ["a"] * 3 + ["b"] * 3)},
+        )
+        # make ds incomplete
+        ds = ds.isel(x=[0, 2, 3, 4]).set_index(index=["x", "y"])
+        # test fill_value
+        actual = ds.unstack("index", fill_value=-1)
+        expected = ds.unstack("index").fillna(-1).astype(np.int)
+        assert actual["var"].dtype == np.int
+        assert_equal(actual, expected)
+
+        actual = ds["var"].unstack("index", fill_value=-1)
+        expected = ds["var"].unstack("index").fillna(-1).astype(np.int)
+        assert actual.equals(expected)
+
     def test_stack_unstack_fast(self):
         ds = Dataset(
             {
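Finally, a usage sketch of the new ``fill_value`` argument, mirroring ``test_unstack_fill_value`` above: holes created by unstacking an incomplete MultiIndex receive ``fill_value`` instead of NaN, which also lets integer data keep its dtype.

    import numpy as np
    import xarray as xr

    ds = xr.Dataset(
        {"var": ("x", np.arange(6))},
        coords={"x": [0, 1, 2] * 2, "y": ("x", ["a"] * 3 + ["b"] * 3)},
    )
    # drop two points so the product index has holes, then unstack
    ds = ds.isel(x=[0, 2, 3, 4]).set_index(index=["x", "y"])
    unstacked = ds.unstack("index", fill_value=-1)
    # "var" stays integer; missing (x, y) combinations become -1 rather than NaN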