From af28c6b02fac08494f5d9ae2718d68a084d93949 Mon Sep 17 00:00:00 2001
From: Deepak Cherian 
Date: Tue, 5 Nov 2019 15:41:13 +0000
Subject: [PATCH 1/3] Optimize dask array equality checks. (#3453)

* Optimize dask array equality checks.

Dask arrays with the same graph have the same name. We can use this to quickly
compare dask-backed variables without computing.

Fixes #3068 and #3311

* better docstring

* review suggestions.

* add concat test

* update whats new

* Add identity check to lazy_array_equiv

* pep8

* bugfix.
---
 doc/whats-new.rst             |   3 +
 xarray/core/concat.py         |  56 ++++++++++++------
 xarray/core/duck_array_ops.py |  62 ++++++++++++-----
 xarray/core/merge.py          |  19 ++++--
 xarray/core/variable.py       |  14 +++-
 xarray/tests/test_dask.py     | 108 +++++++++++++++++++++++++++++++++-
 6 files changed, 217 insertions(+), 45 deletions(-)

diff --git a/doc/whats-new.rst b/doc/whats-new.rst
index c117382f23f..dcaab011e67 100644
--- a/doc/whats-new.rst
+++ b/doc/whats-new.rst
@@ -70,6 +70,9 @@ Bug fixes
   but cloudpickle isn't (:issue:`3401`) by `Rhys Doyle `_
 - Fix grouping over variables with NaNs. (:issue:`2383`, :pull:`3406`).
   By `Deepak Cherian `_.
+- Use dask names to compare dask objects prior to comparing values after computation.
+  (:issue:`3068`, :issue:`3311`, :issue:`3454`, :pull:`3453`).
+  By `Deepak Cherian `_.
 - Sync with cftime by removing `dayofwk=-1` for cftime>=1.0.4.
   By `Anderson Banihirwe `_.
 - Fix :py:meth:`xarray.core.groupby.DataArrayGroupBy.reduce` and
diff --git a/xarray/core/concat.py b/xarray/core/concat.py
index 0d19990bdd0..c26153eb0d8 100644
--- a/xarray/core/concat.py
+++ b/xarray/core/concat.py
@@ -2,6 +2,7 @@
 
 from . import dtypes, utils
 from .alignment import align
+from .duck_array_ops import lazy_array_equiv
 from .merge import _VALID_COMPAT, unique_variable
 from .variable import IndexVariable, Variable, as_variable
 from .variable import concat as concat_vars
@@ -189,26 +190,43 @@ def process_subset_opt(opt, subset):
             # all nonindexes that are not the same in each dataset
             for k in getattr(datasets[0], subset):
                 if k not in concat_over:
-                    # Compare the variable of all datasets vs. the one
-                    # of the first dataset. Perform the minimum amount of
-                    # loads in order to avoid multiple loads from disk
-                    # while keeping the RAM footprint low.
-                    v_lhs = datasets[0].variables[k].load()
-                    # We'll need to know later on if variables are equal.
-                    computed = []
-                    for ds_rhs in datasets[1:]:
-                        v_rhs = ds_rhs.variables[k].compute()
-                        computed.append(v_rhs)
-                        if not getattr(v_lhs, compat)(v_rhs):
-                            concat_over.add(k)
-                            equals[k] = False
-                            # computed variables are not to be re-computed
-                            # again in the future
-                            for ds, v in zip(datasets[1:], computed):
-                                ds.variables[k].data = v.data
+                    equals[k] = None
+                    variables = [ds.variables[k] for ds in datasets]
+                    # first check without comparing values i.e. no computes
+                    for var in variables[1:]:
+                        equals[k] = getattr(variables[0], compat)(
+                            var, equiv=lazy_array_equiv
+                        )
+                        if equals[k] is not True:
+                            # exit early if we know these are not equal or that
+                            # equality cannot be determined i.e. one or all of
+                            # the variables wraps a numpy array
                             break
-                    else:
-                        equals[k] = True
+
+                    if equals[k] is False:
+                        concat_over.add(k)
+
+                    elif equals[k] is None:
+                        # Compare the variable of all datasets vs. the one
+                        # of the first dataset. Perform the minimum amount of
+                        # loads in order to avoid multiple loads from disk
+                        # while keeping the RAM footprint low.
+                        v_lhs = datasets[0].variables[k].load()
+                        # We'll need to know later on if variables are equal.
+                        computed = []
+                        for ds_rhs in datasets[1:]:
+                            v_rhs = ds_rhs.variables[k].compute()
+                            computed.append(v_rhs)
+                            if not getattr(v_lhs, compat)(v_rhs):
+                                concat_over.add(k)
+                                equals[k] = False
+                                # computed variables are not to be re-computed
+                                # again in the future
+                                for ds, v in zip(datasets[1:], computed):
+                                    ds.variables[k].data = v.data
+                                break
+                        else:
+                            equals[k] = True
 
         elif opt == "all":
             concat_over.update(
diff --git a/xarray/core/duck_array_ops.py b/xarray/core/duck_array_ops.py
index d943788c434..71e79335c3d 100644
--- a/xarray/core/duck_array_ops.py
+++ b/xarray/core/duck_array_ops.py
@@ -174,14 +174,42 @@ def as_shared_dtype(scalars_or_arrays):
     return [x.astype(out_type, copy=False) for x in arrays]
 
 
-def allclose_or_equiv(arr1, arr2, rtol=1e-5, atol=1e-8):
-    """Like np.allclose, but also allows values to be NaN in both arrays
+def lazy_array_equiv(arr1, arr2):
+    """Like array_equal, but doesn't actually compare values.
+    Returns True when arr1, arr2 are identical or their dask names are equal.
+    Returns False when shapes are not equal.
+    Returns None when equality cannot be determined: one or both of arr1, arr2
+    are numpy arrays; or their dask names are not equal
     """
+    if arr1 is arr2:
+        return True
     arr1 = asarray(arr1)
     arr2 = asarray(arr2)
     if arr1.shape != arr2.shape:
         return False
-    return bool(isclose(arr1, arr2, rtol=rtol, atol=atol, equal_nan=True).all())
+    if (
+        dask_array
+        and isinstance(arr1, dask_array.Array)
+        and isinstance(arr2, dask_array.Array)
+    ):
+        # GH3068
+        if arr1.name == arr2.name:
+            return True
+        else:
+            return None
+    return None
+
+
+def allclose_or_equiv(arr1, arr2, rtol=1e-5, atol=1e-8):
+    """Like np.allclose, but also allows values to be NaN in both arrays
+    """
+    arr1 = asarray(arr1)
+    arr2 = asarray(arr2)
+    lazy_equiv = lazy_array_equiv(arr1, arr2)
+    if lazy_equiv is None:
+        return bool(isclose(arr1, arr2, rtol=rtol, atol=atol, equal_nan=True).all())
+    else:
+        return lazy_equiv
 
 
 def array_equiv(arr1, arr2):
@@ -189,12 +217,14 @@ def array_equiv(arr1, arr2):
     """
     arr1 = asarray(arr1)
     arr2 = asarray(arr2)
-    if arr1.shape != arr2.shape:
-        return False
-    with warnings.catch_warnings():
-        warnings.filterwarnings("ignore", "In the future, 'NAT == x'")
-        flag_array = (arr1 == arr2) | (isnull(arr1) & isnull(arr2))
-        return bool(flag_array.all())
+    lazy_equiv = lazy_array_equiv(arr1, arr2)
+    if lazy_equiv is None:
+        with warnings.catch_warnings():
+            warnings.filterwarnings("ignore", "In the future, 'NAT == x'")
+            flag_array = (arr1 == arr2) | (isnull(arr1) & isnull(arr2))
+            return bool(flag_array.all())
+    else:
+        return lazy_equiv
 
 
 def array_notnull_equiv(arr1, arr2):
@@ -203,12 +233,14 @@ def array_notnull_equiv(arr1, arr2):
     """
     arr1 = asarray(arr1)
     arr2 = asarray(arr2)
-    if arr1.shape != arr2.shape:
-        return False
-    with warnings.catch_warnings():
-        warnings.filterwarnings("ignore", "In the future, 'NAT == x'")
-        flag_array = (arr1 == arr2) | isnull(arr1) | isnull(arr2)
-        return bool(flag_array.all())
+    lazy_equiv = lazy_array_equiv(arr1, arr2)
+    if lazy_equiv is None:
+        with warnings.catch_warnings():
+            warnings.filterwarnings("ignore", "In the future, 'NAT == x'")
+            flag_array = (arr1 == arr2) | isnull(arr1) | isnull(arr2)
+            return bool(flag_array.all())
+    else:
+        return lazy_equiv
 
 
 def count(data, axis=None):
diff --git a/xarray/core/merge.py b/xarray/core/merge.py
index 389ceb155f7..daf0c3b059f 100644
--- a/xarray/core/merge.py
+++ b/xarray/core/merge.py
@@ -19,6 +19,7 @@
 
 from . import dtypes, pdcompat
 from .alignment import deep_align
+from .duck_array_ops import lazy_array_equiv
 from .utils import Frozen, dict_equiv
 from .variable import Variable, as_variable, assert_unique_multiindex_level_names
 
@@ -123,16 +124,24 @@ def unique_variable(
             combine_method = "fillna"
 
     if equals is None:
-        out = out.compute()
+        # first check without comparing values i.e. no computes
         for var in variables[1:]:
-            equals = getattr(out, compat)(var)
-            if not equals:
+            equals = getattr(out, compat)(var, equiv=lazy_array_equiv)
+            if equals is not True:
                 break
 
+        if equals is None:
+            # now compare values with minimum number of computes
+            out = out.compute()
+            for var in variables[1:]:
+                equals = getattr(out, compat)(var)
+                if not equals:
+                    break
+
     if not equals:
         raise MergeError(
-            "conflicting values for variable {!r} on objects to be combined. "
-            "You can skip this check by specifying compat='override'.".format(name)
+            f"conflicting values for variable {name!r} on objects to be combined. "
+            "You can skip this check by specifying compat='override'."
         )
 
     if combine_method:
diff --git a/xarray/core/variable.py b/xarray/core/variable.py
index 117ab85ae65..916df75b3e0 100644
--- a/xarray/core/variable.py
+++ b/xarray/core/variable.py
@@ -1236,7 +1236,9 @@ def transpose(self, *dims) -> "Variable":
             dims = self.dims[::-1]
         dims = tuple(infix_dims(dims, self.dims))
         axes = self.get_axis_num(dims)
-        if len(dims) < 2:  # no need to transpose if only one dimension
+        if len(dims) < 2 or dims == self.dims:
+            # no need to transpose if only one dimension
+            # or dims are in same order
             return self.copy(deep=False)
 
         data = as_indexable(self._data).transpose(axes)
@@ -1595,22 +1597,24 @@ def broadcast_equals(self, other, equiv=duck_array_ops.array_equiv):
             return False
         return self.equals(other, equiv=equiv)
 
-    def identical(self, other):
+    def identical(self, other, equiv=duck_array_ops.array_equiv):
        """Like equals, but also checks attributes.
        """
         try:
-            return utils.dict_equiv(self.attrs, other.attrs) and self.equals(other)
+            return utils.dict_equiv(self.attrs, other.attrs) and self.equals(
+                other, equiv=equiv
+            )
         except (TypeError, AttributeError):
             return False
 
-    def no_conflicts(self, other):
+    def no_conflicts(self, other, equiv=duck_array_ops.array_notnull_equiv):
         """True if the intersection of two Variable's non-null data is
         equal; otherwise false.
 
         Variables can thus still be equal if there are locations where either,
         or both, contain NaN values.
         """
-        return self.broadcast_equals(other, equiv=duck_array_ops.array_notnull_equiv)
+        return self.broadcast_equals(other, equiv=equiv)
 
     def quantile(self, q, dim=None, interpolation="linear", keep_attrs=None):
         """Compute the qth quantile of the data along the specified dimension.
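The core changes above all funnel through the same three-valued check: dask derives an array's name by deterministically hashing the graph that produces it, so two arrays with equal names must hold equal values. A condensed, self-contained sketch of that logic (not the exact implementation; the real lazy_array_equiv above also passes both inputs through asarray and guards on dask_array being importable):

    import dask.array as da
    import numpy as np

    def lazy_equiv_sketch(a, b):
        # Identical objects are trivially equal.
        if a is b:
            return True
        # Mismatched shapes can never compare equal.
        if a.shape != b.shape:
            return False
        # Arrays built from the same dask graph share a deterministic
        # name, so equal names imply equal values with zero computation.
        if isinstance(a, da.Array) and isinstance(b, da.Array):
            return True if a.name == b.name else None
        # At least one side is a plain numpy array: undecidable lazily.
        return None

    x = da.zeros((10, 10), chunks=2)
    y = da.zeros((10, 10), chunks=2)            # same graph -> same name
    assert lazy_equiv_sketch(x, y) is True      # decided without computing
    assert lazy_equiv_sketch(x, x + 0) is None  # equal values, different name
    assert lazy_equiv_sketch(x, np.zeros((10, 10))) is None
    assert lazy_equiv_sketch(x, da.zeros((5, 5), chunks=2)) is False

The None outcome is what lets the callers patched above in concat.py and merge.py fall back to a real compute only when the cheap check is inconclusive.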
diff --git a/xarray/tests/test_dask.py b/xarray/tests/test_dask.py index c4323d1d317..34115b29b23 100644 --- a/xarray/tests/test_dask.py +++ b/xarray/tests/test_dask.py @@ -24,6 +24,7 @@ raises_regex, requires_scipy_or_netCDF4, ) +from ..core.duck_array_ops import lazy_array_equiv from .test_backends import create_tmp_file dask = pytest.importorskip("dask") @@ -428,7 +429,53 @@ def test_concat_loads_variables(self): out.compute() assert kernel_call_count == 24 - # Finally, test that riginals are unaltered + # Finally, test that originals are unaltered + assert ds1["d"].data is d1 + assert ds1["c"].data is c1 + assert ds2["d"].data is d2 + assert ds2["c"].data is c2 + assert ds3["d"].data is d3 + assert ds3["c"].data is c3 + + # now check that concat() is correctly using dask name equality to skip loads + out = xr.concat( + [ds1, ds1, ds1], dim="n", data_vars="different", coords="different" + ) + assert kernel_call_count == 24 + # variables are not loaded in the output + assert isinstance(out["d"].data, dask.array.Array) + assert isinstance(out["c"].data, dask.array.Array) + + out = xr.concat( + [ds1, ds1, ds1], dim="n", data_vars=[], coords=[], compat="identical" + ) + assert kernel_call_count == 24 + # variables are not loaded in the output + assert isinstance(out["d"].data, dask.array.Array) + assert isinstance(out["c"].data, dask.array.Array) + + out = xr.concat( + [ds1, ds2.compute(), ds3], + dim="n", + data_vars="all", + coords="different", + compat="identical", + ) + # c1,c3 must be computed for comparison since c2 is numpy; + # d2 is computed too + assert kernel_call_count == 28 + + out = xr.concat( + [ds1, ds2.compute(), ds3], + dim="n", + data_vars="all", + coords="all", + compat="identical", + ) + # no extra computes + assert kernel_call_count == 30 + + # Finally, test that originals are unaltered assert ds1["d"].data is d1 assert ds1["c"].data is c1 assert ds2["d"].data is d2 @@ -1142,6 +1189,19 @@ def test_make_meta(map_ds): assert meta.data_vars[variable].shape == (0,) * meta.data_vars[variable].ndim +def test_identical_coords_no_computes(): + lons2 = xr.DataArray(da.zeros((10, 10), chunks=2), dims=("y", "x")) + a = xr.DataArray( + da.zeros((10, 10), chunks=2), dims=("y", "x"), coords={"lons": lons2} + ) + b = xr.DataArray( + da.zeros((10, 10), chunks=2), dims=("y", "x"), coords={"lons": lons2} + ) + with raise_if_dask_computes(): + c = a + b + assert_identical(c, a) + + @pytest.mark.parametrize( "obj", [make_da(), make_da().compute(), make_ds(), make_ds().compute()] ) @@ -1229,3 +1289,49 @@ def test_normalize_token_with_backend(map_ds): map_ds.to_netcdf(tmp_file) read = xr.open_dataset(tmp_file) assert not dask.base.tokenize(map_ds) == dask.base.tokenize(read) + + +@pytest.mark.parametrize( + "compat", ["broadcast_equals", "equals", "identical", "no_conflicts"] +) +def test_lazy_array_equiv_variables(compat): + var1 = xr.Variable(("y", "x"), da.zeros((10, 10), chunks=2)) + var2 = xr.Variable(("y", "x"), da.zeros((10, 10), chunks=2)) + var3 = xr.Variable(("y", "x"), da.zeros((20, 10), chunks=2)) + + with raise_if_dask_computes(): + assert getattr(var1, compat)(var2, equiv=lazy_array_equiv) + # values are actually equal, but we don't know that till we compute, return None + with raise_if_dask_computes(): + assert getattr(var1, compat)(var2 / 2, equiv=lazy_array_equiv) is None + + # shapes are not equal, return False without computes + with raise_if_dask_computes(): + assert getattr(var1, compat)(var3, equiv=lazy_array_equiv) is False + + # if one or both arrays are numpy, 
return None + assert getattr(var1, compat)(var2.compute(), equiv=lazy_array_equiv) is None + assert ( + getattr(var1.compute(), compat)(var2.compute(), equiv=lazy_array_equiv) is None + ) + + with raise_if_dask_computes(): + assert getattr(var1, compat)(var2.transpose("y", "x")) + + +@pytest.mark.parametrize( + "compat", ["broadcast_equals", "equals", "identical", "no_conflicts"] +) +def test_lazy_array_equiv_merge(compat): + da1 = xr.DataArray(da.zeros((10, 10), chunks=2), dims=("y", "x")) + da2 = xr.DataArray(da.zeros((10, 10), chunks=2), dims=("y", "x")) + da3 = xr.DataArray(da.ones((20, 10), chunks=2), dims=("y", "x")) + + with raise_if_dask_computes(): + xr.merge([da1, da2], compat=compat) + # shapes are not equal; no computes necessary + with raise_if_dask_computes(max_computes=0): + with pytest.raises(ValueError): + xr.merge([da1, da3], compat=compat) + with raise_if_dask_computes(max_computes=2): + xr.merge([da1, da2 / 2], compat=compat) From 4dce93f134e8296ea730104b46ce3372b90304ac Mon Sep 17 00:00:00 2001 From: barronh Date: Tue, 5 Nov 2019 10:42:34 -0500 Subject: [PATCH 2/3] uamiv test using only raw uamiv variables (#3485) * uamiv test using only raw uamiv variables Previous test relied on CF generated metadata, but this test is more robust. * uamiv test using only raw uamiv variables Previous test relied on CF generated metadata, but this test is more robust. * uamiv test using only raw uamiv variables Previous test relied on CF generated metadata, but this test is more robust. * uamiv test using only raw uamiv variables --- xarray/tests/test_backends.py | 32 +++++++++++++------------------- 1 file changed, 13 insertions(+), 19 deletions(-) diff --git a/xarray/tests/test_backends.py b/xarray/tests/test_backends.py index 4bdebe73050..9b000b82b03 100644 --- a/xarray/tests/test_backends.py +++ b/xarray/tests/test_backends.py @@ -3400,20 +3400,17 @@ def test_uamiv_format_read(self): actual = camxfile.variables["O3"] assert_allclose(expected, actual) - data = np.array(["2002-06-03"], "datetime64[ns]") + data = np.array([[[2002154, 0]]], dtype="i") expected = xr.Variable( - ("TSTEP",), + ("TSTEP", "VAR", "DATE-TIME"), data, dict( - bounds="time_bounds", - long_name=( - "synthesized time coordinate " - + "from SDATE, STIME, STEP " - + "global attributes" - ), + long_name="TFLAG".ljust(16), + var_desc="TFLAG".ljust(80), + units="DATE-TIME".ljust(16), ), ) - actual = camxfile.variables["time"] + actual = camxfile.variables["TFLAG"] assert_allclose(expected, actual) camxfile.close() @@ -3439,18 +3436,15 @@ def test_uamiv_format_mfread(self): actual = camxfile.variables["O3"] assert_allclose(expected, actual) - data1 = np.array(["2002-06-03"], "datetime64[ns]") - data = np.concatenate([data1] * 2, axis=0) + data = np.array([[[2002154, 0]]], dtype="i").repeat(2, 0) attrs = dict( - bounds="time_bounds", - long_name=( - "synthesized time coordinate " - + "from SDATE, STIME, STEP " - + "global attributes" - ), + long_name="TFLAG".ljust(16), + var_desc="TFLAG".ljust(80), + units="DATE-TIME".ljust(16), ) - expected = xr.Variable(("TSTEP",), data, attrs) - actual = camxfile.variables["time"] + dims = ("TSTEP", "VAR", "DATE-TIME") + expected = xr.Variable(dims, data, attrs) + actual = camxfile.variables["TFLAG"] assert_allclose(expected, actual) camxfile.close() From 0e8debfe28286b5fe1f3d27e8dcc8466a62aca6d Mon Sep 17 00:00:00 2001 From: Maximilian Roos <5635139+max-sixty@users.noreply.github.com> Date: Thu, 7 Nov 2019 15:13:50 -0500 Subject: [PATCH 3/3] drop_vars; deprecate drop for variables 
(#3475) * Deprecate drop for vars, in favor of drop_vars * docs tweaks * handle scalars as vars * allow warning in old whatsnew * add drop_sel, adjust deprecations based on comments * whatsnew * docs * old-whatsnew * docstring * pendingdeprecationwarning * whatsnew * whatsnew * move units tests to drop_sel * is_scalar (but retain isinstance for mypy) --- doc/data-structures.rst | 4 +- doc/indexing.rst | 6 +- doc/whats-new.rst | 7 ++ xarray/core/concat.py | 2 +- xarray/core/dataarray.py | 78 ++++++++---- xarray/core/dataset.py | 180 +++++++++++++++------------- xarray/core/groupby.py | 2 +- xarray/core/merge.py | 2 +- xarray/core/resample.py | 6 +- xarray/tests/test_backends.py | 8 +- xarray/tests/test_dask.py | 10 +- xarray/tests/test_dataarray.py | 45 +++---- xarray/tests/test_dataset.py | 115 +++++++++++------- xarray/tests/test_duck_array_ops.py | 3 +- xarray/tests/test_interp.py | 2 +- xarray/tests/test_plot.py | 6 +- xarray/tests/test_units.py | 6 +- 17 files changed, 286 insertions(+), 196 deletions(-) diff --git a/doc/data-structures.rst b/doc/data-structures.rst index d5567f4863e..93cdc7e9765 100644 --- a/doc/data-structures.rst +++ b/doc/data-structures.rst @@ -393,14 +393,14 @@ methods (like pandas) for transforming datasets into new objects. For removing variables, you can select and drop an explicit list of variables by indexing with a list of names or using the -:py:meth:`~xarray.Dataset.drop` methods to return a new ``Dataset``. These +:py:meth:`~xarray.Dataset.drop_vars` methods to return a new ``Dataset``. These operations keep around coordinates: .. ipython:: python ds[['temperature']] ds[['temperature', 'temperature_double']] - ds.drop('temperature') + ds.drop_vars('temperature') To remove a dimension, you can use :py:meth:`~xarray.Dataset.drop_dims` method. Any variables using that dimension are dropped: diff --git a/doc/indexing.rst b/doc/indexing.rst index 9ee8f1dddf8..ace960689a8 100644 --- a/doc/indexing.rst +++ b/doc/indexing.rst @@ -232,14 +232,14 @@ Using indexing to *assign* values to a subset of dataset (e.g., Dropping labels and dimensions ------------------------------ -The :py:meth:`~xarray.Dataset.drop` method returns a new object with the listed +The :py:meth:`~xarray.Dataset.drop_sel` method returns a new object with the listed index labels along a dimension dropped: .. ipython:: python - ds.drop(space=['IN', 'IL']) + ds.drop_sel(space=['IN', 'IL']) -``drop`` is both a ``Dataset`` and ``DataArray`` method. +``drop_sel`` is both a ``Dataset`` and ``DataArray`` method. Use :py:meth:`~xarray.Dataset.drop_dims` to drop a full dimension from a Dataset. Any variables with these dimensions are also dropped: diff --git a/doc/whats-new.rst b/doc/whats-new.rst index dcaab011e67..0906058469d 100644 --- a/doc/whats-new.rst +++ b/doc/whats-new.rst @@ -38,6 +38,12 @@ Breaking changes New Features ~~~~~~~~~~~~ +- :py:meth:`Dataset.drop_sel` & :py:meth:`DataArray.drop_sel` have been added for dropping labels. + :py:meth:`Dataset.drop_vars` & :py:meth:`DataArray.drop_vars` have been added for + dropping variables (including coordinates). The existing ``drop`` methods remain as a backward compatible + option for dropping either lables or variables, but using the more specific methods is encouraged. + (:pull:`3475`) + By `Maximilian Roos `_ - :py:meth:`Dataset.transpose` and :py:meth:`DataArray.transpose` now support an ellipsis (`...`) to represent all 'other' dimensions. For example, to move one dimension to the front, use `.transpose('x', ...)`. 
(:pull:`3421`) @@ -3752,6 +3758,7 @@ Enhancements explicitly listed variables or index labels: .. ipython:: python + :okwarning: # drop variables ds = xray.Dataset({'x': 0, 'y': 1}) diff --git a/xarray/core/concat.py b/xarray/core/concat.py index c26153eb0d8..5b4fc078236 100644 --- a/xarray/core/concat.py +++ b/xarray/core/concat.py @@ -388,7 +388,7 @@ def ensure_common_dims(vars): result = result.set_coords(coord_names) result.encoding = result_encoding - result = result.drop(unlabeled_dims, errors="ignore") + result = result.drop_vars(unlabeled_dims, errors="ignore") if coord is not None: # add concat dimension last to ensure that its in the final Dataset diff --git a/xarray/core/dataarray.py b/xarray/core/dataarray.py index 35ee90fb5c8..d2d37871ee9 100644 --- a/xarray/core/dataarray.py +++ b/xarray/core/dataarray.py @@ -16,7 +16,6 @@ TypeVar, Union, cast, - overload, ) import numpy as np @@ -53,7 +52,7 @@ from .formatting import format_item from .indexes import Indexes, default_indexes from .options import OPTIONS -from .utils import Default, ReprObject, _default, _check_inplace, either_dict_or_kwargs +from .utils import Default, ReprObject, _check_inplace, _default, either_dict_or_kwargs from .variable import ( IndexVariable, Variable, @@ -249,7 +248,7 @@ class DataArray(AbstractArray, DataWithCoords): Dictionary for holding arbitrary metadata. """ - _accessors: Optional[Dict[str, Any]] + _accessors: Optional[Dict[str, Any]] # noqa _coords: Dict[Any, Variable] _indexes: Optional[Dict[Hashable, pd.Index]] _name: Optional[Hashable] @@ -1890,41 +1889,72 @@ def transpose(self, *dims: Hashable, transpose_coords: bool = None) -> "DataArra def T(self) -> "DataArray": return self.transpose() - # Drop coords - @overload - def drop( - self, labels: Union[Hashable, Iterable[Hashable]], *, errors: str = "raise" + def drop_vars( + self, names: Union[Hashable, Iterable[Hashable]], *, errors: str = "raise" ) -> "DataArray": - ... + """Drop variables from this DataArray. + + Parameters + ---------- + names : hashable or iterable of hashables + Name(s) of variables to drop. + errors: {'raise', 'ignore'}, optional + If 'raise' (default), raises a ValueError error if any of the variable + passed are not in the dataset. If 'ignore', any given names that are in the + DataArray are dropped and no error is raised. + + Returns + ------- + dropped : Dataset + + """ + ds = self._to_temp_dataset().drop_vars(names, errors=errors) + return self._from_temp_dataset(ds) - # Drop index labels along dimension - @overload # noqa: F811 def drop( - self, labels: Any, dim: Hashable, *, errors: str = "raise" # array-like + self, + labels: Mapping = None, + dim: Hashable = None, + *, + errors: str = "raise", + **labels_kwargs, ) -> "DataArray": - ... + """Backward compatible method based on `drop_vars` and `drop_sel` - def drop(self, labels, dim=None, *, errors="raise"): # noqa: F811 - """Drop coordinates or index labels from this DataArray. + Using either `drop_vars` or `drop_sel` is encouraged + """ + ds = self._to_temp_dataset().drop(labels, dim, errors=errors) + return self._from_temp_dataset(ds) + + def drop_sel( + self, + labels: Mapping[Hashable, Any] = None, + *, + errors: str = "raise", + **labels_kwargs, + ) -> "DataArray": + """Drop index labels from this DataArray. Parameters ---------- - labels : hashable or sequence of hashables - Name(s) of coordinates or index labels to drop. - If dim is not None, labels can be any array-like. - dim : hashable, optional - Dimension along which to drop index labels. 
By default (if - ``dim is None``), drops coordinates rather than index labels. + labels : Mapping[Hashable, Any] + Index labels to drop errors: {'raise', 'ignore'}, optional If 'raise' (default), raises a ValueError error if - any of the coordinates or index labels passed are not - in the array. If 'ignore', any given labels that are in the - array are dropped and no error is raised. + any of the index labels passed are not + in the dataset. If 'ignore', any given labels that are in the + dataset are dropped and no error is raised. + **labels_kwargs : {dim: label, ...}, optional + The keyword arguments form of ``dim`` and ``labels`` + Returns ------- dropped : DataArray """ - ds = self._to_temp_dataset().drop(labels, dim, errors=errors) + if labels_kwargs or isinstance(labels, dict): + labels = either_dict_or_kwargs(labels, labels_kwargs, "drop") + + ds = self._to_temp_dataset().drop_sel(labels, errors=errors) return self._from_temp_dataset(ds) def dropna( diff --git a/xarray/core/dataset.py b/xarray/core/dataset.py index 978242e5f6b..2cadc90334c 100644 --- a/xarray/core/dataset.py +++ b/xarray/core/dataset.py @@ -25,7 +25,6 @@ TypeVar, Union, cast, - overload, ) import numpy as np @@ -80,6 +79,7 @@ hashable, is_dict_like, is_list_like, + is_scalar, maybe_wrap_array, ) from .variable import IndexVariable, Variable, as_variable, broadcast_variables @@ -3519,39 +3519,98 @@ def _assert_all_in_dataset( "cannot be found in this dataset" ) - # Drop variables - @overload # noqa: F811 - def drop( - self, labels: Union[Hashable, Iterable[Hashable]], *, errors: str = "raise" + def drop_vars( + self, names: Union[Hashable, Iterable[Hashable]], *, errors: str = "raise" ) -> "Dataset": - ... + """Drop variables from this dataset. - # Drop index labels along dimension - @overload # noqa: F811 - def drop( - self, labels: Any, dim: Hashable, *, errors: str = "raise" # array-like - ) -> "Dataset": - ... + Parameters + ---------- + names : hashable or iterable of hashables + Name(s) of variables to drop. + errors: {'raise', 'ignore'}, optional + If 'raise' (default), raises a ValueError error if any of the variable + passed are not in the dataset. If 'ignore', any given names that are in the + dataset are dropped and no error is raised. - def drop( # noqa: F811 - self, labels=None, dim=None, *, errors="raise", **labels_kwargs - ): - """Drop variables or index labels from this dataset. 
+        Returns
+        -------
+        dropped : Dataset
+
+        """
+        # the Iterable check is required for mypy
+        if is_scalar(names) or not isinstance(names, Iterable):
+            names = {names}
+        else:
+            names = set(names)
+        if errors == "raise":
+            self._assert_all_in_dataset(names)
+
+        variables = {k: v for k, v in self._variables.items() if k not in names}
+        coord_names = {k for k in self._coord_names if k in variables}
+        indexes = {k: v for k, v in self.indexes.items() if k not in names}
+        return self._replace_with_new_dims(
+            variables, coord_names=coord_names, indexes=indexes
+        )
+
+    def drop(self, labels=None, dim=None, *, errors="raise", **labels_kwargs):
+        """Backward compatible method based on `drop_vars` and `drop_sel`
+
+        Using either `drop_vars` or `drop_sel` is encouraged
+        """
+        if errors not in ["raise", "ignore"]:
+            raise ValueError('errors must be either "raise" or "ignore"')
+
+        if is_dict_like(labels) and not isinstance(labels, dict):
+            warnings.warn(
+                "dropping coordinates using `drop` is deprecated; use drop_vars.",
+                FutureWarning,
+                stacklevel=2,
+            )
+            return self.drop_vars(labels, errors=errors)
+
+        if labels_kwargs or isinstance(labels, dict):
+            if dim is not None:
+                raise ValueError("cannot specify dim and dict-like arguments.")
+            labels = either_dict_or_kwargs(labels, labels_kwargs, "drop")
+
+        if dim is None and (is_list_like(labels) or is_scalar(labels)):
+            warnings.warn(
+                "dropping variables using `drop` will be deprecated; using drop_vars is encouraged.",
+                PendingDeprecationWarning,
+                stacklevel=2,
+            )
+            return self.drop_vars(labels, errors=errors)
+        if dim is not None:
+            warnings.warn(
+                "dropping labels using list-like labels is deprecated; using "
+                "dict-like arguments with `drop_sel`, e.g. `ds.drop_sel(dim=[labels])`.",
+                DeprecationWarning,
+                stacklevel=2,
+            )
+            return self.drop_sel({dim: labels}, errors=errors, **labels_kwargs)
+
+        warnings.warn(
+            "dropping labels using `drop` will be deprecated; using drop_sel is encouraged.",
+            PendingDeprecationWarning,
+            stacklevel=2,
+        )
+        return self.drop_sel(labels, errors=errors)
+
+    def drop_sel(self, labels=None, *, errors="raise", **labels_kwargs):
+        """Drop index labels from this dataset.
 
         Parameters
         ----------
-        labels : hashable or iterable of hashables
-            Name(s) of variables or index labels to drop.
-        dim : None or hashable, optional
-            Dimension along which to drop index labels. By default (if
-            ``dim is None``), drops variables rather than index labels.
+        labels : Mapping[Hashable, Any]
+            Index labels to drop
         errors: {'raise', 'ignore'}, optional
             If 'raise' (default), raises a ValueError error if
-            any of the variable or index labels passed are not
+            any of the index labels passed are not
             in the dataset. If 'ignore', any given labels that are in the
             dataset are dropped and no error is raised.
         **labels_kwargs : {dim: label, ...}, optional
-            The keyword arguments form of ``dim`` and ``labels``.
+ The keyword arguments form of ``dim`` and ``labels` Returns ------- @@ -3562,7 +3621,7 @@ def drop( # noqa: F811 >>> data = np.random.randn(2, 3) >>> labels = ['a', 'b', 'c'] >>> ds = xr.Dataset({'A': (['x', 'y'], data), 'y': labels}) - >>> ds.drop(y=['a', 'c']) + >>> ds.drop_sel(y=['a', 'c']) Dimensions: (x: 2, y: 1) Coordinates: @@ -3570,7 +3629,7 @@ def drop( # noqa: F811 Dimensions without coordinates: x Data variables: A (x, y) float64 -0.3454 0.1734 - >>> ds.drop(y='b') + >>> ds.drop_sel(y='b') Dimensions: (x: 2, y: 2) Coordinates: @@ -3582,61 +3641,22 @@ def drop( # noqa: F811 if errors not in ["raise", "ignore"]: raise ValueError('errors must be either "raise" or "ignore"') - if is_dict_like(labels) and not isinstance(labels, dict): - warnings.warn( - "dropping coordinates using key values of dict-like labels is " - "deprecated; use drop_vars or a list of coordinates.", - FutureWarning, - stacklevel=2, - ) - if dim is not None and is_list_like(labels): - warnings.warn( - "dropping dimensions using list-like labels is deprecated; use " - "dict-like arguments.", - DeprecationWarning, - stacklevel=2, - ) + labels = either_dict_or_kwargs(labels, labels_kwargs, "drop") - if labels_kwargs or isinstance(labels, dict): - labels_kwargs = either_dict_or_kwargs(labels, labels_kwargs, "drop") - if dim is not None: - raise ValueError("cannot specify dim and dict-like arguments.") - ds = self - for dim, labels in labels_kwargs.items(): - ds = ds._drop_labels(labels, dim, errors=errors) - return ds - elif dim is None: - if isinstance(labels, str) or not isinstance(labels, Iterable): - labels = {labels} - else: - labels = set(labels) - return self._drop_vars(labels, errors=errors) - else: - return self._drop_labels(labels, dim, errors=errors) - - def _drop_labels(self, labels=None, dim=None, errors="raise"): - # Don't cast to set, as it would harm performance when labels - # is a large numpy array - if utils.is_scalar(labels): - labels = [labels] - labels = np.asarray(labels) - try: - index = self.indexes[dim] - except KeyError: - raise ValueError("dimension %r does not have coordinate labels" % dim) - new_index = index.drop(labels, errors=errors) - return self.loc[{dim: new_index}] - - def _drop_vars(self, names: set, errors: str = "raise") -> "Dataset": - if errors == "raise": - self._assert_all_in_dataset(names) - - variables = {k: v for k, v in self._variables.items() if k not in names} - coord_names = {k for k in self._coord_names if k in variables} - indexes = {k: v for k, v in self.indexes.items() if k not in names} - return self._replace_with_new_dims( - variables, coord_names=coord_names, indexes=indexes - ) + ds = self + for dim, labels_for_dim in labels.items(): + # Don't cast to set, as it would harm performance when labels + # is a large numpy array + if utils.is_scalar(labels_for_dim): + labels_for_dim = [labels_for_dim] + labels_for_dim = np.asarray(labels_for_dim) + try: + index = self.indexes[dim] + except KeyError: + raise ValueError("dimension %r does not have coordinate labels" % dim) + new_index = index.drop(labels_for_dim, errors=errors) + ds = ds.loc[{dim: new_index}] + return ds def drop_dims( self, drop_dims: Union[Hashable, Iterable[Hashable]], *, errors: str = "raise" @@ -3679,7 +3699,7 @@ def drop_dims( ) drop_vars = {k for k, v in self._variables.items() if set(v.dims) & drop_dims} - return self._drop_vars(drop_vars) + return self.drop_vars(drop_vars) def transpose(self, *dims: Hashable) -> "Dataset": """Return a new Dataset object with all array dimensions 
transposed. diff --git a/xarray/core/groupby.py b/xarray/core/groupby.py index 209ac14184b..c8906e34737 100644 --- a/xarray/core/groupby.py +++ b/xarray/core/groupby.py @@ -775,7 +775,7 @@ def quantile(self, q, dim=None, interpolation="linear", keep_attrs=None): ) if np.asarray(q, dtype=np.float64).ndim == 0: - out = out.drop("quantile") + out = out.drop_vars("quantile") return out def reduce( diff --git a/xarray/core/merge.py b/xarray/core/merge.py index daf0c3b059f..10c7804d718 100644 --- a/xarray/core/merge.py +++ b/xarray/core/merge.py @@ -859,6 +859,6 @@ def dataset_update_method( if c not in value.dims and c in dataset.coords ] if coord_names: - other[key] = value.drop(coord_names) + other[key] = value.drop_vars(coord_names) return merge_core([dataset, other], priority_arg=1, indexes=dataset.indexes) diff --git a/xarray/core/resample.py b/xarray/core/resample.py index 998964273be..2cb1bd55e19 100644 --- a/xarray/core/resample.py +++ b/xarray/core/resample.py @@ -47,7 +47,7 @@ def _upsample(self, method, *args, **kwargs): if k == self._dim: continue if self._dim in v.dims: - self._obj = self._obj.drop(k) + self._obj = self._obj.drop_vars(k) if method == "asfreq": return self.mean(self._dim) @@ -146,7 +146,7 @@ def _interpolate(self, kind="linear"): dummy = self._obj.copy() for k, v in self._obj.coords.items(): if k != self._dim and self._dim in v.dims: - dummy = dummy.drop(k) + dummy = dummy.drop_vars(k) return dummy.interp( assume_sorted=True, method=kind, @@ -218,7 +218,7 @@ def apply(self, func, shortcut=False, args=(), **kwargs): # dimension, then we need to do so before we can rename the proxy # dimension we used. if self._dim in combined.coords: - combined = combined.drop(self._dim) + combined = combined.drop_vars(self._dim) if self._resample_dim in combined.dims: combined = combined.rename({self._resample_dim: self._dim}) diff --git a/xarray/tests/test_backends.py b/xarray/tests/test_backends.py index 9b000b82b03..de3a7eadab0 100644 --- a/xarray/tests/test_backends.py +++ b/xarray/tests/test_backends.py @@ -800,7 +800,7 @@ def equals_latlon(obj): assert "coordinates" not in ds["lat"].attrs assert "coordinates" not in ds["lon"].attrs - modified = original.drop(["temp", "precip"]) + modified = original.drop_vars(["temp", "precip"]) with self.roundtrip(modified) as actual: assert_identical(actual, modified) with create_tmp_file() as tmp_file: @@ -2177,7 +2177,7 @@ def test_cross_engine_read_write_netcdf4(self): # Drop dim3, because its labels include strings. These appear to be # not properly read with python-netCDF4, which converts them into # unicode instead of leaving them as bytes. 
- data = create_test_data().drop("dim3") + data = create_test_data().drop_vars("dim3") data.attrs["foo"] = "bar" valid_engines = ["netcdf4", "h5netcdf"] for write_engine in valid_engines: @@ -2344,7 +2344,7 @@ def test_open_twice(self): def test_open_fileobj(self): # open in-memory datasets instead of local file paths - expected = create_test_data().drop("dim3") + expected = create_test_data().drop_vars("dim3") expected.attrs["foo"] = "bar" with create_tmp_file() as tmp_file: expected.to_netcdf(tmp_file, engine="h5netcdf") @@ -4190,7 +4190,7 @@ def test_open_dataarray_options(self): with create_tmp_file() as tmp: data.to_netcdf(tmp) - expected = data.drop("y") + expected = data.drop_vars("y") with open_dataarray(tmp, drop_variables=["y"]) as loaded: assert_identical(expected, loaded) diff --git a/xarray/tests/test_dask.py b/xarray/tests/test_dask.py index 34115b29b23..fa8ae9991d7 100644 --- a/xarray/tests/test_dask.py +++ b/xarray/tests/test_dask.py @@ -1129,11 +1129,11 @@ def test_map_blocks_to_array(map_ds): [ lambda x: x, lambda x: x.to_dataset(), - lambda x: x.drop("x"), + lambda x: x.drop_vars("x"), lambda x: x.expand_dims(k=[1, 2, 3]), lambda x: x.assign_coords(new_coord=("y", x.y * 2)), lambda x: x.astype(np.int32), - # TODO: [lambda x: x.isel(x=1).drop("x"), map_da], + # TODO: [lambda x: x.isel(x=1).drop_vars("x"), map_da], ], ) def test_map_blocks_da_transformations(func, map_da): @@ -1147,9 +1147,9 @@ def test_map_blocks_da_transformations(func, map_da): "func", [ lambda x: x, - lambda x: x.drop("cxy"), - lambda x: x.drop("a"), - lambda x: x.drop("x"), + lambda x: x.drop_vars("cxy"), + lambda x: x.drop_vars("a"), + lambda x: x.drop_vars("x"), lambda x: x.expand_dims(k=[1, 2, 3]), lambda x: x.rename({"a": "new1", "b": "new2"}), # TODO: [lambda x: x.isel(x=1)], diff --git a/xarray/tests/test_dataarray.py b/xarray/tests/test_dataarray.py index 2c823b0c20a..acfe684d220 100644 --- a/xarray/tests/test_dataarray.py +++ b/xarray/tests/test_dataarray.py @@ -906,7 +906,7 @@ def test_sel_dataarray(self): assert_array_equal(actual, da.isel(x=[0, 1, 2])) assert "new_dim" in actual.dims assert "new_dim" in actual.coords - assert_equal(actual["new_dim"].drop("x"), ind["new_dim"]) + assert_equal(actual["new_dim"].drop_vars("x"), ind["new_dim"]) def test_sel_invalid_slice(self): array = DataArray(np.arange(10), [("x", np.arange(10))]) @@ -1660,7 +1660,7 @@ def test_expand_dims_with_greater_dim_size(self): coords=expected_coords, dims=list(expected_coords.keys()), attrs={"key": "entry"}, - ).drop(["y", "dim_0"]) + ).drop_vars(["y", "dim_0"]) assert_identical(expected, actual) # Test with kwargs instead of passing dict to dim arg. 
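The test hunks above and below all apply the same mechanical migration away from the deprecated drop. A minimal sketch of the mapping on a toy dataset (the variable and label names here are illustrative only):

    import numpy as np
    import xarray as xr

    ds = xr.Dataset(
        {"A": (("x", "y"), np.random.randn(2, 3))},
        coords={"x": ["a", "b"], "y": [10, 20, 30]},
    )

    # dropping variables/coordinates: was ds.drop("A")
    assert "A" not in ds.drop_vars("A")

    # dropping index labels: was ds.drop(["a"], dim="x") or ds.drop(x=["a"])
    assert list(ds.drop_sel(x=["a"]).x.values) == ["b"]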
@@ -1677,7 +1677,7 @@ def test_expand_dims_with_greater_dim_size(self): }, dims=["dim_1", "x", "dim_0"], attrs={"key": "entry"}, - ).drop("dim_0") + ).drop_vars("dim_0") assert_identical(other_way_expected, other_way) def test_set_index(self): @@ -1993,7 +1993,7 @@ def test_stack_unstack(self): ) pd.util.testing.assert_index_equal(a, b) - actual = orig.stack(z=["x", "y"]).unstack("z").drop(["x", "y"]) + actual = orig.stack(z=["x", "y"]).unstack("z").drop_vars(["x", "y"]) assert_identical(orig, actual) dims = ["a", "b", "c", "d", "e"] @@ -2001,11 +2001,11 @@ def test_stack_unstack(self): stacked = orig.stack(ab=["a", "b"], cd=["c", "d"]) unstacked = stacked.unstack(["ab", "cd"]) - roundtripped = unstacked.drop(["a", "b", "c", "d"]).transpose(*dims) + roundtripped = unstacked.drop_vars(["a", "b", "c", "d"]).transpose(*dims) assert_identical(orig, roundtripped) unstacked = stacked.unstack() - roundtripped = unstacked.drop(["a", "b", "c", "d"]).transpose(*dims) + roundtripped = unstacked.drop_vars(["a", "b", "c", "d"]).transpose(*dims) assert_identical(orig, roundtripped) def test_stack_unstack_decreasing_coordinate(self): @@ -2109,40 +2109,43 @@ def test_drop_coordinates(self): expected = DataArray(np.random.randn(2, 3), dims=["x", "y"]) arr = expected.copy() arr.coords["z"] = 2 - actual = arr.drop("z") + actual = arr.drop_vars("z") assert_identical(expected, actual) with pytest.raises(ValueError): - arr.drop("not found") + arr.drop_vars("not found") - actual = expected.drop("not found", errors="ignore") + actual = expected.drop_vars("not found", errors="ignore") assert_identical(actual, expected) with raises_regex(ValueError, "cannot be found"): - arr.drop("w") + arr.drop_vars("w") - actual = expected.drop("w", errors="ignore") + actual = expected.drop_vars("w", errors="ignore") assert_identical(actual, expected) renamed = arr.rename("foo") with raises_regex(ValueError, "cannot be found"): - renamed.drop("foo") + renamed.drop_vars("foo") - actual = renamed.drop("foo", errors="ignore") + actual = renamed.drop_vars("foo", errors="ignore") assert_identical(actual, renamed) def test_drop_index_labels(self): arr = DataArray(np.random.randn(2, 3), coords={"y": [0, 1, 2]}, dims=["x", "y"]) - actual = arr.drop([0, 1], dim="y") + actual = arr.drop_sel(y=[0, 1]) expected = arr[:, 2:] assert_identical(actual, expected) with raises_regex((KeyError, ValueError), "not .* in axis"): - actual = arr.drop([0, 1, 3], dim="y") + actual = arr.drop_sel(y=[0, 1, 3]) - actual = arr.drop([0, 1, 3], dim="y", errors="ignore") + actual = arr.drop_sel(y=[0, 1, 3], errors="ignore") assert_identical(actual, expected) + with pytest.warns(DeprecationWarning): + arr.drop([0, 1, 3], dim="y", errors="ignore") + def test_dropna(self): x = np.random.randn(4, 4) x[::2, 0] = np.nan @@ -3360,7 +3363,7 @@ def test_to_pandas(self): da = DataArray(np.random.randn(*shape), dims=dims) with warnings.catch_warnings(): warnings.filterwarnings("ignore", r"\W*Panel is deprecated") - roundtripped = DataArray(da.to_pandas()).drop(dims) + roundtripped = DataArray(da.to_pandas()).drop_vars(dims) assert_identical(da, roundtripped) with raises_regex(ValueError, "cannot convert"): @@ -3411,11 +3414,13 @@ def test_to_and_from_series(self): assert_array_equal(expected.index.values, actual.index.values) assert "foo" == actual.name # test roundtrip - assert_identical(self.dv, DataArray.from_series(actual).drop(["x", "y"])) + assert_identical(self.dv, DataArray.from_series(actual).drop_vars(["x", "y"])) # test name is None actual.name = None expected_da 
= self.dv.rename(None) - assert_identical(expected_da, DataArray.from_series(actual).drop(["x", "y"])) + assert_identical( + expected_da, DataArray.from_series(actual).drop_vars(["x", "y"]) + ) @requires_sparse def test_from_series_sparse(self): @@ -3478,7 +3483,7 @@ def test_to_and_from_dict(self): # and the most bare bones representation still roundtrips d = {"name": "foo", "dims": ("x", "y"), "data": array.values} - assert_identical(array.drop("x"), DataArray.from_dict(d)) + assert_identical(array.drop_vars("x"), DataArray.from_dict(d)) # missing a dims in the coords d = { diff --git a/xarray/tests/test_dataset.py b/xarray/tests/test_dataset.py index b9fa20fab26..50e78c9f685 100644 --- a/xarray/tests/test_dataset.py +++ b/xarray/tests/test_dataset.py @@ -322,7 +322,7 @@ def __repr__(self): def test_info(self): ds = create_test_data(seed=123) - ds = ds.drop("dim3") # string type prints differently in PY2 vs PY3 + ds = ds.drop_vars("dim3") # string type prints differently in PY2 vs PY3 ds.attrs["unicode_attr"] = "ba®" ds.attrs["string_attr"] = "bar" @@ -509,7 +509,9 @@ def test_constructor_compat(self): {"c": (("x", "y"), np.zeros((2, 3))), "x": [0, 1]}, ) - actual = Dataset({"a": original["a"][:, 0], "b": original["a"][0].drop("x")}) + actual = Dataset( + {"a": original["a"][:, 0], "b": original["a"][0].drop_vars("x")} + ) assert_identical(expected, actual) data = {"x": DataArray(0, coords={"y": 3}), "y": ("z", [1, 1, 1])} @@ -775,9 +777,9 @@ def test_coords_set(self): one_coord.reset_coords("x") actual = all_coords.reset_coords("zzz", drop=True) - expected = all_coords.drop("zzz") + expected = all_coords.drop_vars("zzz") assert_identical(expected, actual) - expected = two_coords.drop("zzz") + expected = two_coords.drop_vars("zzz") assert_identical(expected, actual) def test_coords_to_dataset(self): @@ -954,7 +956,7 @@ def test_dask_is_lazy(self): ds.fillna(0) ds.rename({"dim1": "foobar"}) ds.set_coords("var1") - ds.drop("var1") + ds.drop_vars("var1") def test_isel(self): data = create_test_data() @@ -1097,7 +1099,7 @@ def test_isel_fancy(self): actual = data.isel(dim1=stations["dim1s"], dim2=stations["dim2s"]) assert "station" in actual.coords assert "station" in actual.dims - assert_identical(actual["station"].drop(["dim2"]), stations["station"]) + assert_identical(actual["station"].drop_vars(["dim2"]), stations["station"]) with raises_regex(ValueError, "conflicting values for "): data.isel( @@ -1123,7 +1125,7 @@ def test_isel_fancy(self): assert "dim2" in actual.coords assert "a" in actual["dim2"].dims - assert_identical(actual["a"].drop(["dim2"]), stations["a"]) + assert_identical(actual["a"].drop_vars(["dim2"]), stations["a"]) assert_identical(actual["b"], stations["b"]) expected_var1 = data["var1"].variable[ stations["dim1s"].variable, stations["dim2s"].variable @@ -1132,7 +1134,7 @@ def test_isel_fancy(self): stations["dim1s"].variable, stations["dim2s"].variable ] expected_var3 = data["var3"].variable[slice(None), stations["dim1s"].variable] - assert_equal(actual["a"].drop("dim2"), stations["a"]) + assert_equal(actual["a"].drop_vars("dim2"), stations["a"]) assert_array_equal(actual["var1"], expected_var1) assert_array_equal(actual["var2"], expected_var2) assert_array_equal(actual["var3"], expected_var3) @@ -1200,7 +1202,7 @@ def test_isel_dataarray(self): indexing_da = indexing_da < 3 actual = data.isel(dim2=indexing_da) assert_identical( - actual["dim2"].drop("non_dim").drop("non_dim2"), data["dim2"][:2] + actual["dim2"].drop_vars("non_dim").drop_vars("non_dim2"), 
data["dim2"][:2] ) assert_identical(actual["non_dim"], indexing_da["non_dim"][:2]) assert_identical(actual["non_dim2"], indexing_da["non_dim2"]) @@ -1286,8 +1288,10 @@ def test_sel_dataarray(self): expected = data.isel(dim2=[0, 1, 2]).rename({"dim2": "new_dim"}) assert "new_dim" in actual.dims assert "new_dim" in actual.coords - assert_equal(actual.drop("new_dim").drop("dim2"), expected.drop("new_dim")) - assert_equal(actual["new_dim"].drop("dim2"), ind["new_dim"]) + assert_equal( + actual.drop_vars("new_dim").drop_vars("dim2"), expected.drop_vars("new_dim") + ) + assert_equal(actual["new_dim"].drop_vars("dim2"), ind["new_dim"]) # with conflicted coordinate (silently ignored) ind = DataArray( @@ -1304,10 +1308,12 @@ def test_sel_dataarray(self): coords={"new_dim": ["a", "b", "c"], "dim2": 3}, ) actual = data.sel(dim2=ind) - assert_equal(actual["new_dim"].drop("dim2"), ind["new_dim"].drop("dim2")) + assert_equal( + actual["new_dim"].drop_vars("dim2"), ind["new_dim"].drop_vars("dim2") + ) expected = data.isel(dim2=[0, 1, 2]) expected["dim2"] = (("new_dim"), expected["dim2"].values) - assert_equal(actual["dim2"].drop("new_dim"), expected["dim2"]) + assert_equal(actual["dim2"].drop_vars("new_dim"), expected["dim2"]) assert actual["var1"].dims == ("dim1", "new_dim") # with non-dimensional coordinate @@ -1322,7 +1328,7 @@ def test_sel_dataarray(self): ) actual = data.sel(dim2=ind) expected = data.isel(dim2=[0, 1, 2]) - assert_equal(actual.drop("new_dim"), expected) + assert_equal(actual.drop_vars("new_dim"), expected) assert np.allclose(actual["new_dim"].values, ind["new_dim"].values) def test_sel_dataarray_mindex(self): @@ -1554,8 +1560,8 @@ def test_sel_fancy(self): expected_ary = data["foo"][[0, 1, 2], [0, 2, 1]] actual = data.sel(x=idx_x, y=idx_y) assert_array_equal(expected_ary, actual["foo"]) - assert_identical(actual["a"].drop("x"), idx_x["a"]) - assert_identical(actual["b"].drop("y"), idx_y["b"]) + assert_identical(actual["a"].drop_vars("x"), idx_x["a"]) + assert_identical(actual["b"].drop_vars("y"), idx_y["b"]) with pytest.raises(KeyError): data.sel(x=[2.5], y=[2.0], method="pad", tolerance=1e-3) @@ -2094,36 +2100,50 @@ def test_variable_indexing(self): def test_drop_variables(self): data = create_test_data() - assert_identical(data, data.drop([])) + assert_identical(data, data.drop_vars([])) expected = Dataset({k: data[k] for k in data.variables if k != "time"}) - actual = data.drop("time") + actual = data.drop_vars("time") assert_identical(expected, actual) - actual = data.drop(["time"]) + actual = data.drop_vars(["time"]) assert_identical(expected, actual) with raises_regex(ValueError, "cannot be found"): - data.drop("not_found_here") + data.drop_vars("not_found_here") + + actual = data.drop_vars("not_found_here", errors="ignore") + assert_identical(data, actual) + + actual = data.drop_vars(["not_found_here"], errors="ignore") + assert_identical(data, actual) + + actual = data.drop_vars(["time", "not_found_here"], errors="ignore") + assert_identical(expected, actual) + + # deprecated approach with `drop` works (straight copy paste from above) - actual = data.drop("not_found_here", errors="ignore") + with pytest.warns(PendingDeprecationWarning): + actual = data.drop("not_found_here", errors="ignore") assert_identical(data, actual) - actual = data.drop(["not_found_here"], errors="ignore") + with pytest.warns(PendingDeprecationWarning): + actual = data.drop(["not_found_here"], errors="ignore") assert_identical(data, actual) - actual = data.drop(["time", "not_found_here"], 
errors="ignore") + with pytest.warns(PendingDeprecationWarning): + actual = data.drop(["time", "not_found_here"], errors="ignore") assert_identical(expected, actual) def test_drop_index_labels(self): data = Dataset({"A": (["x", "y"], np.random.randn(2, 3)), "x": ["a", "b"]}) with pytest.warns(DeprecationWarning): - actual = data.drop(["a"], "x") + actual = data.drop(["a"], dim="x") expected = data.isel(x=[1]) assert_identical(expected, actual) with pytest.warns(DeprecationWarning): - actual = data.drop(["a", "b"], "x") + actual = data.drop(["a", "b"], dim="x") expected = data.isel(x=slice(0, 0)) assert_identical(expected, actual) @@ -2147,30 +2167,30 @@ def test_drop_index_labels(self): # DataArrays as labels are a nasty corner case as they are not # Iterable[Hashable] - DataArray.__iter__ yields scalar DataArrays. - actual = data.drop(DataArray(["a", "b", "c"]), "x", errors="ignore") + actual = data.drop_sel(x=DataArray(["a", "b", "c"]), errors="ignore") expected = data.isel(x=slice(0, 0)) assert_identical(expected, actual) + with pytest.warns(DeprecationWarning): + data.drop(DataArray(["a", "b", "c"]), dim="x", errors="ignore") + assert_identical(expected, actual) with raises_regex(ValueError, "does not have coordinate labels"): - data.drop(1, "y") + data.drop_sel(y=1) def test_drop_labels_by_keyword(self): - # Tests for #2910: Support for a additional `drop()` API. data = Dataset( {"A": (["x", "y"], np.random.randn(2, 6)), "x": ["a", "b"], "y": range(6)} ) # Basic functionality. assert len(data.coords["x"]) == 2 - # In the future, this will break. with pytest.warns(DeprecationWarning): ds1 = data.drop(["a"], dim="x") - ds2 = data.drop(x="a") - ds3 = data.drop(x=["a"]) - ds4 = data.drop(x=["a", "b"]) - ds5 = data.drop(x=["a", "b"], y=range(0, 6, 2)) + ds2 = data.drop_sel(x="a") + ds3 = data.drop_sel(x=["a"]) + ds4 = data.drop_sel(x=["a", "b"]) + ds5 = data.drop_sel(x=["a", "b"], y=range(0, 6, 2)) - # In the future, this will result in different behavior. arr = DataArray(range(3), dims=["c"]) with pytest.warns(FutureWarning): data.drop(arr.coords) @@ -2187,10 +2207,11 @@ def test_drop_labels_by_keyword(self): # Error handling if user tries both approaches. 
with pytest.raises(ValueError): data.drop(labels=["a"], x="a") - with pytest.raises(ValueError): - data.drop(dim="x", x="a") with pytest.raises(ValueError): data.drop(labels=["a"], dim="x", x="a") + warnings.filterwarnings("ignore", r"\W*drop") + with pytest.raises(ValueError): + data.drop(dim="x", x="a") def test_drop_dims(self): data = xr.Dataset( @@ -2203,15 +2224,15 @@ def test_drop_dims(self): ) actual = data.drop_dims("x") - expected = data.drop(["A", "B", "x"]) + expected = data.drop_vars(["A", "B", "x"]) assert_identical(expected, actual) actual = data.drop_dims("y") - expected = data.drop("A") + expected = data.drop_vars("A") assert_identical(expected, actual) actual = data.drop_dims(["x", "y"]) - expected = data.drop(["A", "B", "x"]) + expected = data.drop_vars(["A", "B", "x"]) assert_identical(expected, actual) with pytest.raises((ValueError, KeyError)): @@ -2230,7 +2251,7 @@ def test_drop_dims(self): actual = data.drop_dims("z", errors="wrong_value") actual = data.drop_dims(["x", "y", "z"], errors="ignore") - expected = data.drop(["A", "B", "x"]) + expected = data.drop_vars(["A", "B", "x"]) assert_identical(expected, actual) def test_copy(self): @@ -2571,7 +2592,7 @@ def test_expand_dims_mixed_int_and_coords(self): original["x"].values * np.ones([4, 3, 3]), coords=dict(d=range(4), e=["l", "m", "n"], a=np.linspace(0, 1, 3)), dims=["d", "e", "a"], - ).drop("d"), + ).drop_vars("d"), "y": xr.DataArray( original["y"].values * np.ones([4, 3, 4, 3]), coords=dict( @@ -2581,7 +2602,7 @@ def test_expand_dims_mixed_int_and_coords(self): a=np.linspace(0, 1, 3), ), dims=["d", "e", "b", "a"], - ).drop("d"), + ).drop_vars("d"), }, coords={"c": np.linspace(0, 1, 5)}, ) @@ -3059,7 +3080,7 @@ def test_setitem_with_coords(self): np.arange(10), dims="dim3", coords={"numbers": ("dim3", np.arange(10))} ) expected = ds.copy() - expected["var3"] = other.drop("numbers") + expected["var3"] = other.drop_vars("numbers") actual = ds.copy() actual["var3"] = other assert_identical(expected, actual) @@ -4504,7 +4525,9 @@ def test_apply(self): actual = data.apply(lambda x: x.mean(keep_attrs=True), keep_attrs=True) assert_identical(expected, actual) - assert_identical(data.apply(lambda x: x, keep_attrs=True), data.drop("time")) + assert_identical( + data.apply(lambda x: x, keep_attrs=True), data.drop_vars("time") + ) def scale(x, multiple=1): return multiple * x @@ -4514,7 +4537,7 @@ def scale(x, multiple=1): assert_identical(actual["numbers"], data["numbers"]) actual = data.apply(np.asarray) - expected = data.drop("time") # time is not used on a data var + expected = data.drop_vars("time") # time is not used on a data var assert_equal(expected, actual) def make_example_math_dataset(self): @@ -4616,7 +4639,7 @@ def test_dataset_math_auto_align(self): assert_identical(expected, actual) actual = ds.isel(y=slice(1)) + ds.isel(y=slice(1, None)) - expected = 2 * ds.drop(ds.y, dim="y") + expected = 2 * ds.drop_sel(y=ds.y) assert_equal(actual, expected) actual = ds + ds[["bar"]] diff --git a/xarray/tests/test_duck_array_ops.py b/xarray/tests/test_duck_array_ops.py index 9df2f167cf2..f678af2fec5 100644 --- a/xarray/tests/test_duck_array_ops.py +++ b/xarray/tests/test_duck_array_ops.py @@ -441,7 +441,8 @@ def test_argmin_max(dim_num, dtype, contains_nan, dask, func, skipna, aggdim): ) expected = getattr(da, func)(dim=aggdim, skipna=skipna) assert_allclose( - actual.drop(list(actual.coords)), expected.drop(list(expected.coords)) + actual.drop_vars(list(actual.coords)), + expected.drop_vars(list(expected.coords)), ) diff 
--git a/xarray/tests/test_interp.py b/xarray/tests/test_interp.py index b9dc9a71acc..b93325d7eab 100644 --- a/xarray/tests/test_interp.py +++ b/xarray/tests/test_interp.py @@ -553,7 +553,7 @@ def test_datetime_single_string(): actual = da.interp(time="2000-01-01T12:00") expected = xr.DataArray(0.5) - assert_allclose(actual.drop("time"), expected) + assert_allclose(actual.drop_vars("time"), expected) @requires_cftime diff --git a/xarray/tests/test_plot.py b/xarray/tests/test_plot.py index 7deabd46eae..6e283ea01da 100644 --- a/xarray/tests/test_plot.py +++ b/xarray/tests/test_plot.py @@ -1837,7 +1837,11 @@ def test_default_labels(self): assert substring_in_axes(self.darray.name, ax) def test_test_empty_cell(self): - g = self.darray.isel(row=1).drop("row").plot(col="col", hue="hue", col_wrap=2) + g = ( + self.darray.isel(row=1) + .drop_vars("row") + .plot(col="col", hue="hue", col_wrap=2) + ) bottomright = g.axes[-1, -1] assert not bottomright.has_data() assert not bottomright.get_visible() diff --git a/xarray/tests/test_units.py b/xarray/tests/test_units.py index 9d14104bb50..80063f8b4bc 100644 --- a/xarray/tests/test_units.py +++ b/xarray/tests/test_units.py @@ -1093,7 +1093,7 @@ def test_content_manipulation(self, func, dtype): "func", ( pytest.param( - method("drop", labels=np.array([1, 5]), dim="x"), + method("drop_sel", labels=dict(x=np.array([1, 5]))), marks=pytest.mark.xfail( reason="selecting using incompatible units does not raise" ), @@ -1128,9 +1128,9 @@ def test_content_manipulation_with_units(self, func, unit, error, dtype): expected = attach_units( func(strip_units(data_array), **stripped_kwargs), - {"data": quantity.units if func.name == "drop" else unit, "x": x.units}, + {"data": quantity.units if func.name == "drop_sel" else unit, "x": x.units}, ) - if error is not None and func.name == "drop": + if error is not None and func.name == "drop_sel": with pytest.raises(error): func(data_array, **kwargs) else:
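Taken together with the equality-check optimization in the first patch, concatenating identical dask-backed inputs no longer computes anything -- the behaviour the test_concat_loads_variables additions above assert via kernel call counts. A hedged end-to-end sketch, assuming dask is installed and xarray includes this patch series:

    import dask.array as da
    import xarray as xr

    ds = xr.Dataset(
        {"d": ("x", da.zeros(10, chunks=2))},
        coords={"c": ("x", da.arange(10, chunks=2))},
    )

    # All three inputs wrap the same dask graphs, so every compat check
    # resolves on dask names alone and no chunk is computed or loaded.
    out = xr.concat([ds, ds, ds], dim="n", data_vars="different", coords="different")

    assert isinstance(out["d"].data, da.Array)  # still lazy
    assert isinstance(out["c"].data, da.Array)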