Merge branch 'main' into groupby-shuffle

* main: Accessibility: Add keyboard handling for XArray HTML view (pydata#9412) [pre-commit.ci] pre-commit autoupdate (pydata#9316) [skip-ci] Speed up docs build by limiting toctrees (pydata#9395) fix the failing `pre-commit.ci` runs (pydata#9411) Update benchmarks.yml (pydata#9406) GroupBy(multiple groupers) (pydata#9372) Encode/decode property tests use variables() (pydata#9401)
dcherian · Aug 30, 2024 · 20a8cd9 · 20a8cd9
2 parents 6d9ed1c + d33e4ad
commit 20a8cd9
Show file tree

Hide file tree

Showing 18 changed files with 485 additions and 165 deletions.
diff --git a/.github/workflows/benchmarks.yml b/.github/workflows/benchmarks.yml
@@ -34,7 +34,7 @@ jobs:
           # add "python-build" because of https://github.com/airspeed-velocity/asv/issues/1385
           create-args: >-
             asv
-            build
+            python-build
             mamba
 
 

diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
@@ -13,24 +13,24 @@ repos:
       - id: mixed-line-ending
   - repo: https://github.com/astral-sh/ruff-pre-commit
     # Ruff version.
-    rev: 'v0.5.0'
+    rev: 'v0.6.2'
     hooks:
       - id: ruff
         args: ["--fix", "--show-fixes"]
   # https://github.com/python/black#version-control-integration
   - repo: https://github.com/psf/black-pre-commit-mirror
-    rev: 24.4.2
+    rev: 24.8.0
     hooks:
       - id: black-jupyter
   - repo: https://github.com/keewis/blackdoc
     rev: v0.3.9
     hooks:
       - id: blackdoc
         exclude: "generate_aggregations.py"
-        additional_dependencies: ["black==24.4.2"]
+        additional_dependencies: ["black==24.8.0"]
       - id: blackdoc-autoupdate-black
   - repo: https://github.com/pre-commit/mirrors-mypy
-    rev: v1.10.1
+    rev: v1.11.2
     hooks:
       - id: mypy
         # Copied from setup.cfg
@@ -41,7 +41,7 @@ repos:
         additional_dependencies: [
             # Type stubs
             types-python-dateutil,
-            types-pkg_resources,
+            types-setuptools,
             types-PyYAML,
             types-pytz,
             typing-extensions>=4.1.0,

diff --git a/.readthedocs.yaml b/.readthedocs.yaml
@@ -1,9 +1,9 @@
 version: 2
 
 build:
-  os: ubuntu-22.04
+  os: ubuntu-lts-latest
   tools:
-    python: mambaforge-4.10
+    python: mambaforge-latest
   jobs:
     post_checkout:
       - (git --no-pager log --pretty="tformat:%s" -1 | grep -vqF "[skip-rtd]") || exit 183

diff --git a/ci/requirements/doc.yml b/ci/requirements/doc.yml
@@ -4,7 +4,7 @@ channels:
   - conda-forge
   - nodefaults
 dependencies:
-  - python=3.10
+  - python=3.12
   - bottleneck
   - cartopy
   - cfgrib
@@ -40,6 +40,7 @@ dependencies:
   - sphinx-design
   - sphinx-inline-tabs
   - sphinx>=5.0
+  - sphinx-remove-toctrees
   - sphinxext-opengraph
   - sphinxext-rediraffe
   - zarr>=2.10

diff --git a/doc/conf.py b/doc/conf.py
@@ -88,6 +88,7 @@
     "sphinxext.rediraffe",
     "sphinx_design",
     "sphinx_inline_tabs",
+    "sphinx_remove_toctrees",
 ]
 
 
@@ -198,6 +199,8 @@
 # The master toctree document.
 master_doc = "index"
 
+remove_from_toctrees = ["generated/*"]
+
 # General information about the project.
 project = "xarray"
 copyright = f"2014-{datetime.datetime.now().year}, xarray Developers"
@@ -244,6 +247,7 @@
     repository_url="https://github.com/pydata/xarray",
     repository_branch="main",
     navigation_with_keys=False,  # pydata/pydata-sphinx-theme#1492
+    navigation_depth=4,
     path_to_docs="doc",
     use_edit_page_button=True,
     use_repository_button=True,

diff --git a/doc/user-guide/groupby.rst b/doc/user-guide/groupby.rst
@@ -81,8 +81,7 @@ You can index out a particular group:
 
     ds.groupby("letters")["b"]
 
-Just like in pandas, creating a GroupBy object is cheap: it does not actually
-split the data until you access particular values.
+To group by multiple variables, see :ref:`this section <groupby.multiple>`.
 
 Binning
 ~~~~~~~
@@ -180,19 +179,6 @@ This last line is roughly equivalent to the following::
         results.append(group - alt.sel(letters=label))
     xr.concat(results, dim='x')
 
-Iterating and Squeezing
-~~~~~~~~~~~~~~~~~~~~~~~
-
-Previously, Xarray defaulted to squeezing out dimensions of size one when iterating over
-a GroupBy object. This behaviour is being removed.
-You can always squeeze explicitly later with the Dataset or DataArray
-:py:meth:`DataArray.squeeze` methods.
-
-.. ipython:: python
-
-    next(iter(arr.groupby("x", squeeze=False)))
-
-
 .. _groupby.multidim:
 
 Multidimensional Grouping
@@ -236,6 +222,8 @@ applying your function, and then unstacking the result:
     stacked = da.stack(gridcell=["ny", "nx"])
     stacked.groupby("gridcell").sum(...).unstack("gridcell")
 
+Alternatively, you can group by both ``lat`` and ``lon`` at the :ref:`same time <groupby.multiple>`.
+
 .. _groupby.groupers:
 
 Grouper Objects
@@ -276,7 +264,8 @@ is identical to
 
     ds.groupby(x=UniqueGrouper())
 
-and
+
+Similarly,
 
 .. code-block:: python
 
@@ -303,3 +292,26 @@ is identical to
     from xarray.groupers import TimeResampler
 
     ds.resample(time=TimeResampler("ME"))
+
+
+.. _groupby.multiple:
+
+Grouping by multiple variables
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+Use grouper objects to group by multiple dimensions:
+
+.. ipython:: python
+
+    from xarray.groupers import UniqueGrouper
+
+    da.groupby(lat=UniqueGrouper(), lon=UniqueGrouper()).sum()
+
+
+Different groupers can be combined to construct sophisticated GroupBy operations.
+
+.. ipython:: python
+
+    from xarray.groupers import BinGrouper
+
+    ds.groupby(x=BinGrouper(bins=[5, 15, 25]), letters=UniqueGrouper()).sum()
diff --git a/doc/whats-new.rst b/doc/whats-new.rst
@@ -24,6 +24,11 @@ New Features
 ~~~~~~~~~~~~
 - Make chunk manager an option in ``set_options`` (:pull:`9362`).
   By `Tom White <https://github.com/tomwhite>`_.
+- Support for :ref:`grouping by multiple variables <groupby.multiple>`.
+  This is quite new, so please check your results and report bugs.
+  Binary operations after grouping by multiple arrays are not supported yet.
+  (:issue:`1056`, :issue:`9332`, :issue:`324`, :pull:`9372`).
+  By `Deepak Cherian <https://github.com/dcherian>`_.
 - Allow data variable specific ``constant_values`` in the dataset ``pad`` function (:pull:`9353`).
   By `Tiago Sanona <https://github.com/tsanona>`_.
 

diff --git a/properties/test_encode_decode.py b/properties/test_encode_decode.py
@@ -11,42 +11,35 @@
 # isort: split
 
 import hypothesis.extra.numpy as npst
-import hypothesis.strategies as st
+import numpy as np
 from hypothesis import given
 
 import xarray as xr
-
-an_array = npst.arrays(
-    dtype=st.one_of(
-        npst.unsigned_integer_dtypes(), npst.integer_dtypes(), npst.floating_dtypes()
-    ),
-    shape=npst.array_shapes(max_side=3),  # max_side specified for performance
-)
+from xarray.testing.strategies import variables
 
 
 @pytest.mark.slow
-@given(st.data(), an_array)
-def test_CFMask_coder_roundtrip(data, arr) -> None:
-    names = data.draw(
-        st.lists(st.text(), min_size=arr.ndim, max_size=arr.ndim, unique=True).map(
-            tuple
-        )
-    )
-    original = xr.Variable(names, arr)
+@given(original=variables())
+def test_CFMask_coder_roundtrip(original) -> None:
     coder = xr.coding.variables.CFMaskCoder()
     roundtripped = coder.decode(coder.encode(original))
     xr.testing.assert_identical(original, roundtripped)
 
 
+@pytest.mark.xfail
+@pytest.mark.slow
+@given(var=variables(dtype=npst.floating_dtypes()))
+def test_CFMask_coder_decode(var) -> None:
+    var[0] = -99
+    var.attrs["_FillValue"] = -99
+    coder = xr.coding.variables.CFMaskCoder()
+    decoded = coder.decode(var)
+    assert np.isnan(decoded[0])
+
+
 @pytest.mark.slow
-@given(st.data(), an_array)
-def test_CFScaleOffset_coder_roundtrip(data, arr) -> None:
-    names = data.draw(
-        st.lists(st.text(), min_size=arr.ndim, max_size=arr.ndim, unique=True).map(
-            tuple
-        )
-    )
-    original = xr.Variable(names, arr)
+@given(original=variables())
+def test_CFScaleOffset_coder_roundtrip(original) -> None:
     coder = xr.coding.variables.CFScaleOffsetCoder()
     roundtripped = coder.decode(coder.encode(original))
     xr.testing.assert_identical(original, roundtripped)
diff --git a/xarray/coding/cftime_offsets.py b/xarray/coding/cftime_offsets.py
@@ -145,7 +145,7 @@ def __sub__(self, other):
 
         if isinstance(other, cftime.datetime):
             raise TypeError("Cannot subtract a cftime.datetime from a time offset.")
-        elif type(other) == type(self):
+        elif type(other) is type(self):
             return type(self)(self.n - other.n)
         else:
             return NotImplemented
@@ -165,7 +165,7 @@ def __radd__(self, other):
         return self.__add__(other)
 
     def __rsub__(self, other):
-        if isinstance(other, BaseCFTimeOffset) and type(self) != type(other):
+        if isinstance(other, BaseCFTimeOffset) and type(self) is not type(other):
             raise TypeError("Cannot subtract cftime offsets of differing types")
         return -self + other
 
@@ -462,7 +462,7 @@ def __sub__(self, other: Self) -> Self:
 
         if isinstance(other, cftime.datetime):
             raise TypeError("Cannot subtract cftime.datetime from offset.")
-        if type(other) == type(self) and other.month == self.month:
+        if type(other) is type(self) and other.month == self.month:
             return type(self)(self.n - other.n, month=self.month)
         return NotImplemented
 
@@ -548,7 +548,7 @@ def __sub__(self, other):
 
         if isinstance(other, cftime.datetime):
             raise TypeError("Cannot subtract cftime.datetime from offset.")
-        elif type(other) == type(self) and other.month == self.month:
+        elif type(other) is type(self) and other.month == self.month:
             return type(self)(self.n - other.n, month=self.month)
         else:
             return NotImplemented

diff --git a/xarray/core/dataarray.py b/xarray/core/dataarray.py
@@ -6801,27 +6801,22 @@ def groupby(
             groupers = either_dict_or_kwargs(group, groupers, "groupby")  # type: ignore
             group = None
 
-        grouper: Grouper
+        rgroupers: tuple[ResolvedGrouper, ...]
         if group is not None:
             if groupers:
                 raise ValueError(
                     "Providing a combination of `group` and **groupers is not supported."
                 )
-            grouper = UniqueGrouper()
+            rgroupers = (ResolvedGrouper(UniqueGrouper(), group, self),)
         else:
-            if len(groupers) > 1:
-                raise ValueError("grouping by multiple variables is not supported yet.")
             if not groupers:
                 raise ValueError("Either `group` or `**groupers` must be provided.")
-            group, grouper = next(iter(groupers.items()))
-
-        rgrouper = ResolvedGrouper(grouper, group, self)
+            rgroupers = tuple(
+                ResolvedGrouper(grouper, group, self)
+                for group, grouper in groupers.items()
+            )
 
-        return DataArrayGroupBy(
-            self,
-            (rgrouper,),
-            restore_coord_dims=restore_coord_dims,
-        )
+        return DataArrayGroupBy(self, rgroupers, restore_coord_dims=restore_coord_dims)
 
     @_deprecate_positional_args("v2024.07.0")
     def groupby_bins(

diff --git a/xarray/core/dataset.py b/xarray/core/dataset.py
@@ -10397,25 +10397,22 @@ def groupby(
             groupers = either_dict_or_kwargs(group, groupers, "groupby")  # type: ignore
             group = None
 
+        rgroupers: tuple[ResolvedGrouper, ...]
         if group is not None:
             if groupers:
                 raise ValueError(
                     "Providing a combination of `group` and **groupers is not supported."
                 )
-            rgrouper = ResolvedGrouper(UniqueGrouper(), group, self)
+            rgroupers = (ResolvedGrouper(UniqueGrouper(), group, self),)
         else:
-            if len(groupers) > 1:
-                raise ValueError("Grouping by multiple variables is not supported yet.")
-            elif not groupers:
+            if not groupers:
                 raise ValueError("Either `group` or `**groupers` must be provided.")
-            for group, grouper in groupers.items():
-                rgrouper = ResolvedGrouper(grouper, group, self)
+            rgroupers = tuple(
+                ResolvedGrouper(grouper, group, self)
+                for group, grouper in groupers.items()
+            )
 
-        return DatasetGroupBy(
-            self,
-            (rgrouper,),
-            restore_coord_dims=restore_coord_dims,
-        )
+        return DatasetGroupBy(self, rgroupers, restore_coord_dims=restore_coord_dims)
 
     @_deprecate_positional_args("v2024.07.0")
     def groupby_bins(