-
-
Notifications
You must be signed in to change notification settings - Fork 1.1k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Add GroupBy.shuffle_to_chunks()
#9320
Changes from all commits
3bc51bd
60d7619
d1429cd
31fc00e
4583853
abd9dd2
6b820aa
0d70656
fafb937
939db9a
a08450e
d0cd218
4edc976
0b42be4
c52734d
8180625
7897c91
7773548
51a7723
cc95513
18f4a40
f489bcf
ead1bb4
75115d0
390863a
a408cb0
7038f37
05a0fb4
b8e7f62
6d9ed1c
20a8cd9
7a99c8f
5e2fdfb
a22c7ed
2d48690
0679d2b
63b3e77
7dc5dd1
bad0744
91e4bd8
0542944
1e4f805
ad502aa
4b0c143
2b2c4ab
f624c8f
888e780
47e5c17
b100fb1
978fad9
d1a3fc1
23b0cac
d533638
d467bc6
231533c
c77d7c5
bccacfe
2d4392a
003e9f2
0f80c81
88bef5d
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -57,7 +57,13 @@ | |
|
||
from xarray.core.dataarray import DataArray | ||
from xarray.core.dataset import Dataset | ||
from xarray.core.types import GroupIndex, GroupIndices, GroupInput, GroupKey | ||
from xarray.core.types import ( | ||
GroupIndex, | ||
GroupIndices, | ||
GroupInput, | ||
GroupKey, | ||
T_Chunks, | ||
) | ||
from xarray.core.utils import Frozen | ||
from xarray.groupers import EncodedGroups, Grouper | ||
|
||
|
@@ -676,6 +682,76 @@ def sizes(self) -> Mapping[Hashable, int]: | |
self._sizes = self._obj.isel({self._group_dim: index}).sizes | ||
return self._sizes | ||
|
||
def shuffle_to_chunks(self, chunks: T_Chunks = None) -> T_Xarray: | ||
""" | ||
Sort or "shuffle" the underlying object. | ||
|
||
"Shuffle" means the object is sorted so that all group members occur sequentially, | ||
in the same chunk. Multiple groups may occur in the same chunk. | ||
This method is particularly useful for chunked arrays (e.g. dask, cubed), | ||
especially when you need to map a function that requires all members of a group | ||
to be present in a single chunk. For chunked array types, the order of appearance | ||
is not guaranteed, but will depend on the input chunking. | ||
Comment on lines
+689
to
+694
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Is a single group limited to a single chunk? Assuming so, if we get one giant chunk, could that present any performance problems? There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. The other chunks get "auto" reshaped. This is controlled by `dask.array.shuffle`; see https://docs.dask.org/en/latest/generated/dask.array.shuffle.html |
||
|
||
Parameters | ||
---------- | ||
chunks : int, tuple of int, "auto" or mapping of hashable to int or tuple of int, optional | ||
How to adjust chunks along dimensions not present in the array being grouped by. | ||
|
||
Returns | ||
------- | ||
DataArrayGroupBy or DatasetGroupBy | ||
|
||
Examples | ||
-------- | ||
>>> import dask.array | ||
>>> da = xr.DataArray( | ||
... dims="x", | ||
... data=dask.array.arange(10, chunks=3), | ||
... coords={"x": [1, 2, 3, 1, 2, 3, 1, 2, 3, 0]}, | ||
... name="a", | ||
... ) | ||
>>> shuffled = da.groupby("x").shuffle_to_chunks() | ||
>>> shuffled | ||
<xarray.DataArray 'a' (x: 10)> Size: 80B | ||
dask.array<shuffle, shape=(10,), dtype=int64, chunksize=(3,), chunktype=numpy.ndarray> | ||
Coordinates: | ||
* x (x) int64 80B 0 1 1 1 2 2 2 3 3 3 | ||
|
||
>>> shuffled.groupby("x").quantile(q=0.5).compute() | ||
<xarray.DataArray 'a' (x: 4)> Size: 32B | ||
array([9., 3., 4., 5.]) | ||
Coordinates: | ||
quantile float64 8B 0.5 | ||
* x (x) int64 32B 0 1 2 3 | ||
|
||
See Also | ||
-------- | ||
dask.dataframe.DataFrame.shuffle | ||
dask.array.shuffle | ||
""" | ||
self._raise_if_by_is_chunked() | ||
return self._shuffle_obj(chunks) | ||
|
||
def _shuffle_obj(self, chunks: T_Chunks) -> T_Xarray: | ||
from xarray.core.dataarray import DataArray | ||
|
||
was_array = isinstance(self._obj, DataArray) | ||
as_dataset = self._obj._to_temp_dataset() if was_array else self._obj | ||
|
||
for grouper in self.groupers: | ||
if grouper.name not in as_dataset._variables: | ||
as_dataset.coords[grouper.name] = grouper.group | ||
|
||
shuffled = as_dataset._shuffle( | ||
dim=self._group_dim, indices=self.encoded.group_indices, chunks=chunks | ||
) | ||
unstacked: Dataset = self._maybe_unstack(shuffled) | ||
if was_array: | ||
return self._obj._from_temp_dataset(unstacked) | ||
else: | ||
return unstacked # type: ignore[return-value] | ||
|
||
def map( | ||
self, | ||
func: Callable, | ||
|
@@ -896,7 +972,9 @@ def _maybe_unstack(self, obj): | |
# and `inserted_dims` | ||
# if multiple groupers all share the same single dimension, then | ||
# we don't stack/unstack. Do that manually now. | ||
obj = obj.unstack(*self.encoded.unique_coord.dims) | ||
dims_to_unstack = self.encoded.unique_coord.dims | ||
if all(dim in obj.dims for dim in dims_to_unstack): | ||
obj = obj.unstack(*dims_to_unstack) | ||
to_drop = [ | ||
grouper.name | ||
for grouper in self.groupers | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
IIUC, chunks will then be different sizes from each other. So when writing to Zarr we'll need to re-chunk? (asking for my clarification, feel free to not respond if it's obvious / respond with a single word :) )
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Yes but that's a zarr limitation :)