Speed up Bucket get_min and get_max #368

Merged · 28 commits · May 6, 2022

Changes from 3 commits. Full commit list (28):
c9f90a1
speed up get_min/get_max using ufunc.reduceat
zxdawn Jun 5, 2021
e467a33
fix name bug
zxdawn Jun 5, 2021
569951f
fix wrong index and bins
zxdawn Jun 6, 2021
7b5cf79
use argmax and manual chunksize
zxdawn Jun 15, 2021
ad75b93
change chunk_size
zxdawn Jun 15, 2021
4c39f6b
use dask from_delayed
zxdawn Jun 16, 2021
fa2de01
move into delay
zxdawn Jun 18, 2021
2273dda
delete useless package
zxdawn Jun 18, 2021
815ef39
use numpy function: np.digitize and np.unique
zxdawn Jun 21, 2021
c9dbd55
resolve conflict
zxdawn Feb 10, 2022
e727919
resolve stickler-ci
zxdawn Feb 10, 2022
b98efa1
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Feb 10, 2022
81259de
resolve flake8
zxdawn Feb 10, 2022
56b8cdd
Test that get_min, get_max have no computes
gerritholl May 2, 2022
b85f4e2
Move delayed function outside of class
zxdawn May 2, 2022
5640a15
Simply missing data handling in get-max/min
gerritholl May 2, 2022
a3e9126
Remove outdated note in bucket resampler
gerritholl May 4, 2022
f7cfc5e
Apply suggestions from code review to adapt _sort_weights
gerritholl May 5, 2022
1aa2779
Small refactoring in bucket resampler
gerritholl May 5, 2022
c2b4596
Use dtype int64 directly in bucket resampler
gerritholl May 6, 2022
1b21364
Use direct dtype in bucket resampler
gerritholl May 6, 2022
57fa302
Simplify get_max from two to one dask.delayed call
gerritholl May 6, 2022
fb9aa56
PEP8 fixes
gerritholl May 6, 2022
4ee4ece
Slight speedup by reshaping inside dask.delayed
gerritholl May 6, 2022
a0bd7f5
PEP8 fixes
gerritholl May 6, 2022
c102262
Remove redundant reshape, remove breakpoint
gerritholl May 6, 2022
eb0227e
isort fixes
gerritholl May 6, 2022
8106676
Remove dead code
gerritholl May 6, 2022
82 changes: 51 additions & 31 deletions pyresample/bucket/__init__.py
@@ -190,11 +190,8 @@ def _mask_bins_with_nan_if_not_skipna(self, skipna, data, out_size, statistic):
         statistic = da.where(nan_bins > 0, np.nan, statistic)
         return statistic

-    def _call_pandas_groupby_statistics(self, scipy_method, data, fill_value=None, skipna=None):
+    def _call_bin_statistic(self, statistic_method, data, fill_value=None, skipna=None):
         """Calculate statistics (min/max) for each bin with drop-in-a-bucket resampling."""
-        import dask.dataframe as dd
-        import pandas as pd
-
         if isinstance(data, xr.DataArray):
             data = data.data
         data = data.ravel()
@@ -209,27 +206,50 @@ def _call_pandas_groupby_statistics(self, scipy_method, data, fill_value=None, skipna=None):
         # Calculate the min of the data falling to each bin
         out_size = self.target_area.size

-        # merge into one Dataframe
-        df = dd.concat([dd.from_dask_array(self.idxs), dd.from_dask_array(weights)],
-                       axis=1)
-        df.columns = ['x', 'values']
-
-        if scipy_method == 'min':
-            statistics = df.map_partitions(lambda part: part.groupby(
-                np.digitize(part.x,
-                            bins=np.linspace(0, out_size, out_size)
-                            )
-            )['values'].min())
-
-        elif scipy_method == 'max':
-            statistics = df.map_partitions(lambda part: part.groupby(
-                np.digitize(part.x,
-                            bins=np.linspace(0, out_size, out_size)
-                            )
-            )['values'].max())
-
-        # fill missed index
-        statistics = (statistics + pd.Series(np.zeros(out_size))).fillna(0)
+        def numpy_reduceat(data, bins, statistic_method):
+            '''Calculate the bin_statistic using numpy.ufunc.reduceat'''
+            if statistic_method == 'min':
+                return np.minimum.reduceat(data, bins)
+            elif statistic_method == 'max':
+                return np.maximum.reduceat(data, bins)
+
+        # create the output bins
+        bins = da.linspace(0, out_size-1, out_size).astype('int')
+
+        # get the indices of the bins to which each value in self.idxs belongs
+        slices = da.digitize(self.idxs, bins)
+
+        # convert to DataArray using idxs as coords
+        weights = xr.DataArray(weights, dims=['x'])
+        slices = xr.DataArray(slices, dims=['x'])
+
+        # set out of range value to nan
+        mask = xr.DataArray((self.idxs >= bins.min()) & (self.idxs <= bins.max()), dims=['x'])
+        weights = weights.where(mask, drop=True)
+        slices = slices.where(mask, drop=True)
+
+        # sort the slices
+        sort_index = da.map_blocks(np.argsort, slices.data)
+        slices = slices[sort_index]
+        weights = weights[sort_index]
+
+        # get the unique slices (for assignment later) and bins (for numpy_reduceat)
+        unique_slices, unique_bins = da.unique(slices.data, return_index=True)
+        statistics_sub = xr.apply_ufunc(numpy_reduceat,
+                                        weights,
+                                        unique_bins.compute_chunk_sizes(),
+                                        kwargs={'statistic_method': statistic_method},
+                                        input_core_dims=[['x'], ['new_x']],
+                                        exclude_dims=set(('x',)),
+                                        output_core_dims=[['new_x'], ],
+                                        dask="parallelized",
+                                        output_dtypes=[weights.dtype],
+                                        dask_gufunc_kwargs={'allow_rechunk': True},
+                                        )
+
+        # initialize the output DataArray with np.nan
+        statistics = xr.DataArray(da.from_array(np.full((out_size), np.nan)), dims=['x'])
+        # assign the binned statistics
+        statistics.loc[unique_slices.astype('int')-1] = statistics_sub

         counts = self.get_sum(np.logical_not(np.isnan(data)).astype(int)).ravel()

@@ -244,9 +264,9 @@ def _call_pandas_groupby_statistics(self, scipy_method, data, fill_value=None, skipna=None):
     def get_min(self, data, fill_value=np.nan, skipna=True):
        """Calculate minimums for each bin with drop-in-a-bucket resampling.

-        .. warning::
+        .. note::

-            The slow :meth:`pandas.DataFrame.groupby` method is temporarily used here,
+            The :meth:`numpy.ufunc.reduceat` method is used here,
             as the `dask_groupby <https://github.com/dcherian/dask_groupby>`_ is still under development.

         Parameters

@@ -267,14 +287,14 @@ def get_min(self, data, fill_value=np.nan, skipna=True):
             Bin-wise minimums in the target grid
         """
         LOG.info("Get min of values in each location")
-        return self._call_pandas_groupby_statistics('min', data, fill_value, skipna)
+        return self._call_bin_statistic('min', data, fill_value, skipna)

     def get_max(self, data, fill_value=np.nan, skipna=True):
         """Calculate maximums for each bin with drop-in-a-bucket resampling.

-        .. warning::
+        .. note::

-            The slow :meth:`pandas.DataFrame.groupby` method is temporarily used here,
+            The :meth:`numpy.ufunc.reduceat` method is temporarily used here,
             as the `dask_groupby <https://github.com/dcherian/dask_groupby>`_ is still under development.

         Parameters

@@ -295,7 +315,7 @@ def get_max(self, data, fill_value=np.nan, skipna=True):
             Bin-wise maximums in the target grid
         """
         LOG.info("Get max of values in each location")
-        return self._call_pandas_groupby_statistics('max', data, fill_value, skipna)
+        return self._call_bin_statistic('max', data, fill_value, skipna)

     def get_count(self):
         """Count the number of occurrences for each bin using drop-in-a-bucket
Expand Down