From 562235dfee4c2e62ad1b4c0c3a6a9f90b30914e5 Mon Sep 17 00:00:00 2001
From: jbrockmendel
Date: Mon, 10 May 2021 16:31:58 -0700
Subject: [PATCH] CLN: groupby assorted (#41379)

---
 pandas/core/groupby/generic.py       | 29 +++++++++++++++++------------
 pandas/core/groupby/ops.py           |  6 +++---
 pandas/tests/groupby/test_groupby.py | 23 ++++++++++++-----------
 3 files changed, 32 insertions(+), 26 deletions(-)

diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py
index 474fc2f456753..c5d9144893f48 100644
--- a/pandas/core/groupby/generic.py
+++ b/pandas/core/groupby/generic.py
@@ -89,7 +89,6 @@
     MultiIndex,
     all_indexes_same,
 )
-import pandas.core.indexes.base as ibase
 from pandas.core.series import Series
 from pandas.core.util.numba_ import maybe_use_numba
 
@@ -481,14 +480,13 @@ def _get_index() -> Index:
         if isinstance(values[0], dict):
             # GH #823 #24880
             index = _get_index()
-            result: FrameOrSeriesUnion = self._reindex_output(
-                self.obj._constructor_expanddim(values, index=index)
-            )
+            res_df = self.obj._constructor_expanddim(values, index=index)
+            res_df = self._reindex_output(res_df)
             # if self.observed is False,
             # keep all-NaN rows created while re-indexing
-            result = result.stack(dropna=self.observed)
-            result.name = self._selection_name
-            return result
+            res_ser = res_df.stack(dropna=self.observed)
+            res_ser.name = self._selection_name
+            return res_ser
         elif isinstance(values[0], (Series, DataFrame)):
             return self._concat_objects(keys, values, not_indexed_same=not_indexed_same)
         else:
@@ -1019,13 +1017,18 @@ def aggregate(self, func=None, *args, engine=None, engine_kwargs=None, **kwargs)
 
             # grouper specific aggregations
             if self.grouper.nkeys > 1:
+                # test_groupby_as_index_series_scalar gets here with 'not self.as_index'
                 return self._python_agg_general(func, *args, **kwargs)
             elif args or kwargs:
+                # test_pass_args_kwargs gets here (with and without as_index)
+                # can't return early
                 result = self._aggregate_frame(func, *args, **kwargs)
 
             elif self.axis == 1:
                 # _aggregate_multiple_funcs does not allow self.axis == 1
+                # Note: axis == 1 precludes 'not self.as_index', see __init__
                 result = self._aggregate_frame(func)
+                return result
 
             else:
 
@@ -1055,7 +1058,7 @@ def aggregate(self, func=None, *args, engine=None, engine_kwargs=None, **kwargs)
 
         if not self.as_index:
             self._insert_inaxis_grouper_inplace(result)
-            result.index = np.arange(len(result))
+            result.index = Index(range(len(result)))
 
         return result._convert(datetime=True)
 
@@ -1181,7 +1184,9 @@ def _wrap_applied_output(self, data, keys, values, not_indexed_same=False):
             if self.as_index:
                 return self.obj._constructor_sliced(values, index=key_index)
             else:
-                result = DataFrame(values, index=key_index, columns=[self._selection])
+                result = self.obj._constructor(
+                    values, index=key_index, columns=[self._selection]
+                )
                 self._insert_inaxis_grouper_inplace(result)
                 return result
         else:
@@ -1664,8 +1669,8 @@ def _wrap_transformed_output(
 
     def _wrap_agged_manager(self, mgr: Manager2D) -> DataFrame:
         if not self.as_index:
-            index = np.arange(mgr.shape[1])
-            mgr.set_axis(1, ibase.Index(index))
+            index = Index(range(mgr.shape[1]))
+            mgr.set_axis(1, index)
             result = self.obj._constructor(mgr)
 
             self._insert_inaxis_grouper_inplace(result)
@@ -1793,7 +1798,7 @@ def nunique(self, dropna: bool = True) -> DataFrame:
         results.columns.names = obj.columns.names  # TODO: do at higher level?
 
         if not self.as_index:
-            results.index = ibase.default_index(len(results))
+            results.index = Index(range(len(results)))
             self._insert_inaxis_grouper_inplace(results)
 
         return results
diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py
index 46b47bc29d8a6..3045451974ee7 100644
--- a/pandas/core/groupby/ops.py
+++ b/pandas/core/groupby/ops.py
@@ -889,9 +889,8 @@ def codes_info(self) -> np.ndarray:
 
     @final
     def _get_compressed_codes(self) -> tuple[np.ndarray, np.ndarray]:
-        all_codes = self.codes
-        if len(all_codes) > 1:
-            group_index = get_group_index(all_codes, self.shape, sort=True, xnull=True)
+        if len(self.groupings) > 1:
+            group_index = get_group_index(self.codes, self.shape, sort=True, xnull=True)
             return compress_group_index(group_index, sort=self.sort)
 
         ping = self.groupings[0]
@@ -1111,6 +1110,7 @@ def groups(self):
 
     @property
     def nkeys(self) -> int:
+        # still matches len(self.groupings), but we can hard-code
         return 1
 
     def _get_grouper(self):
diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py
index e8c60063d7684..4368e57a7da4d 100644
--- a/pandas/tests/groupby/test_groupby.py
+++ b/pandas/tests/groupby/test_groupby.py
@@ -234,17 +234,18 @@ def f(x, q=None, axis=0):
     tm.assert_series_equal(trans_result, trans_expected)
 
     # DataFrame
-    df_grouped = tsframe.groupby(lambda x: x.month)
-    agg_result = df_grouped.agg(np.percentile, 80, axis=0)
-    apply_result = df_grouped.apply(DataFrame.quantile, 0.8)
-    expected = df_grouped.quantile(0.8)
-    tm.assert_frame_equal(apply_result, expected, check_names=False)
-    tm.assert_frame_equal(agg_result, expected)
-
-    agg_result = df_grouped.agg(f, q=80)
-    apply_result = df_grouped.apply(DataFrame.quantile, q=0.8)
-    tm.assert_frame_equal(agg_result, expected)
-    tm.assert_frame_equal(apply_result, expected, check_names=False)
+    for as_index in [True, False]:
+        df_grouped = tsframe.groupby(lambda x: x.month, as_index=as_index)
+        agg_result = df_grouped.agg(np.percentile, 80, axis=0)
+        apply_result = df_grouped.apply(DataFrame.quantile, 0.8)
+        expected = df_grouped.quantile(0.8)
+        tm.assert_frame_equal(apply_result, expected, check_names=False)
+        tm.assert_frame_equal(agg_result, expected)
+
+        agg_result = df_grouped.agg(f, q=80)
+        apply_result = df_grouped.apply(DataFrame.quantile, q=0.8)
+        tm.assert_frame_equal(agg_result, expected)
+        tm.assert_frame_equal(apply_result, expected, check_names=False)
 
 
 def test_len():
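
Aside, not part of the patch: a minimal sketch of why the diff swaps np.arange(...) and the private ibase.default_index(...) for Index(range(...)) when building the default integer index. It assumes pandas 1.x behavior, where the public Index constructor turns a range into a lazy RangeIndex but turns an ndarray into a materialized Int64Index; the variable names below are illustrative only.

import numpy as np
import pandas as pd

n = 4

# Passing a range to the public Index constructor yields a lazy RangeIndex,
# the same type the removed private helper ibase.default_index(n) produced.
idx_from_range = pd.Index(range(n))
print(type(idx_from_range).__name__)   # RangeIndex

# Passing a materialized ndarray yields an integer index backed by real data.
idx_from_arange = pd.Index(np.arange(n))
print(type(idx_from_arange).__name__)  # Int64Index on pandas 1.x

# The two compare equal element-wise, so callers see no value-level change.
print(idx_from_range.equals(idx_from_arange))  # True

Using the public constructor keeps the memory-light RangeIndex that default_index returned while dropping the pandas.core.indexes.base import.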