diff --git a/doc/source/whatsnew/v0.21.0.txt b/doc/source/whatsnew/v0.21.0.txt index 94b61d83335883..92d2ee5276e36c 100644 --- a/doc/source/whatsnew/v0.21.0.txt +++ b/doc/source/whatsnew/v0.21.0.txt @@ -627,6 +627,7 @@ Groupby/Resample/Rolling - Bug in ``.rolling(...).apply(...)`` with a ``DataFrame`` with a ``DatetimeIndex``, a ``window`` of a timedelta-convertible and ``min_periods >= 1` (:issue:`15305`) - Bug in ``DataFrame.groupby`` where index and column keys were not recognized correctly when the number of keys equaled the number of elements on the groupby axis (:issue:`16859`) - Bug in ``groupby.nunique()`` with ``TimeGrouper`` which cannot handle ``NaT`` correctly (:issue:`17575`) +- Bug in ``DataFrame.groupby`` where a single level selection from a ``MultiIndex`` unexpectedly sorts (:issue:`17537`) Sparse ^^^^^^ diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 6fd4f3eeb6b907..4f6fd0828693e1 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -6631,7 +6631,7 @@ def pct_change(self, periods=1, fill_method='pad', limit=None, freq=None, return rs def _agg_by_level(self, name, axis=0, level=0, skipna=True, **kwargs): - grouped = self.groupby(level=level, axis=axis) + grouped = self.groupby(level=level, axis=axis, sort=False) if hasattr(grouped, name) and skipna: return getattr(grouped, name)(**kwargs) axis = self._get_axis_number(axis) diff --git a/pandas/core/groupby.py b/pandas/core/groupby.py index a62ae40a85941f..2f2056279558d3 100644 --- a/pandas/core/groupby.py +++ b/pandas/core/groupby.py @@ -2586,10 +2586,27 @@ def _get_grouper(obj, key=None, axis=0, level=None, sort=True, """ group_axis = obj._get_axis(axis) - # validate that the passed level is compatible with the passed + # validate that the passed single level is compatible with the passed # axis of the object if level is not None: - if not isinstance(group_axis, MultiIndex): + # TODO: These if-block and else-block are almost same. + # MultiIndex instance check is removable, but it seems that there are + # some processes only for non-MultiIndex in else-block, + # eg. `obj.index.name != level`. We have to consider carefully whether + # these are applicable for MultiIndex. Even if these are applicable, + # we need to check if it makes no side effect to subsequent processes + # on the outside of this condition. + # (GH 17621) + if isinstance(group_axis, MultiIndex): + if is_list_like(level) and len(level) == 1: + level = level[0] + + if key is None and is_scalar(level): + # Get the level values from group_axis + key = group_axis.get_level_values(level) + level = None + + else: # allow level to be a length-one list-like object # (e.g., level=[0]) # GH 13901 @@ -2611,6 +2628,8 @@ def _get_grouper(obj, key=None, axis=0, level=None, sort=True, raise ValueError('level > 0 or level < -1 only valid with ' ' MultiIndex') + # NOTE: `group_axis` and `group_axis.get_level_values(level)` + # are same in this section. level = None key = group_axis diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index d91cff436dee2e..47bf837fa62d95 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -1791,18 +1791,20 @@ def aggfun(ser): agged2 = df.groupby(keys).aggregate(aggfun) assert len(agged2.columns) + 1 == len(df.columns) - def test_groupby_level(self): + @pytest.mark.parametrize('sort', [True, False]) + def test_groupby_level(self, sort): + # GH 17537 frame = self.mframe deleveled = frame.reset_index() - result0 = frame.groupby(level=0).sum() - result1 = frame.groupby(level=1).sum() + result0 = frame.groupby(level=0, sort=sort).sum() + result1 = frame.groupby(level=1, sort=sort).sum() - expected0 = frame.groupby(deleveled['first'].values).sum() - expected1 = frame.groupby(deleveled['second'].values).sum() + expected0 = frame.groupby(deleveled['first'].values, sort=sort).sum() + expected1 = frame.groupby(deleveled['second'].values, sort=sort).sum() - expected0 = expected0.reindex(frame.index.levels[0]) - expected1 = expected1.reindex(frame.index.levels[1]) + expected0.index.name = 'first' + expected1.index.name = 'second' assert result0.index.name == 'first' assert result1.index.name == 'second' @@ -1813,15 +1815,15 @@ def test_groupby_level(self): assert result1.index.name == frame.index.names[1] # groupby level name - result0 = frame.groupby(level='first').sum() - result1 = frame.groupby(level='second').sum() + result0 = frame.groupby(level='first', sort=sort).sum() + result1 = frame.groupby(level='second', sort=sort).sum() assert_frame_equal(result0, expected0) assert_frame_equal(result1, expected1) # axis=1 - result0 = frame.T.groupby(level=0, axis=1).sum() - result1 = frame.T.groupby(level=1, axis=1).sum() + result0 = frame.T.groupby(level=0, axis=1, sort=sort).sum() + result1 = frame.T.groupby(level=1, axis=1, sort=sort).sum() assert_frame_equal(result0, expected0.T) assert_frame_equal(result1, expected1.T) @@ -1835,15 +1837,17 @@ def test_groupby_level_index_names(self): df.groupby(level='exp') pytest.raises(ValueError, df.groupby, level='foo') - def test_groupby_level_with_nas(self): + @pytest.mark.parametrize('sort', [True, False]) + def test_groupby_level_with_nas(self, sort): + # GH 17537 index = MultiIndex(levels=[[1, 0], [0, 1, 2, 3]], labels=[[1, 1, 1, 1, 0, 0, 0, 0], [0, 1, 2, 3, 0, 1, 2, 3]]) # factorizing doesn't confuse things s = Series(np.arange(8.), index=index) - result = s.groupby(level=0).sum() - expected = Series([22., 6.], index=[1, 0]) + result = s.groupby(level=0, sort=sort).sum() + expected = Series([6., 22.], index=[0, 1]) assert_series_equal(result, expected) index = MultiIndex(levels=[[1, 0], [0, 1, 2, 3]], @@ -1852,8 +1856,8 @@ def test_groupby_level_with_nas(self): # factorizing doesn't confuse things s = Series(np.arange(8.), index=index) - result = s.groupby(level=0).sum() - expected = Series([18., 6.], index=[1, 0]) + result = s.groupby(level=0, sort=sort).sum() + expected = Series([6., 18.], index=[0.0, 1.0]) assert_series_equal(result, expected) def test_groupby_level_apply(self): @@ -1936,9 +1940,14 @@ def test_groupby_complex(self): result = a.sum(level=0) assert_series_equal(result, expected) - def test_level_preserve_order(self): - grouped = self.mframe.groupby(level=0) - exp_labels = np.array([0, 0, 0, 1, 1, 2, 2, 3, 3, 3], np.intp) + @pytest.mark.parametrize('sort,labels', [ + [True, [2, 2, 2, 0, 0, 1, 1, 3, 3, 3]], + [False, [0, 0, 0, 1, 1, 2, 2, 3, 3, 3]] + ]) + def test_level_preserve_order(self, sort, labels): + # GH 17537 + grouped = self.mframe.groupby(level=0, sort=sort) + exp_labels = np.array(labels, np.intp) assert_almost_equal(grouped.grouper.labels[0], exp_labels) def test_grouping_labels(self): diff --git a/pandas/tests/groupby/test_whitelist.py b/pandas/tests/groupby/test_whitelist.py index 1c5161d2ffb431..259f466316c414 100644 --- a/pandas/tests/groupby/test_whitelist.py +++ b/pandas/tests/groupby/test_whitelist.py @@ -174,12 +174,16 @@ def raw_frame(): @pytest.mark.parametrize( - "op, level, axis, skipna", + "op, level, axis, skipna, sort", product(AGG_FUNCTIONS, lrange(2), lrange(2), + [True, False], [True, False])) -def test_regression_whitelist_methods(raw_frame, op, level, axis, skipna): +def test_regression_whitelist_methods( + raw_frame, op, level, + axis, skipna, sort): # GH6944 + # GH 17537 # explicity test the whitelest methods if axis == 0: @@ -188,15 +192,19 @@ def test_regression_whitelist_methods(raw_frame, op, level, axis, skipna): frame = raw_frame.T if op in AGG_FUNCTIONS_WITH_SKIPNA: - grouped = frame.groupby(level=level, axis=axis) + grouped = frame.groupby(level=level, axis=axis, sort=sort) result = getattr(grouped, op)(skipna=skipna) expected = getattr(frame, op)(level=level, axis=axis, skipna=skipna) + if sort: + expected = expected.sort_index(axis=axis, level=level) tm.assert_frame_equal(result, expected) else: - grouped = frame.groupby(level=level, axis=axis) + grouped = frame.groupby(level=level, axis=axis, sort=sort) result = getattr(grouped, op)() expected = getattr(frame, op)(level=level, axis=axis) + if sort: + expected = expected.sort_index(axis=axis, level=level) tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/test_multilevel.py b/pandas/tests/test_multilevel.py index 6976fe162c5d5c..050335988ca417 100644 --- a/pandas/tests/test_multilevel.py +++ b/pandas/tests/test_multilevel.py @@ -1392,17 +1392,23 @@ def test_count(self): AGG_FUNCTIONS = ['sum', 'prod', 'min', 'max', 'median', 'mean', 'skew', 'mad', 'std', 'var', 'sem'] - def test_series_group_min_max(self): + @pytest.mark.parametrize('sort', [True, False]) + def test_series_group_min_max(self, sort): + # GH 17537 for op, level, skipna in cart_product(self.AGG_FUNCTIONS, lrange(2), [False, True]): - grouped = self.series.groupby(level=level) + grouped = self.series.groupby(level=level, sort=sort) aggf = lambda x: getattr(x, op)(skipna=skipna) # skipna=True leftside = grouped.agg(aggf) rightside = getattr(self.series, op)(level=level, skipna=skipna) + if sort: + rightside = rightside.sort_index(level=level) tm.assert_series_equal(leftside, rightside) - def test_frame_group_ops(self): + @pytest.mark.parametrize('sort', [True, False]) + def test_frame_group_ops(self, sort): + # GH 17537 self.frame.iloc[1, [1, 2]] = np.nan self.frame.iloc[7, [0, 1]] = np.nan @@ -1415,7 +1421,7 @@ def test_frame_group_ops(self): else: frame = self.frame.T - grouped = frame.groupby(level=level, axis=axis) + grouped = frame.groupby(level=level, axis=axis, sort=sort) pieces = [] @@ -1426,6 +1432,9 @@ def aggf(x): leftside = grouped.agg(aggf) rightside = getattr(frame, op)(level=level, axis=axis, skipna=skipna) + if sort: + rightside = rightside.sort_index(level=level, axis=axis) + frame = frame.sort_index(level=level, axis=axis) # for good measure, groupby detail level_index = frame._get_axis(axis).levels[level]