Skip to content

Commit

Permalink
BUG: Fix unexpected sort in groupby (pandas-dev#17621)
Browse files Browse the repository at this point in the history
  • Loading branch information
Licht-T authored and alanbato committed Nov 10, 2017
1 parent 38bd7ef commit 8f32f68
Show file tree
Hide file tree
Showing 6 changed files with 76 additions and 30 deletions.
1 change: 1 addition & 0 deletions doc/source/whatsnew/v0.21.0.txt
Original file line number Diff line number Diff line change
Expand Up @@ -627,6 +627,7 @@ Groupby/Resample/Rolling
- Bug in ``.rolling(...).apply(...)`` with a ``DataFrame`` with a ``DatetimeIndex``, a ``window`` of a timedelta-convertible and ``min_periods >= 1` (:issue:`15305`)
- Bug in ``DataFrame.groupby`` where index and column keys were not recognized correctly when the number of keys equaled the number of elements on the groupby axis (:issue:`16859`)
- Bug in ``groupby.nunique()`` with ``TimeGrouper`` which cannot handle ``NaT`` correctly (:issue:`17575`)
- Bug in ``DataFrame.groupby`` where a single level selection from a ``MultiIndex`` unexpectedly sorts (:issue:`17537`)

Sparse
^^^^^^
Expand Down
2 changes: 1 addition & 1 deletion pandas/core/generic.py
Original file line number Diff line number Diff line change
Expand Up @@ -6631,7 +6631,7 @@ def pct_change(self, periods=1, fill_method='pad', limit=None, freq=None,
return rs

def _agg_by_level(self, name, axis=0, level=0, skipna=True, **kwargs):
grouped = self.groupby(level=level, axis=axis)
grouped = self.groupby(level=level, axis=axis, sort=False)
if hasattr(grouped, name) and skipna:
return getattr(grouped, name)(**kwargs)
axis = self._get_axis_number(axis)
Expand Down
23 changes: 21 additions & 2 deletions pandas/core/groupby.py
Original file line number Diff line number Diff line change
Expand Up @@ -2586,10 +2586,27 @@ def _get_grouper(obj, key=None, axis=0, level=None, sort=True,
"""
group_axis = obj._get_axis(axis)

# validate that the passed level is compatible with the passed
# validate that the passed single level is compatible with the passed
# axis of the object
if level is not None:
if not isinstance(group_axis, MultiIndex):
# TODO: These if-block and else-block are almost same.
# MultiIndex instance check is removable, but it seems that there are
# some processes only for non-MultiIndex in else-block,
# eg. `obj.index.name != level`. We have to consider carefully whether
# these are applicable for MultiIndex. Even if these are applicable,
# we need to check if it makes no side effect to subsequent processes
# on the outside of this condition.
# (GH 17621)
if isinstance(group_axis, MultiIndex):
if is_list_like(level) and len(level) == 1:
level = level[0]

if key is None and is_scalar(level):
# Get the level values from group_axis
key = group_axis.get_level_values(level)
level = None

else:
# allow level to be a length-one list-like object
# (e.g., level=[0])
# GH 13901
Expand All @@ -2611,6 +2628,8 @@ def _get_grouper(obj, key=None, axis=0, level=None, sort=True,
raise ValueError('level > 0 or level < -1 only valid with '
' MultiIndex')

# NOTE: `group_axis` and `group_axis.get_level_values(level)`
# are same in this section.
level = None
key = group_axis

Expand Down
47 changes: 28 additions & 19 deletions pandas/tests/groupby/test_groupby.py
Original file line number Diff line number Diff line change
Expand Up @@ -1791,18 +1791,20 @@ def aggfun(ser):
agged2 = df.groupby(keys).aggregate(aggfun)
assert len(agged2.columns) + 1 == len(df.columns)

def test_groupby_level(self):
@pytest.mark.parametrize('sort', [True, False])
def test_groupby_level(self, sort):
# GH 17537
frame = self.mframe
deleveled = frame.reset_index()

result0 = frame.groupby(level=0).sum()
result1 = frame.groupby(level=1).sum()
result0 = frame.groupby(level=0, sort=sort).sum()
result1 = frame.groupby(level=1, sort=sort).sum()

expected0 = frame.groupby(deleveled['first'].values).sum()
expected1 = frame.groupby(deleveled['second'].values).sum()
expected0 = frame.groupby(deleveled['first'].values, sort=sort).sum()
expected1 = frame.groupby(deleveled['second'].values, sort=sort).sum()

expected0 = expected0.reindex(frame.index.levels[0])
expected1 = expected1.reindex(frame.index.levels[1])
expected0.index.name = 'first'
expected1.index.name = 'second'

assert result0.index.name == 'first'
assert result1.index.name == 'second'
Expand All @@ -1813,15 +1815,15 @@ def test_groupby_level(self):
assert result1.index.name == frame.index.names[1]

# groupby level name
result0 = frame.groupby(level='first').sum()
result1 = frame.groupby(level='second').sum()
result0 = frame.groupby(level='first', sort=sort).sum()
result1 = frame.groupby(level='second', sort=sort).sum()
assert_frame_equal(result0, expected0)
assert_frame_equal(result1, expected1)

# axis=1

result0 = frame.T.groupby(level=0, axis=1).sum()
result1 = frame.T.groupby(level=1, axis=1).sum()
result0 = frame.T.groupby(level=0, axis=1, sort=sort).sum()
result1 = frame.T.groupby(level=1, axis=1, sort=sort).sum()
assert_frame_equal(result0, expected0.T)
assert_frame_equal(result1, expected1.T)

Expand All @@ -1835,15 +1837,17 @@ def test_groupby_level_index_names(self):
df.groupby(level='exp')
pytest.raises(ValueError, df.groupby, level='foo')

def test_groupby_level_with_nas(self):
@pytest.mark.parametrize('sort', [True, False])
def test_groupby_level_with_nas(self, sort):
# GH 17537
index = MultiIndex(levels=[[1, 0], [0, 1, 2, 3]],
labels=[[1, 1, 1, 1, 0, 0, 0, 0], [0, 1, 2, 3, 0, 1,
2, 3]])

# factorizing doesn't confuse things
s = Series(np.arange(8.), index=index)
result = s.groupby(level=0).sum()
expected = Series([22., 6.], index=[1, 0])
result = s.groupby(level=0, sort=sort).sum()
expected = Series([6., 22.], index=[0, 1])
assert_series_equal(result, expected)

index = MultiIndex(levels=[[1, 0], [0, 1, 2, 3]],
Expand All @@ -1852,8 +1856,8 @@ def test_groupby_level_with_nas(self):

# factorizing doesn't confuse things
s = Series(np.arange(8.), index=index)
result = s.groupby(level=0).sum()
expected = Series([18., 6.], index=[1, 0])
result = s.groupby(level=0, sort=sort).sum()
expected = Series([6., 18.], index=[0.0, 1.0])
assert_series_equal(result, expected)

def test_groupby_level_apply(self):
Expand Down Expand Up @@ -1936,9 +1940,14 @@ def test_groupby_complex(self):
result = a.sum(level=0)
assert_series_equal(result, expected)

def test_level_preserve_order(self):
grouped = self.mframe.groupby(level=0)
exp_labels = np.array([0, 0, 0, 1, 1, 2, 2, 3, 3, 3], np.intp)
@pytest.mark.parametrize('sort,labels', [
[True, [2, 2, 2, 0, 0, 1, 1, 3, 3, 3]],
[False, [0, 0, 0, 1, 1, 2, 2, 3, 3, 3]]
])
def test_level_preserve_order(self, sort, labels):
# GH 17537
grouped = self.mframe.groupby(level=0, sort=sort)
exp_labels = np.array(labels, np.intp)
assert_almost_equal(grouped.grouper.labels[0], exp_labels)

def test_grouping_labels(self):
Expand Down
16 changes: 12 additions & 4 deletions pandas/tests/groupby/test_whitelist.py
Original file line number Diff line number Diff line change
Expand Up @@ -174,12 +174,16 @@ def raw_frame():


@pytest.mark.parametrize(
"op, level, axis, skipna",
"op, level, axis, skipna, sort",
product(AGG_FUNCTIONS,
lrange(2), lrange(2),
[True, False],
[True, False]))
def test_regression_whitelist_methods(raw_frame, op, level, axis, skipna):
def test_regression_whitelist_methods(
raw_frame, op, level,
axis, skipna, sort):
# GH6944
# GH 17537
# explicity test the whitelest methods

if axis == 0:
Expand All @@ -188,15 +192,19 @@ def test_regression_whitelist_methods(raw_frame, op, level, axis, skipna):
frame = raw_frame.T

if op in AGG_FUNCTIONS_WITH_SKIPNA:
grouped = frame.groupby(level=level, axis=axis)
grouped = frame.groupby(level=level, axis=axis, sort=sort)
result = getattr(grouped, op)(skipna=skipna)
expected = getattr(frame, op)(level=level, axis=axis,
skipna=skipna)
if sort:
expected = expected.sort_index(axis=axis, level=level)
tm.assert_frame_equal(result, expected)
else:
grouped = frame.groupby(level=level, axis=axis)
grouped = frame.groupby(level=level, axis=axis, sort=sort)
result = getattr(grouped, op)()
expected = getattr(frame, op)(level=level, axis=axis)
if sort:
expected = expected.sort_index(axis=axis, level=level)
tm.assert_frame_equal(result, expected)


Expand Down
17 changes: 13 additions & 4 deletions pandas/tests/test_multilevel.py
Original file line number Diff line number Diff line change
Expand Up @@ -1392,17 +1392,23 @@ def test_count(self):
AGG_FUNCTIONS = ['sum', 'prod', 'min', 'max', 'median', 'mean', 'skew',
'mad', 'std', 'var', 'sem']

def test_series_group_min_max(self):
@pytest.mark.parametrize('sort', [True, False])
def test_series_group_min_max(self, sort):
# GH 17537
for op, level, skipna in cart_product(self.AGG_FUNCTIONS, lrange(2),
[False, True]):
grouped = self.series.groupby(level=level)
grouped = self.series.groupby(level=level, sort=sort)
aggf = lambda x: getattr(x, op)(skipna=skipna)
# skipna=True
leftside = grouped.agg(aggf)
rightside = getattr(self.series, op)(level=level, skipna=skipna)
if sort:
rightside = rightside.sort_index(level=level)
tm.assert_series_equal(leftside, rightside)

def test_frame_group_ops(self):
@pytest.mark.parametrize('sort', [True, False])
def test_frame_group_ops(self, sort):
# GH 17537
self.frame.iloc[1, [1, 2]] = np.nan
self.frame.iloc[7, [0, 1]] = np.nan

Expand All @@ -1415,7 +1421,7 @@ def test_frame_group_ops(self):
else:
frame = self.frame.T

grouped = frame.groupby(level=level, axis=axis)
grouped = frame.groupby(level=level, axis=axis, sort=sort)

pieces = []

Expand All @@ -1426,6 +1432,9 @@ def aggf(x):
leftside = grouped.agg(aggf)
rightside = getattr(frame, op)(level=level, axis=axis,
skipna=skipna)
if sort:
rightside = rightside.sort_index(level=level, axis=axis)
frame = frame.sort_index(level=level, axis=axis)

# for good measure, groupby detail
level_index = frame._get_axis(axis).levels[level]
Expand Down

0 comments on commit 8f32f68

Please sign in to comment.