Skip to content

Commit

Permalink
BUG: Groupby.nth includes group key inconsistently pandas-dev#12839
Browse files Browse the repository at this point in the history
closes pandas-dev#12839

Author: adneu <aneumann31@gmail.com>

Closes pandas-dev#13316 from adneu/12839 and squashes the following commits:

16f5cd3 [adneu] Name change
ac1851a [adneu] Added docstrings/comments, and new tests.
4d73cbf [adneu] Updated tests
9b75df4 [adneu] BUG: Groupby.nth includes group key inconsistently pandas-dev#12839
  • Loading branch information
adneu authored and nateGeorge committed Aug 15, 2016
1 parent 671ad41 commit 3c4a798
Show file tree
Hide file tree
Showing 3 changed files with 57 additions and 11 deletions.
2 changes: 1 addition & 1 deletion doc/source/whatsnew/v0.18.2.txt
Original file line number Diff line number Diff line change
Expand Up @@ -521,7 +521,7 @@ Bug Fixes

- Bug in ``Series`` comparison operators when dealing with zero dim NumPy arrays (:issue:`13006`)
- Bug in ``groupby`` where ``apply`` returns different result depending on whether first result is ``None`` or not (:issue:`12824`)

- Bug in ``groupby(..).nth()`` where the group key was included inconsistently when called after ``.head()`` or ``.tail()`` (:issue:`12839`)

- Bug in ``pd.to_numeric`` when ``errors='coerce'`` and input contains non-hashable objects (:issue:`13324`)

Expand Down
35 changes: 26 additions & 9 deletions pandas/core/groupby.py
Original file line number Diff line number Diff line change
Expand Up @@ -95,7 +95,7 @@ def _groupby_function(name, alias, npfunc, numeric_only=True,
@Appender(_doc_template)
@Appender(_local_template)
def f(self):
self._set_selection_from_grouper()
self._set_group_selection()
try:
return self._cython_agg_general(alias, numeric_only=numeric_only)
except AssertionError as e:
Expand Down Expand Up @@ -457,8 +457,21 @@ def _selected_obj(self):
else:
return self.obj[self._selection]

def _set_selection_from_grouper(self):
""" we may need create a selection if we have non-level groupers """
def _reset_group_selection(self):
"""
Clear group based selection. Used for methods needing to return info on
each group regardless of whether a group selection was previously set.
"""
if self._group_selection is not None:
self._group_selection = None
# GH12839 clear cached selection too when changing group selection
self._reset_cache('_selected_obj')

def _set_group_selection(self):
"""
Create group based selection. Used when selection is not passed
directly but instead via a grouper.
"""
grp = self.grouper
if self.as_index and getattr(grp, 'groupings', None) is not None and \
self.obj.ndim > 1:
Expand All @@ -468,6 +481,8 @@ def _set_selection_from_grouper(self):

if len(groupers):
self._group_selection = ax.difference(Index(groupers)).tolist()
# GH12839 clear selected obj cache when group selection changes
self._reset_cache('_selected_obj')

def _set_result_index_ordered(self, result):
# set the result index on the passed values object and
Expand Down Expand Up @@ -511,7 +526,7 @@ def _make_wrapper(self, name):

# need to set up the selection,
# as it is not passed directly but via the grouper
self._set_selection_from_grouper()
self._set_group_selection()

f = getattr(self._selected_obj, name)
if not isinstance(f, types.MethodType):
Expand Down Expand Up @@ -979,7 +994,7 @@ def mean(self, *args, **kwargs):
except GroupByError:
raise
except Exception: # pragma: no cover
self._set_selection_from_grouper()
self._set_group_selection()
f = lambda x: x.mean(axis=self.axis)
return self._python_agg_general(f)

Expand All @@ -997,7 +1012,7 @@ def median(self):
raise
except Exception: # pragma: no cover

self._set_selection_from_grouper()
self._set_group_selection()

def f(x):
if isinstance(x, np.ndarray):
Expand Down Expand Up @@ -1040,7 +1055,7 @@ def var(self, ddof=1, *args, **kwargs):
if ddof == 1:
return self._cython_agg_general('var')
else:
self._set_selection_from_grouper()
self._set_group_selection()
f = lambda x: x.var(ddof=ddof)
return self._python_agg_general(f)

Expand Down Expand Up @@ -1217,7 +1232,7 @@ def nth(self, n, dropna=None):
raise TypeError("n needs to be an int or a list/set/tuple of ints")

nth_values = np.array(nth_values, dtype=np.intp)
self._set_selection_from_grouper()
self._set_group_selection()

if not dropna:
mask = np.in1d(self._cumcount_array(), nth_values) | \
Expand Down Expand Up @@ -1325,7 +1340,7 @@ def cumcount(self, ascending=True):
dtype: int64
"""

self._set_selection_from_grouper()
self._set_group_selection()

index = self._selected_obj.index
cumcounts = self._cumcount_array(ascending=ascending)
Expand Down Expand Up @@ -1403,6 +1418,7 @@ def head(self, n=5):
0 1 2
2 5 6
"""
self._reset_group_selection()
mask = self._cumcount_array() < n
return self._selected_obj[mask]

Expand All @@ -1429,6 +1445,7 @@ def tail(self, n=5):
0 a 1
2 b 1
"""
self._reset_group_selection()
mask = self._cumcount_array(ascending=False) < n
return self._selected_obj[mask]

Expand Down
31 changes: 30 additions & 1 deletion pandas/tests/test_groupby.py
Original file line number Diff line number Diff line change
Expand Up @@ -354,6 +354,35 @@ def test_nth_multi_index_as_expected(self):
names=['A', 'B']))
assert_frame_equal(result, expected)

def test_group_selection_cache(self):
# GH 12839 nth, head, and tail should return the same results regardless of call order
df = DataFrame([[1, 2], [1, 4], [5, 6]], columns=['A', 'B'])
expected = df.iloc[[0, 2]].set_index('A')

g = df.groupby('A')
result1 = g.head(n=2)
result2 = g.nth(0)
assert_frame_equal(result1, df)
assert_frame_equal(result2, expected)

g = df.groupby('A')
result1 = g.tail(n=2)
result2 = g.nth(0)
assert_frame_equal(result1, df)
assert_frame_equal(result2, expected)

g = df.groupby('A')
result1 = g.nth(0)
result2 = g.head(n=2)
assert_frame_equal(result1, expected)
assert_frame_equal(result2, df)

g = df.groupby('A')
result1 = g.nth(0)
result2 = g.tail(n=2)
assert_frame_equal(result1, expected)
assert_frame_equal(result2, df)

def test_grouper_index_types(self):
# related GH5375
# groupby misbehaving when using a Floatlike index
Expand Down Expand Up @@ -6116,7 +6145,7 @@ def test_cython_transform(self):
# a bit of a hack to make sure the cythonized shift
# is equivalent to pre 0.17.1 behavior
if op == 'shift':
gb._set_selection_from_grouper()
gb._set_group_selection()

for (op, args), targop in ops:
if op != 'shift' and 'int' not in gb_target:
Expand Down

0 comments on commit 3c4a798

Please sign in to comment.