ENH/BUG groupby nth now filters, works with DataFrames
hayd committed Mar 7, 2014
1 parent 1bab0a2 commit c444c73
Showing 4 changed files with 141 additions and 18 deletions.
28 changes: 28 additions & 0 deletions doc/source/groupby.rst
@@ -738,6 +738,34 @@ This shows the first or last n rows from each group.
1 0 1 2
5 2 5 6
Taking the nth row of each group
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

To select the nth item from a DataFrame or Series, use the ``nth`` method:

.. ipython:: python

   df = DataFrame([[1, np.nan], [1, 4], [5, 6]], columns=['A', 'B'])
   g = df.groupby('A')
   g.nth(0)
   g.nth(1)
   g.nth(-1)

If you want to select the nth not-null item, use the ``dropna`` kwarg. For a DataFrame this should be either 'any' or 'all', just like you would pass to dropna; for a Series this just needs to be truthy.

.. ipython:: python

   g.nth(0, dropna='any')
   g.nth(1, dropna='any')  # NaNs denote group exhausted when using dropna
   g.B.nth(0, dropna=True)

.. warning::

   Before 0.14.0 this method existed but did not work correctly on DataFrames. The
   API has changed so that it filters by default, but the old behaviour (for Series)
   can be achieved by passing ``dropna``. An alternative is to dropna before doing
   the groupby, as sketched below.
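
A minimal sketch of that alternative, reusing the ``df`` from above. The same rows are
selected, though the result is shaped differently: ``nth`` with ``dropna`` aggregates,
so it is indexed by the group keys rather than by the original index.

.. ipython:: python

   df.dropna().groupby('A').nth(0)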

Enumerate group items
~~~~~~~~~~~~~~~~~~~~~
12 changes: 11 additions & 1 deletion doc/source/v0.14.0.txt
@@ -62,7 +62,7 @@ These are out-of-bounds selections
s.index.year

- More consistent behaviour for some groupby methods:
- groupby head and tail now act more like filter rather than an aggregation:
- groupby ``head`` and ``tail`` now act more like ``filter`` rather than an aggregation:

.. ipython:: python

@@ -78,6 +78,16 @@ These are out-of-bounds selections

g[['B']].head(1)

- groupby ``nth`` now filters by default, with an optional ``dropna`` argument to
  ignore NaN (to replicate the previous behaviour).

.. ipython:: python

   df = DataFrame([[1, np.nan], [1, 4], [5, 6]], columns=['A', 'B'])
   g = df.groupby('A')
   g.nth(0)  # can also use negative ints

   g.nth(0, dropna='any')  # similar to old behaviour
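
For illustration (still using the ``df`` above), negative integers count from the end
of each group, so ``-1`` selects the last row:

.. ipython:: python

   g.nth(-1)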

- Local variable usage has changed in
:func:`pandas.eval`/:meth:`DataFrame.eval`/:meth:`DataFrame.query`
86 changes: 75 additions & 11 deletions pandas/core/groupby.py
@@ -523,15 +523,75 @@ def ohlc(self):
"""
return self._cython_agg_general('ohlc')

    def nth(self, n):
        def picker(arr):
            arr = arr[notnull(arr)]
            if len(arr) >= n + 1:
                return arr.iget(n)
    def nth(self, n, dropna=None):
        """
        Take the nth row from each group.

        If dropna, will take the nth non-null row, dropna is either
        Truthy (if a Series) or 'all', 'any' (if a DataFrame); this is
        equivalent to calling dropna(how=dropna) before the groupby.

        Examples
        --------
        >>> df = DataFrame([[1, np.nan], [1, 4], [5, 6]], columns=['A', 'B'])
        >>> g = df.groupby('A')
        >>> g.nth(0)
           A   B
        0  1 NaN
        2  5   6
        >>> g.nth(1)
           A  B
        1  1  4
        >>> g.nth(-1)
           A  B
        1  1  4
        2  5  6
        >>> g.nth(0, dropna='any')
           B
        A
        1  4
        5  6
        >>> g.nth(1, dropna='any')  # NaNs denote group exhausted when using dropna
            B
        A
        1 NaN
        5 NaN
        """

        if not dropna:
            # fast path: build a boolean mask marking the nth position within
            # each group and filter the selected object with it
            m = self.grouper._max_groupsize
            if n >= m or n < -m:
                return self._selected_obj.loc[[]]
            rng = np.zeros(m, dtype=bool)
            if n >= 0:
                rng[n] = True
                is_nth = self._cumcount_array(rng)
            else:
                rng[- n - 1] = True
                is_nth = self._cumcount_array(rng, ascending=False)
            return self._selected_obj[is_nth]

        if (isinstance(self._selected_obj, DataFrame)
                and dropna not in ['any', 'all']):
            # Note: when aggregating, picker doesn't raise this; it just
            # returns NaN
            raise ValueError("For a DataFrame groupby, dropna must be "
                             "either None, 'any' or 'all', "
                             "(was passed %s)." % (dropna),)

        # old behaviour, but with 'all' and 'any' support for DataFrames
        max_len = n if n >= 0 else -1 - n

        def picker(x):
            x = x.dropna(how=dropna)  # Note: how is ignored if Series
            if len(x) <= max_len:
                return np.nan
            else:
                return x.iloc[n]

        return self.agg(picker)


    def cumcount(self, **kwargs):
        """
        Number each item in each group from 0 to the length of that group - 1.
@@ -579,8 +639,7 @@ def cumcount(self, **kwargs):
        ascending = kwargs.pop('ascending', True)

        index = self.obj.index
        rng = np.arange(self.grouper._max_groupsize, dtype='int64')
        cumcounts = self._cumcount_array(rng, ascending=ascending)
        cumcounts = self._cumcount_array(ascending=ascending)
        return Series(cumcounts, index)

    def head(self, n=5):
@@ -606,8 +665,7 @@ def head(self, n=5):
"""
obj = self._selected_obj
rng = np.arange(self.grouper._max_groupsize, dtype='int64')
in_head = self._cumcount_array(rng) < n
in_head = self._cumcount_array() < n
head = obj[in_head]
return head

@@ -639,11 +697,17 @@ def tail(self, n=5):
        tail = obj[in_tail]
        return tail

    def _cumcount_array(self, arr, **kwargs):
    def _cumcount_array(self, arr=None, **kwargs):
        """
        arr is where cumcount gets its values from
        """
        ascending = kwargs.pop('ascending', True)

        if arr is None:
            arr = np.arange(self.grouper._max_groupsize, dtype='int64')

        len_index = len(self.obj.index)
        cumcounts = np.zeros(len_index, dtype='int64')
        cumcounts = np.empty(len_index, dtype=arr.dtype)
        if ascending:
            for v in self.indices.values():
                cumcounts[v] = arr[:len(v)]
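
For illustration, the ``nth`` fast path above boils down to "keep the rows whose position
within their group equals ``n``". A rough sketch of the same idea using only the public
``cumcount`` API rather than the internal helpers (``cumcount`` and its ``ascending``
argument are assumed available, as in the code above):

    import numpy as np
    import pandas as pd

    df = pd.DataFrame([[1, np.nan], [1, 4], [5, 6]], columns=['A', 'B'])
    g = df.groupby('A')

    pos = g.cumcount()                        # 0-based position within each group
    df[pos == 0]                              # same rows as g.nth(0)
    df[g.cumcount(ascending=False) == 0]      # same rows as g.nth(-1)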
33 changes: 27 additions & 6 deletions pandas/tests/test_groupby.py
@@ -156,19 +156,18 @@ def test_first_last_nth(self):
        assert_frame_equal(last, expected, check_names=False)

        nth = grouped.nth(1)
        expected = self.df.ix[[3, 2], ['B', 'C', 'D']]
        expected.index = ['bar', 'foo']
        expected = self.df.iloc[[2, 3]]
        assert_frame_equal(nth, expected, check_names=False)

        # it works!
        grouped['B'].first()
        grouped['B'].last()
        grouped['B'].nth(0)

        self.df['B'][self.df['A'] == 'foo'] = np.nan
        self.df.loc[self.df['A'] == 'foo', 'B'] = np.nan
        self.assert_(com.isnull(grouped['B'].first()['foo']))
        self.assert_(com.isnull(grouped['B'].last()['foo']))
        self.assert_(com.isnull(grouped['B'].nth(0)['foo']))
        self.assert_(com.isnull(grouped['B'].nth(0)[0]))  # not sure what this is testing

    def test_first_last_nth_dtypes(self):

@@ -189,8 +188,7 @@ def test_first_last_nth_dtypes(self):
        assert_frame_equal(last, expected, check_names=False)

        nth = grouped.nth(1)
        expected = df.ix[[3, 2], ['B', 'C', 'D', 'E', 'F']]
        expected.index = ['bar', 'foo']
        expected = df.iloc[[2, 3]]
        assert_frame_equal(nth, expected, check_names=False)

        # GH 2763, first/last shifting dtypes
@@ -201,6 +199,29 @@ def test_first_last_nth_dtypes(self):
        f = s.groupby(level=0).first()
        self.assertEqual(f.dtype, 'int64')

    def test_nth(self):
        df = DataFrame([[1, np.nan], [1, 4], [5, 6]], columns=['A', 'B'])
        g = df.groupby('A')

        assert_frame_equal(g.nth(0), df.iloc[[0, 2]])
        assert_frame_equal(g.nth(1), df.iloc[[1]])
        assert_frame_equal(g.nth(2), df.loc[[]])
        assert_frame_equal(g.nth(-1), df.iloc[[1, 2]])
        assert_frame_equal(g.nth(-2), df.iloc[[0]])
        assert_frame_equal(g.nth(-3), df.loc[[]])
        assert_series_equal(g.B.nth(0), df.B.iloc[[0, 2]])
        assert_series_equal(g.B.nth(1), df.B.iloc[[1]])
        assert_frame_equal(g[['B']].nth(0), df.ix[[0, 2], ['B']])

        exp = df.set_index('A')
        assert_frame_equal(g.nth(0, dropna='any'), exp.iloc[[1, 2]])
        assert_frame_equal(g.nth(-1, dropna='any'), exp.iloc[[1, 2]])

        exp['B'] = np.nan
        assert_frame_equal(g.nth(7, dropna='any'), exp.iloc[[1, 2]])
        assert_frame_equal(g.nth(2, dropna='any'), exp.iloc[[1, 2]])


    def test_grouper_index_types(self):
        # related GH5375
        # groupby misbehaving when using a Floatlike index
