Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

ENH/BUG groupby nth now filters, works with DataFrames #6569

Merged
merged 2 commits into from
Mar 7, 2014
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
28 changes: 28 additions & 0 deletions doc/source/groupby.rst
Original file line number Diff line number Diff line change
Expand Up @@ -738,6 +738,34 @@ This shows the first or last n rows from each group.
1 0 1 2
5 2 5 6

Taking the nth row of each group
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

To select from a DataFrame or Series the nth item, use the nth method:

.. ipython:: python

df = DataFrame([[1, np.nan], [1, 4], [5, 6]], columns=['A', 'B'])
g = df.groupby('A')
g.nth(0)

g.nth(1)

g.nth(-1)

If you want to select the nth not-null item, use the ``dropna`` kwarg. For a DataFrame this should be either ``'any'`` or ``'all'``, just like you would pass to ``dropna``; for a Series this just needs to be truthy.

.. ipython:: python

g.nth(0, dropna='any')

g.nth(1, dropna='any') # NaNs denote group exhausted when using dropna

g.B.nth(0, dropna=True)

.. warning::

Before 0.14.0 this method existed but did not work correctly on DataFrames. The API has changed so that it filters by default, but the old behaviour (for Series) can be achieved by passing dropna. An alternative is to dropna before doing the groupby.

Enumerate group items
~~~~~~~~~~~~~~~~~~~~~
Expand Down
12 changes: 11 additions & 1 deletion doc/source/v0.14.0.txt
Original file line number Diff line number Diff line change
Expand Up @@ -62,7 +62,7 @@ These are out-of-bounds selections
s.index.year

- More consistent behaviour for some groupby methods:
- groupby head and tail now act more like filter rather than an aggregation:
- groupby ``head`` and ``tail`` now act more like ``filter`` rather than an aggregation:

.. ipython:: python

Expand All @@ -78,6 +78,16 @@ These are out-of-bounds selections

g[['B']].head(1)

- groupby ``nth`` now filters by default, with an optional ``dropna`` argument to ignore
NaN (to replicate the previous behaviour).

.. ipython:: python

df = DataFrame([[1, np.nan], [1, 4], [5, 6]], columns=['A', 'B'])
g = df.groupby('A')
g.nth(0) # can also use negative ints

g.nth(0, dropna='any') # similar to old behaviour

- Local variable usage has changed in
:func:`pandas.eval`/:meth:`DataFrame.eval`/:meth:`DataFrame.query`
Expand Down
86 changes: 75 additions & 11 deletions pandas/core/groupby.py
Original file line number Diff line number Diff line change
Expand Up @@ -523,15 +523,75 @@ def ohlc(self):
"""
return self._cython_agg_general('ohlc')

def nth(self, n):
def picker(arr):
arr = arr[notnull(arr)]
if len(arr) >= n + 1:
return arr.iget(n)
def nth(self, n, dropna=None):
    """
    Take the nth row from each group.

    If dropna, will take the nth non-null row; dropna is either
    Truthy (if a Series) or 'all', 'any' (if a DataFrame); this is equivalent
    to calling dropna(how=dropna) before the groupby.

    Parameters
    ----------
    n : int
        Position of the row to take within each group. Negative values
        count from the end of the group.
    dropna : None, truthy value, 'any' or 'all', default None
        When falsy, the selected object is filtered with a boolean mask.
        Otherwise null rows are dropped from each group before the nth
        row is picked, and groups with too few rows left yield NaN.

    Raises
    ------
    ValueError
        If the selected object is a DataFrame and dropna is truthy but
        neither 'any' nor 'all'.

    Examples
    --------
    >>> df = DataFrame([[1, np.nan], [1, 4], [5, 6]], columns=['A', 'B'])
    >>> g = df.groupby('A')
    >>> g.nth(0)
       A   B
    0  1 NaN
    2  5   6
    >>> g.nth(1)
       A  B
    1  1  4
    >>> g.nth(-1)
       A  B
    1  1  4
    2  5  6
    >>> g.nth(0, dropna='any')
       B
    A
    1  4
    5  6
    >>> g.nth(1, dropna='any')  # NaNs denote group exhausted when using dropna
        B
    A
    1 NaN
    5 NaN

    """

    if not dropna:  # good choice
        m = self.grouper._max_groupsize
        # No group is long enough to have a row at this position.
        if n >= m or n < -m:
            return self._selected_obj.loc[[]]
        # One-hot boolean template; _cumcount_array lays it out over each
        # group so that exactly the requested position is flagged True.
        rng = np.zeros(m, dtype=bool)
        if n >= 0:
            rng[n] = True
            is_nth = self._cumcount_array(rng)
        else:
            # Negative n: same template, but counted from the group's end.
            rng[- n - 1] = True
            is_nth = self._cumcount_array(rng, ascending=False)
        return self._selected_obj[is_nth]

    if (isinstance(self._selected_obj, DataFrame)
            and dropna not in ['any', 'all']):
        # Note: when agg-ing picker doesn't raise this, just returns NaN
        # Format with a 1-tuple so that a tuple-valued dropna is reported
        # in the message instead of breaking the %-formatting.
        raise ValueError("For a DataFrame groupby, dropna must be "
                         "either None, 'any' or 'all', "
                         "(was passed %s)." % (dropna,))

    # old behaviour, but with all and any support for DataFrames.

    # Largest length a group may have (after dropna) and still be too
    # short to contain position n.
    max_len = n if n >= 0 else - 1 - n

    def picker(x):
        x = x.dropna(how=dropna)  # Note: how is ignored if Series
        if len(x) <= max_len:
            return np.nan
        else:
            return x.iloc[n]

    return self.agg(picker)


def cumcount(self, **kwargs):
"""
Number each item in each group from 0 to the length of that group - 1.
Expand Down Expand Up @@ -579,8 +639,7 @@ def cumcount(self, **kwargs):
ascending = kwargs.pop('ascending', True)

index = self.obj.index
rng = np.arange(self.grouper._max_groupsize, dtype='int64')
cumcounts = self._cumcount_array(rng, ascending=ascending)
cumcounts = self._cumcount_array(ascending=ascending)
return Series(cumcounts, index)

def head(self, n=5):
Expand All @@ -606,8 +665,7 @@ def head(self, n=5):

"""
obj = self._selected_obj
rng = np.arange(self.grouper._max_groupsize, dtype='int64')
in_head = self._cumcount_array(rng) < n
in_head = self._cumcount_array() < n
head = obj[in_head]
return head

Expand Down Expand Up @@ -639,11 +697,17 @@ def tail(self, n=5):
tail = obj[in_tail]
return tail

def _cumcount_array(self, arr, **kwargs):
def _cumcount_array(self, arr=None, **kwargs):
"""
arr is where cumcount gets its values from
"""
ascending = kwargs.pop('ascending', True)

if arr is None:
arr = np.arange(self.grouper._max_groupsize, dtype='int64')

len_index = len(self.obj.index)
cumcounts = np.zeros(len_index, dtype='int64')
cumcounts = np.empty(len_index, dtype=arr.dtype)
if ascending:
for v in self.indices.values():
cumcounts[v] = arr[:len(v)]
Expand Down
33 changes: 27 additions & 6 deletions pandas/tests/test_groupby.py
Original file line number Diff line number Diff line change
Expand Up @@ -156,19 +156,18 @@ def test_first_last_nth(self):
assert_frame_equal(last, expected, check_names=False)

nth = grouped.nth(1)
expected = self.df.ix[[3, 2], ['B', 'C', 'D']]
expected.index = ['bar', 'foo']
expected = self.df.iloc[[2, 3]]
assert_frame_equal(nth, expected, check_names=False)

# it works!
grouped['B'].first()
grouped['B'].last()
grouped['B'].nth(0)

self.df['B'][self.df['A'] == 'foo'] = np.nan
self.df.loc[self.df['A'] == 'foo', 'B'] = np.nan
self.assert_(com.isnull(grouped['B'].first()['foo']))
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

hmm.....this should have actually raised a SettingWithCopy (as the test suite sets it to raise)...wierd

self.assert_(com.isnull(grouped['B'].last()['foo']))
self.assert_(com.isnull(grouped['B'].nth(0)['foo']))
self.assert_(com.isnull(grouped['B'].nth(0)[0])) # not sure what this is testing

def test_first_last_nth_dtypes(self):

Expand All @@ -189,8 +188,7 @@ def test_first_last_nth_dtypes(self):
assert_frame_equal(last, expected, check_names=False)

nth = grouped.nth(1)
expected = df.ix[[3, 2], ['B', 'C', 'D', 'E', 'F']]
expected.index = ['bar', 'foo']
expected = df.iloc[[2, 3]]
assert_frame_equal(nth, expected, check_names=False)

# GH 2763, first/last shifting dtypes
Expand All @@ -201,6 +199,29 @@ def test_first_last_nth_dtypes(self):
f = s.groupby(level=0).first()
self.assertEqual(f.dtype, 'int64')

def test_nth(self):
    """nth filters rows by default and picks per group when dropna is set."""
    frame = DataFrame([[1, np.nan], [1, 4], [5, 6]], columns=['A', 'B'])
    grouped = frame.groupby('A')

    # Default (filtering) mode: selected rows keep their original labels,
    # and positions outside every group give an empty selection.
    filtered_cases = [
        (0, frame.iloc[[0, 2]]),
        (1, frame.iloc[[1]]),
        (2, frame.loc[[]]),
        (-1, frame.iloc[[1, 2]]),
        (-2, frame.iloc[[0]]),
        (-3, frame.loc[[]]),
    ]
    for position, wanted in filtered_cases:
        assert_frame_equal(grouped.nth(position), wanted)

    # Same filtering through a Series selection and a one-column frame.
    column = frame.B
    assert_series_equal(grouped.B.nth(0), column.iloc[[0, 2]])
    assert_series_equal(grouped.B.nth(1), column.iloc[[1]])
    assert_frame_equal(grouped[['B']].nth(0), frame.ix[[0, 2], ['B']])

    # dropna mode: the expected frames are indexed by the group key 'A'.
    by_key = frame.set_index('A')
    for position in (0, -1):
        assert_frame_equal(grouped.nth(position, dropna='any'),
                           by_key.iloc[[1, 2]])

    # Positions past the non-null rows of a group are expected to be NaN.
    by_key['B'] = np.nan
    for position in (7, 2):
        assert_frame_equal(grouped.nth(position, dropna='any'),
                           by_key.iloc[[1, 2]])


def test_grouper_index_types(self):
# related GH5375
# groupby misbehaving when using a Floatlike index
Expand Down
16 changes: 16 additions & 0 deletions vb_suite/groupby.py
Original file line number Diff line number Diff line change
Expand Up @@ -269,6 +269,22 @@ def f(g):
groupby_frame_apply = Benchmark("df.groupby(['key', 'key2']).apply(f)", setup,
start_date=datetime(2011, 10, 1))


#----------------------------------------------------------------------
# DataFrame nth

# Benchmark fixture: a 10000 x 2 frame of random integers drawn from [1, 100).
# Column 0 is used as the grouping key by the benchmarks below.
setup = common_setup + """
df = pd.DataFrame(np.random.randint(1, 100, (10000, 2)))
"""

# Not really a fair test as behaviour has changed!
# Times nth(0) on a DataFrame groupby keyed on column 0.
groupby_frame_nth = Benchmark("df.groupby(0).nth(0)", setup,
start_date=datetime(2014, 3, 1))

# Times nth(0) on a Series groupby: column 1 grouped by the values of column 0.
groupby_series_nth = Benchmark("df[1].groupby(df[0]).nth(0)", setup,
start_date=datetime(2014, 3, 1))


#----------------------------------------------------------------------
# Sum booleans #2692

Expand Down