From c444c73c23691da76bd09365db78b637a216c6ea Mon Sep 17 00:00:00 2001 From: Andy Hayden Date: Fri, 7 Mar 2014 09:44:33 -0800 Subject: [PATCH 1/2] ENH/BUG groupby nth now filters, works with DataFrames --- doc/source/groupby.rst | 28 ++++++++++++ doc/source/v0.14.0.txt | 12 ++++- pandas/core/groupby.py | 86 +++++++++++++++++++++++++++++++----- pandas/tests/test_groupby.py | 33 +++++++++++--- 4 files changed, 141 insertions(+), 18 deletions(-) diff --git a/doc/source/groupby.rst b/doc/source/groupby.rst index 4fb8061939fbc..b5c15f83bb9d3 100644 --- a/doc/source/groupby.rst +++ b/doc/source/groupby.rst @@ -738,6 +738,34 @@ This shows the first or last n rows from each group. 1 0 1 2 5 2 5 6 +Taking the nth row of each group +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +To select from a DataFrame or Series the nth item, use the nth method: + +.. ipython:: python + + df = DataFrame([[1, np.nan], [1, 4], [5, 6]], columns=['A', 'B']) + g = df.groupby('A') + g.nth(0) + + g.nth(1) + + g.nth(-1) + +If you want to select the nth not-null item, use the dropna kwarg. For a DataFrame this should be either 'any' or 'all' just like you would pass to dropna; for a Series this just needs to be truthy. + +.. ipython:: python + + g.nth(0, dropna='any') + + g.nth(1, dropna='any') # NaNs denote group exhausted when using dropna + + g.B.nth(0, dropna=True) + +.. warning:: + + Before 0.14.0 this method existed but did not work correctly on DataFrames. The API has changed so that it filters by default, but the old behaviour (for Series) can be achieved by passing dropna. An alternative is to dropna before doing the groupby.
Enumerate group items ~~~~~~~~~~~~~~~~~~~~~ diff --git a/doc/source/v0.14.0.txt b/doc/source/v0.14.0.txt index 7c6e6a01cd041..345b11e9171e7 100644 --- a/doc/source/v0.14.0.txt +++ b/doc/source/v0.14.0.txt @@ -62,7 +62,7 @@ These are out-of-bounds selections s.index.year - More consistent behaviour for some groupby methods: - - groupby head and tail now act more like filter rather than an aggregation: + - groupby ``head`` and ``tail`` now act more like ``filter`` rather than an aggregation: .. ipython:: python @@ -78,6 +78,16 @@ These are out-of-bounds selections g[['B']].head(1) + - groupby ``nth`` now filters by default, with optional dropna argument to ignore + NaN (to replicate the previous behaviour). + + .. ipython:: python + + df = DataFrame([[1, np.nan], [1, 4], [5, 6]], columns=['A', 'B']) + g = df.groupby('A') + g.nth(0) # can also use negative ints + + g.nth(0, dropna='any') # similar to old behaviour - Local variable usage has changed in :func:`pandas.eval`/:meth:`DataFrame.eval`/:meth:`DataFrame.query` diff --git a/pandas/core/groupby.py b/pandas/core/groupby.py index 2116beefb633b..031088c4e5672 100644 --- a/pandas/core/groupby.py +++ b/pandas/core/groupby.py @@ -523,15 +523,75 @@ def ohlc(self): """ return self._cython_agg_general('ohlc') - def nth(self, n): - def picker(arr): - arr = arr[notnull(arr)] - if len(arr) >= n + 1: - return arr.iget(n) + def nth(self, n, dropna=None): + """ + Take the nth row from each group. + + If dropna, will take the nth non-null row; dropna is either + Truthy (if a Series) or 'all', 'any' (if a DataFrame); this is equivalent + to calling dropna(how=dropna) before the groupby.
+ + Examples + -------- + >>> df = DataFrame([[1, np.nan], [1, 4], [5, 6]], columns=['A', 'B']) + >>> g = df.groupby('A') + >>> g.nth(0) + A B + 0 1 NaN + 2 5 6 + >>> g.nth(1) + A B + 1 1 4 + >>> g.nth(-1) + A B + 1 1 4 + 2 5 6 + >>> g.nth(0, dropna='any') + B + A + 1 4 + 5 6 + >>> g.nth(1, dropna='any') # NaNs denote group exhausted when using dropna + B + A + 1 NaN + 5 NaN + + """ + + if not dropna: # good choice + m = self.grouper._max_groupsize + if n >= m or n < -m: + return self._selected_obj.loc[[]] + rng = np.zeros(m, dtype=bool) + if n >= 0: + rng[n] = True + is_nth = self._cumcount_array(rng) else: + rng[- n - 1] = True + is_nth = self._cumcount_array(rng, ascending=False) + return self._selected_obj[is_nth] + + if (isinstance(self._selected_obj, DataFrame) + and dropna not in ['any', 'all']): + # Note: when agg-ing picker doesn't raise this, just returns NaN + raise ValueError("For a DataFrame groupby, dropna must be " + "either None, 'any' or 'all', " + "(was passed %s)." % (dropna),) + + # old behaviour, but with all and any support for DataFrames. + + max_len = n if n >= 0 else - 1 - n + def picker(x): + x = x.dropna(how=dropna) # Note: how is ignored if Series + if len(x) <= max_len: return np.nan + else: + return x.iloc[n] + return self.agg(picker) + def cumcount(self, **kwargs): """ Number each item in each group from 0 to the length of that group - 1.
@@ -579,8 +639,7 @@ def cumcount(self, **kwargs): ascending = kwargs.pop('ascending', True) index = self.obj.index - rng = np.arange(self.grouper._max_groupsize, dtype='int64') - cumcounts = self._cumcount_array(rng, ascending=ascending) + cumcounts = self._cumcount_array(ascending=ascending) return Series(cumcounts, index) def head(self, n=5): @@ -606,8 +665,7 @@ def head(self, n=5): """ obj = self._selected_obj - rng = np.arange(self.grouper._max_groupsize, dtype='int64') - in_head = self._cumcount_array(rng) < n + in_head = self._cumcount_array() < n head = obj[in_head] return head @@ -639,11 +697,17 @@ def tail(self, n=5): tail = obj[in_tail] return tail - def _cumcount_array(self, arr, **kwargs): + def _cumcount_array(self, arr=None, **kwargs): + """ + arr is where cumcount gets its values from + """ ascending = kwargs.pop('ascending', True) + if arr is None: + arr = np.arange(self.grouper._max_groupsize, dtype='int64') + len_index = len(self.obj.index) - cumcounts = np.zeros(len_index, dtype='int64') + cumcounts = np.empty(len_index, dtype=arr.dtype) if ascending: for v in self.indices.values(): cumcounts[v] = arr[:len(v)] diff --git a/pandas/tests/test_groupby.py b/pandas/tests/test_groupby.py index c67a4d65c4c73..8bbc8e6326639 100644 --- a/pandas/tests/test_groupby.py +++ b/pandas/tests/test_groupby.py @@ -156,8 +156,7 @@ def test_first_last_nth(self): assert_frame_equal(last, expected, check_names=False) nth = grouped.nth(1) - expected = self.df.ix[[3, 2], ['B', 'C', 'D']] - expected.index = ['bar', 'foo'] + expected = self.df.iloc[[2, 3]] assert_frame_equal(nth, expected, check_names=False) # it works!
@@ -165,10 +164,10 @@ def test_first_last_nth(self): grouped['B'].last() grouped['B'].nth(0) - self.df['B'][self.df['A'] == 'foo'] = np.nan + self.df.loc[self.df['A'] == 'foo', 'B'] = np.nan self.assert_(com.isnull(grouped['B'].first()['foo'])) self.assert_(com.isnull(grouped['B'].last()['foo'])) - self.assert_(com.isnull(grouped['B'].nth(0)['foo'])) + self.assert_(com.isnull(grouped['B'].nth(0)[0])) # not sure what this is testing def test_first_last_nth_dtypes(self): @@ -189,8 +188,7 @@ def test_first_last_nth_dtypes(self): assert_frame_equal(last, expected, check_names=False) nth = grouped.nth(1) - expected = df.ix[[3, 2], ['B', 'C', 'D', 'E', 'F']] - expected.index = ['bar', 'foo'] + expected = df.iloc[[2, 3]] assert_frame_equal(nth, expected, check_names=False) # GH 2763, first/last shifting dtypes @@ -201,6 +199,29 @@ def test_first_last_nth_dtypes(self): f = s.groupby(level=0).first() self.assertEqual(f.dtype, 'int64') + def test_nth(self): + df = DataFrame([[1, np.nan], [1, 4], [5, 6]], columns=['A', 'B']) + g = df.groupby('A') + + assert_frame_equal(g.nth(0), df.iloc[[0, 2]]) + assert_frame_equal(g.nth(1), df.iloc[[1]]) + assert_frame_equal(g.nth(2), df.loc[[]]) + assert_frame_equal(g.nth(-1), df.iloc[[1, 2]]) + assert_frame_equal(g.nth(-2), df.iloc[[0]]) + assert_frame_equal(g.nth(-3), df.loc[[]]) + assert_series_equal(g.B.nth(0), df.B.iloc[[0, 2]]) + assert_series_equal(g.B.nth(1), df.B.iloc[[1]]) + assert_frame_equal(g[['B']].nth(0), df.ix[[0, 2], ['B']]) + + exp = df.set_index('A') + assert_frame_equal(g.nth(0, dropna='any'), exp.iloc[[1, 2]]) + assert_frame_equal(g.nth(-1, dropna='any'), exp.iloc[[1, 2]]) + + exp['B'] = np.nan + assert_frame_equal(g.nth(7, dropna='any'), exp.iloc[[1, 2]]) + assert_frame_equal(g.nth(2, dropna='any'), exp.iloc[[1, 2]]) + + def test_grouper_index_types(self): # related GH5375 # groupby misbehaving when using a Floatlike index From feaca40242e1ddf17ee8463fd972b3c5b3c127b7 Mon Sep 17 00:00:00 2001 From: Andy Hayden Date: 
Fri, 7 Mar 2014 11:47:29 -0800 Subject: [PATCH 2/2] TST add vbench for groupby nth --- vb_suite/groupby.py | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/vb_suite/groupby.py b/vb_suite/groupby.py index 01b44cbd5351c..dc8103b0ceea2 100644 --- a/vb_suite/groupby.py +++ b/vb_suite/groupby.py @@ -269,6 +269,22 @@ def f(g): groupby_frame_apply = Benchmark("df.groupby(['key', 'key2']).apply(f)", setup, start_date=datetime(2011, 10, 1)) + +#---------------------------------------------------------------------- +# DataFrame nth + +setup = common_setup + """ +df = pd.DataFrame(np.random.randint(1, 100, (10000, 2))) +""" + +# Not really a fair test as behaviour has changed! +groupby_frame_nth = Benchmark("df.groupby(0).nth(0)", setup, + start_date=datetime(2014, 3, 1)) + +groupby_series_nth = Benchmark("df[1].groupby(df[0]).nth(0)", setup, + start_date=datetime(2014, 3, 1)) + + #---------------------------------------------------------------------- # Sum booleans #2692