diff --git a/doc/source/whatsnew/v0.18.1.txt b/doc/source/whatsnew/v0.18.1.txt index c6642c5216262..5193231407f12 100644 --- a/doc/source/whatsnew/v0.18.1.txt +++ b/doc/source/whatsnew/v0.18.1.txt @@ -138,6 +138,47 @@ API changes - Provide a proper ``__name__`` and ``__qualname__`` attributes for generic functions (:issue:`12021`) - ``pd.concat(ignore_index=True)`` now uses ``RangeIndex`` as default (:issue:`12695`) +.. _whatsnew_0181.enhancements.groubynth: + +The index of ``GroupBy.nth`` output is now more consistent with the +``as_index`` argument passed in (:issue:`11039`): + +Previous Behavior: + +.. code-block:: ipython + + In [4]: df + Out[4]: + A B + 0 a 1 + 1 b 2 + 2 a 3 + + In [5]: df.groupby('A', as_index=True)['B'].nth(0) + Out[5]: + 0 1 + 1 2 + Name: B, dtype: int64 + + +New Behavior: + +.. code-block:: ipython + + In [7]: df.groupby('A', as_index=True)['B'].nth(0) + Out[7]: + A + a 1 + b 2 + Name: B, dtype: int64 + + In [8]: df.groupby('A', as_index=False)['B'].nth(0) + Out[8]: + 0 1 + 1 2 + Name: B, dtype: int64 + + .. _whatsnew_0181.apply_resample: Using ``.apply`` on groupby resampling @@ -239,7 +280,7 @@ Deprecations Performance Improvements ~~~~~~~~~~~~~~~~~~~~~~~~ - +- Performance improvements in ``GroupBy.cumcount`` (:issue:`11039`) - Improved performance of ``DataFrame.to_sql`` when checking case sensitivity for tables. Now only checks if table has been created correctly when table name is not lower case. 
(:issue:`12876`) diff --git a/pandas/core/groupby.py b/pandas/core/groupby.py index 6996254f58f00..d7dbfbf01e9e7 100644 --- a/pandas/core/groupby.py +++ b/pandas/core/groupby.py @@ -653,37 +653,37 @@ def _iterate_slices(self): def transform(self, func, *args, **kwargs): raise AbstractMethodError(self) - def _cumcount_array(self, arr=None, ascending=True): + def _cumcount_array(self, ascending=True): """ - arr is where cumcount gets its values from + Parameters + ---------- + ascending : bool, default True + If False, number in reverse, from length of group - 1 to 0. Note ---- this is currently implementing sort=False (though the default is sort=True) for groupby in general """ - if arr is None: - arr = np.arange(self.grouper._max_groupsize, dtype='int64') - - len_index = len(self._selected_obj.index) - cumcounts = np.zeros(len_index, dtype=arr.dtype) - if not len_index: - return cumcounts + ids, _, ngroups = self.grouper.group_info + sorter = _get_group_index_sorter(ids, ngroups) + ids, count = ids[sorter], len(ids) - indices, values = [], [] - for v in self.indices.values(): - indices.append(v) + if count == 0: + return np.empty(0, dtype=np.int64) - if ascending: - values.append(arr[:len(v)]) - else: - values.append(arr[len(v) - 1::-1]) + run = np.r_[True, ids[:-1] != ids[1:]] + rep = np.diff(np.r_[np.nonzero(run)[0], count]) + out = (~run).cumsum() - indices = np.concatenate(indices) - values = np.concatenate(values) - cumcounts[indices] = values + if ascending: + out -= np.repeat(out[run], rep) + else: + out = np.repeat(out[np.r_[run[1:], True]], rep) - out - return cumcounts + rev = np.empty(count, dtype=np.intp) + rev[sorter] = np.arange(count, dtype=np.intp) + return out[rev].astype(np.int64, copy=False) def _index_with_as_index(self, b): """ @@ -1170,47 +1170,21 @@ def nth(self, n, dropna=None): else: raise TypeError("n needs to be an int or a list/set/tuple of ints") - m = self.grouper._max_groupsize - # filter out values that are outside [-m, m) - 
pos_nth_values = [i for i in nth_values if i >= 0 and i < m] - neg_nth_values = [i for i in nth_values if i < 0 and i >= -m] - + nth_values = np.array(nth_values, dtype=np.intp) self._set_selection_from_grouper() - if not dropna: # good choice - if not pos_nth_values and not neg_nth_values: - # no valid nth values - return self._selected_obj.loc[[]] - - rng = np.zeros(m, dtype=bool) - for i in pos_nth_values: - rng[i] = True - is_nth = self._cumcount_array(rng) - if neg_nth_values: - rng = np.zeros(m, dtype=bool) - for i in neg_nth_values: - rng[- i - 1] = True - is_nth |= self._cumcount_array(rng, ascending=False) + if not dropna: + mask = np.in1d(self._cumcount_array(), nth_values) | \ + np.in1d(self._cumcount_array(ascending=False) + 1, -nth_values) - result = self._selected_obj[is_nth] + out = self._selected_obj[mask] + if not self.as_index: + return out - # the result index - if self.as_index: - ax = self.obj._info_axis - names = self.grouper.names - if self.obj.ndim == 1: - # this is a pass-thru - pass - elif all([x in ax for x in names]): - indicies = [self.obj[name][is_nth] for name in names] - result.index = MultiIndex.from_arrays( - indicies).set_names(names) - elif self._group_selection is not None: - result.index = self.obj._get_axis(self.axis)[is_nth] - - result = result.sort_index() + ids, _, _ = self.grouper.group_info + out.index = self.grouper.result_index[ids[mask]] - return result + return out.sort_index() if self.sort else out if isinstance(self._selected_obj, DataFrame) and \ dropna not in ['any', 'all']: @@ -1241,8 +1215,8 @@ def nth(self, n, dropna=None): axis=self.axis, level=self.level, sort=self.sort) - sizes = dropped.groupby(grouper).size() - result = dropped.groupby(grouper).nth(n) + grb = dropped.groupby(grouper, as_index=self.as_index, sort=self.sort) + sizes, result = grb.size(), grb.nth(n) mask = (sizes < max_len).values # set the results which don't meet the criteria @@ -1380,11 +1354,8 @@ def head(self, n=5): 0 1 2 2 5 6 """ - - 
obj = self._selected_obj - in_head = self._cumcount_array() < n - head = obj[in_head] - return head + mask = self._cumcount_array() < n + return self._selected_obj[mask] @Substitution(name='groupby') @Appender(_doc_template) @@ -1409,12 +1380,8 @@ def tail(self, n=5): 0 a 1 2 b 1 """ - - obj = self._selected_obj - rng = np.arange(0, -self.grouper._max_groupsize, -1, dtype='int64') - in_tail = self._cumcount_array(rng, ascending=False) > -n - tail = obj[in_tail] - return tail + mask = self._cumcount_array(ascending=False) < n + return self._selected_obj[mask] @Appender(GroupBy.__doc__) diff --git a/pandas/tests/test_groupby.py b/pandas/tests/test_groupby.py index 6cf779bad1a41..24e5169f99942 100644 --- a/pandas/tests/test_groupby.py +++ b/pandas/tests/test_groupby.py @@ -167,8 +167,7 @@ def test_first_last_nth(self): self.df.loc[self.df['A'] == 'foo', 'B'] = np.nan self.assertTrue(com.isnull(grouped['B'].first()['foo'])) self.assertTrue(com.isnull(grouped['B'].last()['foo'])) - self.assertTrue(com.isnull(grouped['B'].nth(0)[0]) - ) # not sure what this is testing + self.assertTrue(com.isnull(grouped['B'].nth(0)['foo'])) # v0.14.0 whatsnew df = DataFrame([[1, np.nan], [1, 4], [5, 6]], columns=['A', 'B']) @@ -221,12 +220,12 @@ def test_nth(self): assert_frame_equal(g.nth(0), df.iloc[[0, 2]].set_index('A')) assert_frame_equal(g.nth(1), df.iloc[[1]].set_index('A')) - assert_frame_equal(g.nth(2), df.loc[[], ['B']]) + assert_frame_equal(g.nth(2), df.loc[[]].set_index('A')) assert_frame_equal(g.nth(-1), df.iloc[[1, 2]].set_index('A')) assert_frame_equal(g.nth(-2), df.iloc[[0]].set_index('A')) - assert_frame_equal(g.nth(-3), df.loc[[], ['B']]) - assert_series_equal(g.B.nth(0), df.B.iloc[[0, 2]]) - assert_series_equal(g.B.nth(1), df.B.iloc[[1]]) + assert_frame_equal(g.nth(-3), df.loc[[]].set_index('A')) + assert_series_equal(g.B.nth(0), df.set_index('A').B.iloc[[0, 2]]) + assert_series_equal(g.B.nth(1), df.set_index('A').B.iloc[[1]]) assert_frame_equal(g[['B']].nth(0), 
df.ix[[0, 2], ['A', 'B']].set_index('A')) @@ -262,11 +261,11 @@ def test_nth(self): 4: 0.70422799999999997}}).set_index(['color', 'food']) - result = df.groupby(level=0).nth(2) + result = df.groupby(level=0, as_index=False).nth(2) expected = df.iloc[[-1]] assert_frame_equal(result, expected) - result = df.groupby(level=0).nth(3) + result = df.groupby(level=0, as_index=False).nth(3) expected = df.loc[[]] assert_frame_equal(result, expected) @@ -290,8 +289,7 @@ def test_nth(self): # as it keeps the order in the series (and not the group order) # related GH 7287 expected = s.groupby(g, sort=False).first() - expected.index = pd.Index(range(1, 10), name=0) - result = s.groupby(g).nth(0, dropna='all') + result = s.groupby(g, sort=False).nth(0, dropna='all') assert_series_equal(result, expected) # doc example @@ -316,14 +314,14 @@ def test_nth(self): assert_frame_equal( g.nth([0, 1, -1]), df.iloc[[0, 1, 2, 3, 4]].set_index('A')) assert_frame_equal(g.nth([2]), df.iloc[[2]].set_index('A')) - assert_frame_equal(g.nth([3, 4]), df.loc[[], ['B']]) + assert_frame_equal(g.nth([3, 4]), df.loc[[]].set_index('A')) business_dates = pd.date_range(start='4/1/2014', end='6/30/2014', freq='B') df = DataFrame(1, index=business_dates, columns=['a', 'b']) # get the first, fourth and last two business days for each month - result = df.groupby((df.index.year, df.index.month)).nth([0, 3, -2, -1 - ]) + key = (df.index.year, df.index.month) + result = df.groupby(key, as_index=False).nth([0, 3, -2, -1]) expected_dates = pd.to_datetime( ['2014/4/1', '2014/4/4', '2014/4/29', '2014/4/30', '2014/5/1', '2014/5/6', '2014/5/29', '2014/5/30', '2014/6/2', '2014/6/5',