PERF: improves performance in GroupBy.cumcount #11039

Closed · wants to merge 1 commit

43 changes: 42 additions & 1 deletion doc/source/whatsnew/v0.18.1.txt
@@ -138,6 +138,47 @@ API changes
- Provide a proper ``__name__`` and ``__qualname__`` attributes for generic functions (:issue:`12021`)
- ``pd.concat(ignore_index=True)`` now uses ``RangeIndex`` as default (:issue:`12695`)

.. _whatsnew_0181.enhancements.groubynth:

The index in ``GroupBy.nth`` output is now more consistent with the
``as_index`` argument passed in (:issue:`11039`):

Previous Behavior:

.. code-block:: ipython

    In [4]: df
    Out[4]:
       A  B
    0  a  1
    1  b  2
    2  a  3

    In [5]: df.groupby('A', as_index=True)['B'].nth(0)
    Out[5]:
    0    1
    1    2
    Name: B, dtype: int64
Contributor: show the same `as_index=False` case here as well, as you are showing below.

Contributor Author: the point here is that `as_index=True` was ignored in the old behaviour; `as_index=False` is not relevant or informative (and has not changed).

This has gone back and forth too many times; if you would like to add or modify anything, please do so on your end.

Contributor: if you are showing it in the new behavior, then please show it in the original as well.

> this has been going back and forth for too many times.

Well, that's just how it is. The docs have to be in the proper format and be consistent.

Contributor Author: @jreback please go ahead and make any changes you find necessary.

Contributor: @behzadnouri of course, but that's not the point, is it. OK, thank you for the PR.

New Behavior:

Contributor: use an ipython block

Contributor: use ipython blocks in the new

Contributor Author: @jreback from what I see in similar cases, this is already in the correct form; v0.18.1.txt#L159 as an example

Contributor: yes for the Previous Behavior, not the New. Please make the change.

.. code-block:: ipython

    In [7]: df.groupby('A', as_index=True)['B'].nth(0)
    Out[7]:
    A
    a    1
    b    2
    Name: B, dtype: int64

    In [8]: df.groupby('A', as_index=False)['B'].nth(0)
    Out[8]:
    0    1
    1    2
    Name: B, dtype: int64


.. _whatsnew_0181.apply_resample:

Using ``.apply`` on groupby resampling
@@ -239,7 +280,7 @@ Deprecations

Performance Improvements
~~~~~~~~~~~~~~~~~~~~~~~~

- Performance improvements in ``GroupBy.cumcount`` (:issue:`11039`)


- Improved performance of ``DataFrame.to_sql`` when checking case sensitivity for tables. Now only checks if table has been created correctly when table name is not lower case. (:issue:`12876`)
105 changes: 36 additions & 69 deletions pandas/core/groupby.py
@@ -653,37 +653,37 @@ def _iterate_slices(self):
def transform(self, func, *args, **kwargs):
raise AbstractMethodError(self)

def _cumcount_array(self, arr=None, ascending=True):
def _cumcount_array(self, ascending=True):
"""
arr is where cumcount gets its values from
Contributor: can you add a Parameters section?

Parameters
----------
ascending : bool, default True
If False, number in reverse, from length of group - 1 to 0.

Note
----
This currently implements sort=False
(though the default is sort=True) for groupby in general.
"""
if arr is None:
arr = np.arange(self.grouper._max_groupsize, dtype='int64')

len_index = len(self._selected_obj.index)
cumcounts = np.zeros(len_index, dtype=arr.dtype)
if not len_index:
return cumcounts
ids, _, ngroups = self.grouper.group_info
sorter = _get_group_index_sorter(ids, ngroups)
ids, count = ids[sorter], len(ids)

indices, values = [], []
for v in self.indices.values():
indices.append(v)
if count == 0:
return np.empty(0, dtype=np.int64)

if ascending:
values.append(arr[:len(v)])
else:
values.append(arr[len(v) - 1::-1])
run = np.r_[True, ids[:-1] != ids[1:]]
rep = np.diff(np.r_[np.nonzero(run)[0], count])
out = (~run).cumsum()

indices = np.concatenate(indices)
values = np.concatenate(values)
cumcounts[indices] = values
if ascending:
out -= np.repeat(out[run], rep)
else:
out = np.repeat(out[np.r_[run[1:], True]], rep) - out

return cumcounts
rev = np.empty(count, dtype=np.intp)
rev[sorter] = np.arange(count, dtype=np.intp)
return out[rev].astype(np.int64, copy=False)
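The run-length trick above replaces the old per-group Python loop with a handful of vectorized operations. A standalone sketch, in plain NumPy, of what the new `_cumcount_array` computes; here `ids` and `sorter` are passed in explicitly rather than taken from the grouper, so the function and its signature are illustrative, not the pandas API:

```python
import numpy as np

def cumcount_from_ids(ids, sorter, ascending=True):
    """Illustrative sketch of the new ``_cumcount_array`` logic.

    ids    : group label per row (what ``grouper.group_info`` provides)
    sorter : stable argsort of ``ids`` (``_get_group_index_sorter`` in pandas)
    """
    count = len(ids)
    if count == 0:
        return np.empty(0, dtype=np.int64)

    ids_sorted = ids[sorter]
    # ``run`` marks the first element of each group run after sorting.
    run = np.r_[True, ids_sorted[:-1] != ids_sorted[1:]]
    # ``rep`` is the length of each run.
    rep = np.diff(np.r_[np.nonzero(run)[0], count])
    out = (~run).cumsum()

    if ascending:
        # Subtract each run's starting offset: 0, 1, 2, ... within a run.
        out -= np.repeat(out[run], rep)
    else:
        # Distance to the end of the run: n-1, ..., 1, 0 within a run.
        out = np.repeat(out[np.r_[run[1:], True]], rep) - out

    # Undo the sort so results line up with the original row order.
    rev = np.empty(count, dtype=np.intp)
    rev[sorter] = np.arange(count, dtype=np.intp)
    return out[rev].astype(np.int64, copy=False)
```

The descending branch is what lets `tail` and negative `nth` positions reuse the same machinery without building a second per-group index array.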

def _index_with_as_index(self, b):
"""
@@ -1170,47 +1170,21 @@ def nth(self, n, dropna=None):
else:
raise TypeError("n needs to be an int or a list/set/tuple of ints")

m = self.grouper._max_groupsize
# filter out values that are outside [-m, m)
pos_nth_values = [i for i in nth_values if i >= 0 and i < m]
neg_nth_values = [i for i in nth_values if i < 0 and i >= -m]

nth_values = np.array(nth_values, dtype=np.intp)
self._set_selection_from_grouper()
if not dropna: # good choice
if not pos_nth_values and not neg_nth_values:
# no valid nth values
return self._selected_obj.loc[[]]

rng = np.zeros(m, dtype=bool)
for i in pos_nth_values:
rng[i] = True
is_nth = self._cumcount_array(rng)

if neg_nth_values:
rng = np.zeros(m, dtype=bool)
for i in neg_nth_values:
rng[- i - 1] = True
is_nth |= self._cumcount_array(rng, ascending=False)
if not dropna:
mask = np.in1d(self._cumcount_array(), nth_values) | \
np.in1d(self._cumcount_array(ascending=False) + 1, -nth_values)

result = self._selected_obj[is_nth]
out = self._selected_obj[mask]
if not self.as_index:
return out

# the result index
if self.as_index:
ax = self.obj._info_axis
names = self.grouper.names
if self.obj.ndim == 1:
# this is a pass-thru
pass
elif all([x in ax for x in names]):
indicies = [self.obj[name][is_nth] for name in names]
result.index = MultiIndex.from_arrays(
indicies).set_names(names)
elif self._group_selection is not None:
result.index = self.obj._get_axis(self.axis)[is_nth]

result = result.sort_index()
ids, _, _ = self.grouper.group_info
out.index = self.grouper.result_index[ids[mask]]

return result
return out.sort_index() if self.sort else out
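The rewritten `nth` builds one boolean mask from the two cumcount arrays: a row is kept when its ascending position matches a non-negative `n`, or when it sits `-n` rows from the end of its group for a negative `n`. A simplified sketch over plain arrays (the function name and arguments are hypothetical; `np.isin` is used here as the modern spelling of the `np.in1d` call in the diff):

```python
import numpy as np

def nth_mask(cum_asc, cum_desc, nth_values):
    """Illustrative mask-building step of the rewritten ``nth``.

    cum_asc    : ascending cumcount per row (0, 1, ... within each group)
    cum_desc   : descending cumcount per row (..., 1, 0 within each group)
    nth_values : requested positions; negatives count from the group's end
    """
    nth_values = np.asarray(nth_values, dtype=np.intp)
    # Non-negative n: the row's ascending position equals n.
    # Negative n: the row is the (-n)-th from the end, i.e. cum_desc + 1 == -n.
    return np.isin(cum_asc, nth_values) | np.isin(cum_desc + 1, -nth_values)
```

Because both cumcount arrays are computed once for the whole frame, this stays O(n) regardless of how many positions are requested, unlike the old code that looped over `pos_nth_values` and `neg_nth_values` separately.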

if isinstance(self._selected_obj, DataFrame) and \
dropna not in ['any', 'all']:
@@ -1241,8 +1215,8 @@ def nth(self, n, dropna=None):
axis=self.axis, level=self.level,
sort=self.sort)

sizes = dropped.groupby(grouper).size()
result = dropped.groupby(grouper).nth(n)
grb = dropped.groupby(grouper, as_index=self.as_index, sort=self.sort)
sizes, result = grb.size(), grb.nth(n)
mask = (sizes < max_len).values

# set the results which don't meet the criteria
@@ -1380,11 +1354,8 @@ def head(self, n=5):
0 1 2
2 5 6
"""

obj = self._selected_obj
in_head = self._cumcount_array() < n
head = obj[in_head]
return head
mask = self._cumcount_array() < n
return self._selected_obj[mask]

@Substitution(name='groupby')
@Appender(_doc_template)
@@ -1409,12 +1380,8 @@ def tail(self, n=5):
0 a 1
2 b 1
"""

obj = self._selected_obj
rng = np.arange(0, -self.grouper._max_groupsize, -1, dtype='int64')
in_tail = self._cumcount_array(rng, ascending=False) > -n
tail = obj[in_tail]
return tail
mask = self._cumcount_array(ascending=False) < n
return self._selected_obj[mask]
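With cumcount available in both directions, `head` and `tail` reduce to simple threshold masks: keep rows whose within-group position (from the front, or from the back) is below `n`. A sketch over plain group labels; the cumcounts are computed naively here for clarity rather than with the sorter trick, and the function name is hypothetical:

```python
import numpy as np

def head_tail_masks(ids, n):
    """Illustrative version of the simplified ``head``/``tail`` selection.

    ids : group label per row
    n   : number of rows to keep per group
    """
    asc = np.zeros(len(ids), dtype=np.int64)
    desc = np.zeros(len(ids), dtype=np.int64)
    for g in np.unique(ids):
        idx = np.flatnonzero(ids == g)
        asc[idx] = np.arange(len(idx))          # 0, 1, ... within the group
        desc[idx] = np.arange(len(idx))[::-1]   # ..., 1, 0 within the group
    # head keeps the first n rows of each group; tail keeps the last n.
    return asc < n, desc < n
```

Note that the new `tail` uses `cumcount(ascending=False) < n` directly, replacing the old approach of feeding a negative `arange` into `_cumcount_array` and comparing against `-n`.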


@Appender(GroupBy.__doc__)
24 changes: 11 additions & 13 deletions pandas/tests/test_groupby.py
@@ -167,8 +167,7 @@ def test_first_last_nth(self):
self.df.loc[self.df['A'] == 'foo', 'B'] = np.nan
self.assertTrue(com.isnull(grouped['B'].first()['foo']))
self.assertTrue(com.isnull(grouped['B'].last()['foo']))
self.assertTrue(com.isnull(grouped['B'].nth(0)[0])
) # not sure what this is testing
self.assertTrue(com.isnull(grouped['B'].nth(0)['foo']))

# v0.14.0 whatsnew
df = DataFrame([[1, np.nan], [1, 4], [5, 6]], columns=['A', 'B'])
@@ -221,12 +220,12 @@ def test_nth(self):

assert_frame_equal(g.nth(0), df.iloc[[0, 2]].set_index('A'))
assert_frame_equal(g.nth(1), df.iloc[[1]].set_index('A'))
assert_frame_equal(g.nth(2), df.loc[[], ['B']])
assert_frame_equal(g.nth(2), df.loc[[]].set_index('A'))
assert_frame_equal(g.nth(-1), df.iloc[[1, 2]].set_index('A'))
assert_frame_equal(g.nth(-2), df.iloc[[0]].set_index('A'))
assert_frame_equal(g.nth(-3), df.loc[[], ['B']])
assert_series_equal(g.B.nth(0), df.B.iloc[[0, 2]])
assert_series_equal(g.B.nth(1), df.B.iloc[[1]])
assert_frame_equal(g.nth(-3), df.loc[[]].set_index('A'))
assert_series_equal(g.B.nth(0), df.set_index('A').B.iloc[[0, 2]])
assert_series_equal(g.B.nth(1), df.set_index('A').B.iloc[[1]])
assert_frame_equal(g[['B']].nth(0),
df.ix[[0, 2], ['A', 'B']].set_index('A'))

@@ -262,11 +261,11 @@ def test_nth(self):
4: 0.70422799999999997}}).set_index(['color',
'food'])

result = df.groupby(level=0).nth(2)
result = df.groupby(level=0, as_index=False).nth(2)
expected = df.iloc[[-1]]
assert_frame_equal(result, expected)

result = df.groupby(level=0).nth(3)
result = df.groupby(level=0, as_index=False).nth(3)
expected = df.loc[[]]
assert_frame_equal(result, expected)

@@ -290,8 +289,7 @@ def test_nth(self):
# as it keeps the order in the series (and not the group order)
# related GH 7287
expected = s.groupby(g, sort=False).first()
expected.index = pd.Index(range(1, 10), name=0)
result = s.groupby(g).nth(0, dropna='all')
result = s.groupby(g, sort=False).nth(0, dropna='all')
assert_series_equal(result, expected)

# doc example
@@ -316,14 +314,14 @@ def test_nth(self):
assert_frame_equal(
g.nth([0, 1, -1]), df.iloc[[0, 1, 2, 3, 4]].set_index('A'))
assert_frame_equal(g.nth([2]), df.iloc[[2]].set_index('A'))
assert_frame_equal(g.nth([3, 4]), df.loc[[], ['B']])
assert_frame_equal(g.nth([3, 4]), df.loc[[]].set_index('A'))

business_dates = pd.date_range(start='4/1/2014', end='6/30/2014',
freq='B')
df = DataFrame(1, index=business_dates, columns=['a', 'b'])
# get the first, fourth and last two business days for each month
result = df.groupby((df.index.year, df.index.month)).nth([0, 3, -2, -1
])
key = (df.index.year, df.index.month)
result = df.groupby(key, as_index=False).nth([0, 3, -2, -1])
expected_dates = pd.to_datetime(
['2014/4/1', '2014/4/4', '2014/4/29', '2014/4/30', '2014/5/1',
'2014/5/6', '2014/5/29', '2014/5/30', '2014/6/2', '2014/6/5',