From e30498ac869d4fd38bc26e10479b230c2e355f0d Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Wed, 7 Feb 2018 19:32:02 -0600 Subject: [PATCH] PERF: Correct signature for group_nth / group_object (#19579) --- asv_bench/benchmarks/groupby.py | 16 ++++++++++++++++ doc/source/whatsnew/v0.23.0.txt | 1 + pandas/_libs/groupby.pyx | 10 ++++++++-- 3 files changed, 25 insertions(+), 2 deletions(-) diff --git a/asv_bench/benchmarks/groupby.py b/asv_bench/benchmarks/groupby.py index 8aa67d8bc6a6a..61db39528a5fb 100644 --- a/asv_bench/benchmarks/groupby.py +++ b/asv_bench/benchmarks/groupby.py @@ -160,6 +160,22 @@ def time_series_nth(self, df): df[1].groupby(df[0]).nth(0) +class NthObject(object): + + goal_time = 0.2 + + def setup_cache(self): + df = DataFrame(np.random.randint(1, 100, (10000,)), columns=['g']) + df['obj'] = ['a'] * 5000 + ['b'] * 5000 + return df + + def time_nth(self, df): + df.groupby('g').nth(5) + + def time_nth_last(self, df): + df.groupby('g').last() + + class DateAttributes(object): goal_time = 0.2 diff --git a/doc/source/whatsnew/v0.23.0.txt b/doc/source/whatsnew/v0.23.0.txt index bed0c077c1348..6c4fce35529ad 100644 --- a/doc/source/whatsnew/v0.23.0.txt +++ b/doc/source/whatsnew/v0.23.0.txt @@ -746,6 +746,7 @@ Groupby/Resample/Rolling - Bug in :func:`DataFrame.transform` where particular aggregation functions were being incorrectly cast to match the dtype(s) of the grouped data (:issue:`19200`) - Bug in :func:`DataFrame.groupby` passing the `on=` kwarg, and subsequently using ``.apply()`` (:issue:`17813`) - Bug in :func:`DataFrame.resample().aggregate` not raising a ``KeyError`` when aggregating a non-existent column (:issue:`16766`, :issue:`19566`) +- Fixed a performance regression for ``GroupBy.nth`` and ``GroupBy.last`` with some object columns (:issue:`19283`) Sparse ^^^^^^ diff --git a/pandas/_libs/groupby.pyx b/pandas/_libs/groupby.pyx index 9cc15fb6692d9..55de700c9af52 100644 --- a/pandas/_libs/groupby.pyx +++ b/pandas/_libs/groupby.pyx @@ 
-36,7 +36,8 @@ def group_nth_object(ndarray[object, ndim=2] out, ndarray[int64_t] counts, ndarray[object, ndim=2] values, ndarray[int64_t] labels, - int64_t rank): + int64_t rank, + Py_ssize_t min_count=-1): """ Only aggregates on axis=0 """ @@ -47,6 +48,8 @@ def group_nth_object(ndarray[object, ndim=2] out, ndarray[int64_t, ndim=2] nobs ndarray[object, ndim=2] resx + assert min_count == -1, "'min_count' only used in add and prod" + nobs = np.zeros((<object> out).shape, dtype=np.int64) resx = np.empty((<object> out).shape, dtype=object) @@ -80,7 +83,8 @@ def group_nth_object(ndarray[object, ndim=2] out, def group_last_object(ndarray[object, ndim=2] out, ndarray[int64_t] counts, ndarray[object, ndim=2] values, - ndarray[int64_t] labels): + ndarray[int64_t] labels, + Py_ssize_t min_count=-1): """ Only aggregates on axis=0 """ @@ -91,6 +95,8 @@ def group_last_object(ndarray[object, ndim=2] out, ndarray[object, ndim=2] resx ndarray[int64_t, ndim=2] nobs + assert min_count == -1, "'min_count' only used in add and prod" + nobs = np.zeros((<object> out).shape, dtype=np.int64) resx = np.empty((<object> out).shape, dtype=object)