Skip to content

Commit

Permalink
PERF: don't materialize arrays on checking in groupby (#16413)
Browse files Browse the repository at this point in the history
  • Loading branch information
jreback authored May 22, 2017
1 parent 5fe042f commit d5a681b
Show file tree
Hide file tree
Showing 3 changed files with 10 additions and 2 deletions.
9 changes: 9 additions & 0 deletions asv_bench/benchmarks/groupby.py
Original file line number Diff line number Diff line change
Expand Up @@ -368,6 +368,11 @@ def setup(self):
self.dates = (np.datetime64('now') + self.offsets)
self.df = DataFrame({'key1': np.random.randint(0, 500, size=self.n), 'key2': np.random.randint(0, 100, size=self.n), 'value1': np.random.randn(self.n), 'value2': np.random.randn(self.n), 'value3': np.random.randn(self.n), 'dates': self.dates, })

N = 1000000
self.draws = pd.Series(np.random.randn(N))
labels = pd.Series(['foo', 'bar', 'baz', 'qux'] * (N // 4))
self.cats = labels.astype('category')

def time_groupby_multi_size(self):
    """Time ``size()`` on ``self.df`` grouped by 'key1' and 'key2'.

    The result is discarded; the method exists only to be timed.
    """
    key_columns = ['key1', 'key2']
    grouped = self.df.groupby(key_columns)
    grouped.size()

Expand All @@ -377,6 +382,10 @@ def time_groupby_dt_size(self):
def time_groupby_dt_timegrouper_size(self):
    """Time ``size()`` on ``self.df`` grouped by a ``TimeGrouper``.

    The grouper is keyed on the 'dates' column with frequency 'M'.
    The result is discarded; the method exists only to be timed.
    """
    by_dates = TimeGrouper(key='dates', freq='M')
    self.df.groupby(by_dates).size()

def time_groupby_size(self):
    """Time ``size()`` on ``self.draws`` grouped by ``self.cats``.

    The result is discarded; the method exists only to be timed.
    """
    grouped = self.draws.groupby(self.cats)
    grouped.size()



#----------------------------------------------------------------------
# groupby with a variable value for ngroups
Expand Down
2 changes: 1 addition & 1 deletion doc/source/whatsnew/v0.20.2.txt
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@ Performance Improvements
- Performance regression fix when indexing with a list-like (:issue:`16285`)
- Performance regression fix for MultiIndexes (:issue:`16319`, :issue:`16346`)
- Improved performance of ``.clip()`` with scalar arguments (:issue:`15400`)

- Improved performance of groupby with categorical groupers (:issue:`16413`)

.. _whatsnew_0202.bug_fixes:

Expand Down
1 change: 0 additions & 1 deletion pandas/core/indexes/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -2388,7 +2388,6 @@ def get_loc(self, key, method=None, tolerance=None):
if tolerance is not None:
raise ValueError('tolerance argument only valid if using pad, '
'backfill or nearest lookups')
key = _values_from_object(key)
try:
return self._engine.get_loc(key)
except KeyError:
Expand Down

0 comments on commit d5a681b

Please sign in to comment.