PERF: GH2121 groupby transform #3145

Merged (2 commits, Mar 25, 2013)
8 changes: 5 additions & 3 deletions RELEASE.rst
@@ -100,9 +100,6 @@ pandas 0.11.0
   the collections.Mapping ABC.
 - Allow selection semantics via a string with a datelike index to work in both
   Series and DataFrames (GH3070_)
-- Improved performance across several core functions by taking memory
-  ordering of arrays into account. Courtesy of @stephenwlin (GH3130_)
-
 
 .. ipython:: python
 
@@ -116,6 +113,10 @@ pandas 0.11.0
   for plots. Based on https://gist.github.com/huyng/816622 (GH3075_).
 
 
+- Improved performance across several core functions by taking memory
+  ordering of arrays into account. Courtesy of @stephenwlin (GH3130_)
+- Improved performance of groupby transform method (GH2121_)
+
 **API Changes**
 
 - Do not automatically upcast numeric specified dtypes to ``int64`` or
@@ -234,6 +235,7 @@ pandas 0.11.0
 .. _GH622: https://github.com/pydata/pandas/issues/622
 .. _GH797: https://github.com/pydata/pandas/issues/797
 .. _GH2758: https://github.com/pydata/pandas/issues/2758
+.. _GH2121: https://github.com/pydata/pandas/issues/2121
 .. _GH2809: https://github.com/pydata/pandas/issues/2809
 .. _GH2810: https://github.com/pydata/pandas/issues/2810
 .. _GH2837: https://github.com/pydata/pandas/issues/2837
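For context on the release note above: `transform` applies a function to each group and returns a result aligned row-for-row with the original object, which is what makes the per-group `fillna` pattern from GH2121 expressible in one call. A minimal sketch with hypothetical data (the frame and column names are illustrative only):

```python
import numpy as np
import pandas as pd

# Hypothetical frame: one column of values with gaps, grouped by security.
df = pd.DataFrame({'security_id': ['A', 'A', 'B', 'B'],
                   'price': [1.0, np.nan, 2.0, np.nan]})

# transform returns a result indexed like the input, so each group's
# forward-fill lines up row-for-row with the original frame.
padded = df.groupby('security_id')['price'].transform(
    lambda x: x.fillna(method='pad'))
# padded: [1.0, 1.0, 2.0, 2.0] -- NaNs filled within each group only
```

The change below makes exactly this kind of call cheaper by avoiding a per-column apply whenever a whole-group call provably gives identical results.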
54 changes: 41 additions & 13 deletions pandas/core/groupby.py
@@ -13,7 +13,7 @@
 from pandas.util.compat import OrderedDict
 import pandas.core.algorithms as algos
 import pandas.core.common as com
-from pandas.core.common import _possibly_downcast_to_dtype
+from pandas.core.common import _possibly_downcast_to_dtype, notnull
 
 import pandas.lib as lib
 import pandas.algos as _algos
@@ -75,7 +75,7 @@ def f(self):
 def _first_compat(x, axis=0):
     def _first(x):
         x = np.asarray(x)
-        x = x[com.notnull(x)]
+        x = x[notnull(x)]
         if len(x) == 0:
             return np.nan
         return x[0]
@@ -89,7 +89,7 @@ def _first(x):
 def _last_compat(x, axis=0):
     def _last(x):
         x = np.asarray(x)
-        x = x[com.notnull(x)]
+        x = x[notnull(x)]
         if len(x) == 0:
             return np.nan
         return x[-1]
@@ -421,7 +421,7 @@ def ohlc(self):
 
     def nth(self, n):
         def picker(arr):
-            arr = arr[com.notnull(arr)]
+            arr = arr[notnull(arr)]
             if len(arr) >= n + 1:
                 return arr.iget(n)
             else:
@@ -1897,19 +1897,46 @@ def transform(self, func, *args, **kwargs):
         gen = self.grouper.get_iterator(obj, axis=self.axis)
 
         if isinstance(func, basestring):
-            wrapper = lambda x: getattr(x, func)(*args, **kwargs)
+            fast_path = lambda group: getattr(group, func)(*args, **kwargs)
+            slow_path = lambda group: group.apply(lambda x: getattr(x, func)(*args, **kwargs), axis=self.axis)
         else:
-            wrapper = lambda x: func(x, *args, **kwargs)
+            fast_path = lambda group: func(group, *args, **kwargs)
+            slow_path = lambda group: group.apply(lambda x: func(x, *args, **kwargs), axis=self.axis)
 
+        path = None
         for name, group in gen:
             object.__setattr__(group, 'name', name)
 
-            try:
-                res = group.apply(wrapper, axis=self.axis)
-            except TypeError:
-                return self._transform_item_by_item(obj, wrapper)
-            except Exception:  # pragma: no cover
-                res = wrapper(group)
+            # decide on a fast path
+            if path is None:
+
+                path = slow_path
+                try:
+                    res = slow_path(group)
+
+                    # if we make it here, test if we can use the fast path
+                    try:
+                        res_fast = fast_path(group)
+
+                        # compare that we get the same results
+                        if res.shape == res_fast.shape:
+                            res_r = res.values.ravel()
+                            res_fast_r = res_fast.values.ravel()
+                            mask = notnull(res_r)
+                            if (res_r[mask] == res_fast_r[mask]).all():
+                                path = fast_path
+
+                    except:
+                        pass
+                except TypeError:
+                    return self._transform_item_by_item(obj, fast_path)
+                except Exception:  # pragma: no cover
+                    res = fast_path(group)
+                    path = fast_path
+
+            else:
+
+                res = path(group)
 
             # broadcasting
             if isinstance(res, Series):
@@ -1925,7 +1952,8 @@ def transform(self, func, *args, **kwargs):
             concat_index = obj.columns if self.axis == 0 else obj.index
             concatenated = concat(applied, join_axes=[concat_index],
                                   axis=self.axis, verify_integrity=False)
-            return concatenated.reindex_like(obj)
+            concatenated.sort_index(inplace=True)
+            return concatenated
 
     def _transform_item_by_item(self, obj, wrapper):
         # iterate through columns
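The core of the change above is how `transform` now picks an execution path: on the first group it runs the always-correct slow path (a column-by-column `apply`), then tries the fast path (calling the function on the whole group at once) and compares the two results, skipping NaN slots since NaN never compares equal to itself. If they agree, every later group takes the fast path. A simplified standalone sketch of that equivalence test (the helper name and call shape are illustrative, not pandas API):

```python
import numpy as np
from pandas import DataFrame, notnull

def choose_path(group, fast_path, slow_path):
    # Run the slow path first; it is the correctness baseline.
    res = slow_path(group)
    try:
        res_fast = fast_path(group)
    except Exception:
        return res, slow_path      # fast path not applicable to this func
    if res.shape == res_fast.shape:
        res_r = np.asarray(res).ravel()
        res_fast_r = np.asarray(res_fast).ravel()
        # NaN != NaN, so only compare positions the slow path filled.
        mask = notnull(res_r)
        if (res_r[mask] == res_fast_r[mask]).all():
            return res, fast_path  # proven equal on this group: promote
    return res, slow_path

group = DataFrame({'x': [1.0, np.nan, 3.0]})
fast = lambda g: g.fillna(method='pad')
slow = lambda g: g.apply(lambda col: col.fillna(method='pad'), axis=0)
res, path = choose_path(group, fast, slow)  # path is fast here
```

As in the diff, the comparison is only trusted when the shapes match, and any mismatch or exception silently keeps the slow path, so the optimization can never change results.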
38 changes: 38 additions & 0 deletions vb_suite/groupby.py
@@ -273,3 +273,41 @@ def f(g):
 """
 
 groupby_sum_booleans = Benchmark("df.groupby('ii').sum()", setup)
+
+#----------------------------------------------------------------------
+# Transform testing
+
+setup = common_setup + """
+n_dates = 1000
+n_securities = 500
+n_columns = 3
+share_na = 0.1
+
+dates = date_range('1997-12-31', periods=n_dates, freq='B')
+dates = Index(map(lambda x: x.year * 10000 + x.month * 100 + x.day, dates))
+
+secid_min = int('10000000', 16)
+secid_max = int('F0000000', 16)
+step = (secid_max - secid_min) // (n_securities - 1)
+security_ids = map(lambda x: hex(x)[2:10].upper(), range(secid_min, secid_max + 1, step))
+
+data_index = MultiIndex(levels=[dates.values, security_ids],
+                        labels=[[i for i in xrange(n_dates) for _ in xrange(n_securities)], range(n_securities) * n_dates],
+                        names=['date', 'security_id'])
+n_data = len(data_index)
+
+columns = Index(['factor{}'.format(i) for i in xrange(1, n_columns + 1)])
+
+data = DataFrame(np.random.randn(n_data, n_columns), index=data_index, columns=columns)
+
+step = int(n_data * share_na)
+for column_index in xrange(n_columns):
+    index = column_index
+    while index < n_data:
+        data.set_value(data_index[index], columns[column_index], np.nan)
+        index += step
+
+f_fillna = lambda x: x.fillna(method='pad')
+"""
+
+groupby_transform = Benchmark("data.groupby(level='security_id').transform(f_fillna)", setup)
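The seeding loop in the setup above plants a NaN every `int(n_data * share_na)` rows of each column, with the starting row staggered by the column index so the gaps do not coincide; that is what gives the per-group `fillna` transform real work to do. The same pattern at toy scale, written with a stride slice instead of the `while` loop (sizes here are arbitrary):

```python
import numpy as np

n_data, n_columns, share_na = 12, 3, 0.25
values = np.random.randn(n_data, n_columns)

# Every step-th row of column j becomes NaN, starting at row j,
# so roughly share_na of each column is missing and the gaps are staggered.
step = int(n_data * share_na)  # 3 here
for j in range(n_columns):
    values[j::step, j] = np.nan
```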