Commit 1b7f070

Merge pull request #3145 from jreback/transform
PERF: GH2121 groupby transform
jreback committed Mar 25, 2013
2 parents 08672e3 + 2d81b64 commit 1b7f070
Showing 3 changed files with 84 additions and 16 deletions.
8 changes: 5 additions & 3 deletions RELEASE.rst
@@ -100,9 +100,6 @@ pandas 0.11.0
   the collections.Mapping ABC.
 - Allow selection semantics via a string with a datelike index to work in both
   Series and DataFrames (GH3070_)
-- Improved performance across several core functions by taking memory
-  ordering of arrays into account. Courtesy of @stephenwlin (GH3130_)
-
 
 .. ipython:: python
@@ -116,6 +113,10 @@ pandas 0.11.0
   for plots. Based on https://gist.github.com/huyng/816622 (GH3075_).
 
 
+- Improved performance across several core functions by taking memory
+  ordering of arrays into account. Courtesy of @stephenwlin (GH3130_)
+- Improved performance of groupby transform method (GH2121_)
+
 **API Changes**
 
 - Do not automatically upcast numeric specified dtypes to ``int64`` or
@@ -234,6 +235,7 @@ pandas 0.11.0
 .. _GH622: https://github.com/pydata/pandas/issues/622
 .. _GH797: https://github.com/pydata/pandas/issues/797
 .. _GH2758: https://github.com/pydata/pandas/issues/2758
+.. _GH2121: https://github.com/pydata/pandas/issues/2121
 .. _GH2809: https://github.com/pydata/pandas/issues/2809
 .. _GH2810: https://github.com/pydata/pandas/issues/2810
 .. _GH2837: https://github.com/pydata/pandas/issues/2837
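For context, this is the call pattern the GH2121 entry refers to; a minimal illustration on toy data (the frame and column names are made up for illustration, not part of the commit):

```python
import numpy as np
import pandas as pd

# Toy frame: one NaN per group.
df = pd.DataFrame({'key': ['a', 'a', 'b', 'b'],
                   'val': [1.0, np.nan, 3.0, np.nan]})

# transform applies the function group-by-group and returns a result
# aligned to the original index -- the pattern this release speeds up.
out = df.groupby('key')['val'].transform(lambda x: x.fillna(x.mean()))
print(out)
# 0    1.0
# 1    1.0
# 2    3.0
# 3    3.0
```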
54 changes: 41 additions & 13 deletions pandas/core/groupby.py
@@ -13,7 +13,7 @@
 from pandas.util.compat import OrderedDict
 import pandas.core.algorithms as algos
 import pandas.core.common as com
-from pandas.core.common import _possibly_downcast_to_dtype
+from pandas.core.common import _possibly_downcast_to_dtype, notnull
 
 import pandas.lib as lib
 import pandas.algos as _algos
@@ -75,7 +75,7 @@ def f(self):
 def _first_compat(x, axis=0):
     def _first(x):
         x = np.asarray(x)
-        x = x[com.notnull(x)]
+        x = x[notnull(x)]
         if len(x) == 0:
             return np.nan
         return x[0]
@@ -89,7 +89,7 @@ def _first(x):
 def _last_compat(x, axis=0):
     def _last(x):
         x = np.asarray(x)
-        x = x[com.notnull(x)]
+        x = x[notnull(x)]
         if len(x) == 0:
             return np.nan
         return x[-1]
@@ -421,7 +421,7 @@ def ohlc(self):
 
     def nth(self, n):
         def picker(arr):
-            arr = arr[com.notnull(arr)]
+            arr = arr[notnull(arr)]
             if len(arr) >= n + 1:
                 return arr.iget(n)
             else:
@@ -1897,19 +1897,46 @@ def transform(self, func, *args, **kwargs):
         gen = self.grouper.get_iterator(obj, axis=self.axis)
 
         if isinstance(func, basestring):
-            wrapper = lambda x: getattr(x, func)(*args, **kwargs)
+            fast_path = lambda group: getattr(group, func)(*args, **kwargs)
+            slow_path = lambda group: group.apply(lambda x: getattr(x, func)(*args, **kwargs), axis=self.axis)
         else:
-            wrapper = lambda x: func(x, *args, **kwargs)
+            fast_path = lambda group: func(group, *args, **kwargs)
+            slow_path = lambda group: group.apply(lambda x: func(x, *args, **kwargs), axis=self.axis)
 
+        path = None
         for name, group in gen:
             object.__setattr__(group, 'name', name)
 
-            try:
-                res = group.apply(wrapper, axis=self.axis)
-            except TypeError:
-                return self._transform_item_by_item(obj, wrapper)
-            except Exception: # pragma: no cover
-                res = wrapper(group)
+            # decide on a fast path
+            if path is None:
+
+                path = slow_path
+                try:
+                    res = slow_path(group)
+
+                    # if we make it here, test if we can use the fast path
+                    try:
+                        res_fast = fast_path(group)
+
+                        # compare that we get the same results
+                        if res.shape == res_fast.shape:
+                            res_r = res.values.ravel()
+                            res_fast_r = res_fast.values.ravel()
+                            mask = notnull(res_r)
+                            if (res_r[mask] == res_fast_r[mask]).all():
+                                path = fast_path
+
+                    except:
+                        pass
+                except TypeError:
+                    return self._transform_item_by_item(obj, fast_path)
+                except Exception: # pragma: no cover
+                    res = fast_path(group)
+                    path = fast_path
+
+            else:
+
+                res = path(group)
 
             # broadcasting
             if isinstance(res, Series):
@@ -1925,7 +1952,8 @@ def transform(self, func, *args, **kwargs):
         concat_index = obj.columns if self.axis == 0 else obj.index
         concatenated = concat(applied, join_axes=[concat_index],
                               axis=self.axis, verify_integrity=False)
-        return concatenated.reindex_like(obj)
+        concatenated.sort_index(inplace=True)
+        return concatenated
 
     def _transform_item_by_item(self, obj, wrapper):
         # iterate through columns
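The core of the change above: transform now tests, on the first group only, whether handing the whole group to the function (fast path) reproduces the result of the always-correct column-by-column apply (slow path), and keeps the fast path for all remaining groups if it does. A standalone sketch of that selection logic, assuming a DataFrame group (`choose_path` is a hypothetical helper for illustration, not pandas internals):

```python
import numpy as np
import pandas as pd

def choose_path(group, func):
    slow_path = lambda g: g.apply(func)   # column-by-column: always valid
    fast_path = lambda g: func(g)         # whole frame at once: much faster

    res = slow_path(group)
    try:
        res_fast = fast_path(group)
        # adopt the fast path only if shapes match and all
        # non-NaN values agree
        if res.shape == res_fast.shape:
            res_r = res.values.ravel()
            res_fast_r = res_fast.values.ravel()
            mask = pd.notnull(res_r)
            if (res_r[mask] == res_fast_r[mask]).all():
                return fast_path, res
    except Exception:
        pass  # any fast-path failure silently keeps the slow path
    return slow_path, res

# The real loop runs this test once, on the first group, and reuses
# the chosen path for every later group.
group = pd.DataFrame({'a': [1.0, np.nan], 'b': [3.0, 4.0]})
path, res = choose_path(group, lambda x: x.fillna(0))
```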
38 changes: 38 additions & 0 deletions vb_suite/groupby.py
@@ -273,3 +273,41 @@ def f(g):
 """
 
 groupby_sum_booleans = Benchmark("df.groupby('ii').sum()", setup)
+
+#----------------------------------------------------------------------
+# Transform testing
+
+setup = common_setup + """
+n_dates = 1000
+n_securities = 500
+n_columns = 3
+share_na = 0.1
+
+dates = date_range('1997-12-31', periods=n_dates, freq='B')
+dates = Index(map(lambda x: x.year * 10000 + x.month * 100 + x.day, dates))
+
+secid_min = int('10000000', 16)
+secid_max = int('F0000000', 16)
+step = (secid_max - secid_min) // (n_securities - 1)
+security_ids = map(lambda x: hex(x)[2:10].upper(), range(secid_min, secid_max + 1, step))
+
+data_index = MultiIndex(levels=[dates.values, security_ids],
+                        labels=[[i for i in xrange(n_dates) for _ in xrange(n_securities)], range(n_securities) * n_dates],
+                        names=['date', 'security_id'])
+n_data = len(data_index)
+
+columns = Index(['factor{}'.format(i) for i in xrange(1, n_columns + 1)])
+
+data = DataFrame(np.random.randn(n_data, n_columns), index=data_index, columns=columns)
+
+step = int(n_data * share_na)
+for column_index in xrange(n_columns):
+    index = column_index
+    while index < n_data:
+        data.set_value(data_index[index], columns[column_index], np.nan)
+        index += step
+
+f_fillna = lambda x: x.fillna(method='pad')
+"""
+
+groupby_transform = Benchmark("data.groupby(level='security_id').transform(f_fillna)", setup)
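In miniature, the operation the new benchmark measures: a forward fill applied within each security of a (date, security_id) MultiIndex. A toy analogue with made-up values (`.ffill()` is the modern spelling of the benchmark's `fillna(method='pad')`):

```python
import numpy as np
import pandas as pd

# Tiny stand-in for the benchmark frame.
idx = pd.MultiIndex.from_product(
    [[20130101, 20130102, 20130103], ['A1', 'B2']],
    names=['date', 'security_id'])
data = pd.DataFrame({'factor1': [1.0, 2.0, np.nan, np.nan, 5.0, 6.0]},
                    index=idx)

# Pad within each security: the NaNs at date 20130102 are filled from
# each security's own 20130101 value (A1 -> 1.0, B2 -> 2.0), never
# across securities.
out = data.groupby(level='security_id').transform(lambda x: x.ffill())
print(out)
```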
