-
-
Notifications
You must be signed in to change notification settings - Fork 17.8k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
ENH/PERF: enable column-wise reductions for EA-backed columns #32867
Changes from 19 commits
a9ca0fa
21aee0d
9f83f6e
a9706e0
07372e3
2d08450
c7f1dae
9e2a780
7f847e8
594d2b0
3d62e68
5b0370e
6e3c287
7088cfc
925d660
852331e
1c9a685
34731f2
a8e61d0
f233109
ec65c57
15ec9b6
2653d02
64e0069
bb0a47b
9323f0e
eb33f86
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -8481,6 +8481,22 @@ def _count_level(self, level, axis=0, numeric_only=False): | |
def _reduce( | ||
self, op, name, axis=0, skipna=True, numeric_only=None, filter_type=None, **kwds | ||
): | ||
""" | ||
Reduce DataFrame over axis with given operation. | ||
|
||
Parameters | ||
---------- | ||
op : func | ||
The reducing function to be called on the values. | ||
name : str | ||
The name of the reduction. | ||
axis : int | ||
numeric_only : bool, optional | ||
filter_type : None or "bool" | ||
Set to "bool" for ops that only work on boolean values. | ||
skipna, **kwds : keywords to pass to the `op` function | ||
|
||
""" | ||
|
||
assert filter_type is None or filter_type == "bool", filter_type | ||
|
||
|
@@ -8531,7 +8547,12 @@ def _get_data(axis_matters): | |
raise NotImplementedError(msg) | ||
return data | ||
|
||
if numeric_only is not None and axis in [0, 1]: | ||
# special case for block-wise | ||
if ( | ||
not self._mgr.any_extension_types | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Why are we excluding any_extension_types here? There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. It indeed handles EAs correctly (that fix was split off from this PR a while ago), but the reason I put this check here is that the motivation for this branch (do block-wise when possible because it is faster) does not apply to EAs, as block-wise is actually slower for them. Secondly, the check also ensures there is only a single code path when EAs are present. Now, we could also check only for the nullable extension types (nullable int, float, bool, etc., so it only checks for the new EAs), and not for all extension types in general (which then also includes things like datetimetz). There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. @jbrockmendel I reverted this to keep the original check (so the same cases will take the block-wise code path as before this PR). There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Thanks, I'll take a fresh look. |
||
and numeric_only is not None | ||
and axis in [0, 1] | ||
): | ||
df = self | ||
if numeric_only is True: | ||
df = _get_data(axis_matters=True) | ||
|
@@ -8559,6 +8580,54 @@ def blk_func(values): | |
out[:] = coerce_to_dtypes(out.values, df.dtypes) | ||
return out | ||
|
||
def array_func(values): | ||
if isinstance(values, ExtensionArray): | ||
return values._reduce(name, skipna=skipna, **kwds) | ||
else: | ||
return op(values, skipna=skipna, **kwds) | ||
jbrockmendel marked this conversation as resolved.
Show resolved
Hide resolved
|
||
|
||
# all other options with axis=0 are done column-array-wise | ||
if axis == 0: | ||
|
||
def _constructor(df, result, index=None): | ||
index = index if index is not None else df.columns | ||
if len(result): | ||
return df._constructor_sliced(result, index=index) | ||
else: | ||
# set correct dtype for empty result | ||
dtype = "bool" if filter_type == "bool" else "float64" | ||
return df._constructor_sliced(result, index=index, dtype=dtype) | ||
|
||
def _reduce_columns(df, op): | ||
result = [op(arr) for arr in df._iter_column_arrays()] | ||
return _constructor(df, result) | ||
|
||
df = self | ||
if numeric_only is True: | ||
df = _get_data(axis_matters=True) | ||
|
||
if numeric_only is not None: | ||
return _reduce_columns(df, array_func) | ||
else: | ||
# need to catch and ignore exceptions when numeric_only=None | ||
try: | ||
return _reduce_columns(df, array_func) | ||
except TypeError: | ||
# if column-wise fails and numeric_only was None, we try | ||
jorisvandenbossche marked this conversation as resolved.
Show resolved
Hide resolved
|
||
# again but removing those columns for which it fails | ||
result = [] | ||
indices = [] | ||
for i, arr in enumerate(df._iter_column_arrays()): | ||
try: | ||
res = array_func(arr) | ||
except Exception: | ||
jbrockmendel marked this conversation as resolved.
Show resolved
Hide resolved
|
||
pass | ||
else: | ||
result.append(res) | ||
indices.append(i) | ||
|
||
return _constructor(df, result, index=df.columns[indices]) | ||
|
||
if not self._is_homogeneous_type: | ||
# try to avoid self.values call | ||
|
||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. This block (using We might want to limit the new code above to |
||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
typing args a +