-
-
Notifications
You must be signed in to change notification settings - Fork 18k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
REF: ignore_failures in BlockManager.reduce #35881
Changes from all commits
4c5eddd
c632c9f
9e64be3
42649fb
47121dd
1decb3e
57c5dd3
a358463
ffa7ad7
5281ce7
cdcc1a0
b04e023
356cdf7
e29283b
4b52eda
a58fdf0
313280f
cdad23d
028a0b7
3467913
2f10b72
8f2a047
699b96b
e128da8
fd964d9
253c0ea
fbe67f4
23e3a6a
37e2f99
146b322
ccea5a5
c6b4e2c
1165129
11126bc
a5df009
e1c1a5b
d7099fe
97376bf
0a7fa6f
c5de076
6a30fcf
f349ef7
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -8595,6 +8595,7 @@ def _reduce( | |
cols = self.columns[~dtype_is_dt] | ||
self = self[cols] | ||
|
||
any_object = self.dtypes.apply(is_object_dtype).any() | ||
# TODO: Make other agg func handle axis=None properly GH#21597 | ||
axis = self._get_axis_number(axis) | ||
labels = self._get_agg_axis(axis) | ||
|
@@ -8621,22 +8622,36 @@ def _get_data() -> DataFrame: | |
data = self._get_bool_data() | ||
return data | ||
|
||
if numeric_only is not None: | ||
if numeric_only is not None or ( | ||
numeric_only is None | ||
and axis == 0 | ||
and not any_object | ||
and not self._mgr.any_extension_types | ||
): | ||
# For numeric_only non-None and axis non-None, we know | ||
# which blocks to use and no try/except is needed. | ||
# For numeric_only=None, only the case with axis==0 and no object | ||
# dtypes is unambiguous and can be handled with BlockManager.reduce | ||
# Case with EAs see GH#35881 | ||
df = self | ||
if numeric_only is True: | ||
df = _get_data() | ||
if axis == 1: | ||
df = df.T | ||
axis = 0 | ||
|
||
ignore_failures = numeric_only is None | ||
|
||
# After possibly _get_data and transposing, we are now in the | ||
# simple case where we can use BlockManager.reduce | ||
res = df._mgr.reduce(blk_func) | ||
out = df._constructor(res).iloc[0].rename(None) | ||
res, indexer = df._mgr.reduce(blk_func, ignore_failures=ignore_failures) | ||
out = df._constructor(res).iloc[0] | ||
jreback marked this conversation as resolved.
Show resolved
Hide resolved
There was a problem hiding this comment. Choose a reason for hiding this comment The reason will be displayed to describe this comment to others. Learn more. Based on my profiling, this There was a problem hiding this comment. Choose a reason for hiding this comment The reason will be displayed to describe this comment to others. Learn more. The getitem being There was a problem hiding this comment. Choose a reason for hiding this comment The reason will be displayed to describe this comment to others. Learn more. Indeed |
||
if out_dtype is not None: | ||
out = out.astype(out_dtype) | ||
if axis == 0 and is_object_dtype(out.dtype): | ||
out[:] = coerce_to_dtypes(out.values, df.dtypes) | ||
# GH#35865 careful to cast explicitly to object | ||
nvs = coerce_to_dtypes(out.values, df.dtypes.iloc[np.sort(indexer)]) | ||
out[:] = np.array(nvs, dtype=object) | ||
return out | ||
|
||
assert numeric_only is None | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I think this is partly the culprit of the slowdown. See also the top post of #33252, which shows that
self.dtypes.apply(..)
is slower than the method that is used a few lines above for dtype_is_dt