Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

BUG: Mixed DataFrame with Extension Array incorrect aggregation #35112

Closed
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 3 additions & 1 deletion pandas/_libs/tslibs/timedeltas.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -547,7 +547,9 @@ def _binary_op_method_timedeltalike(op, name):

try:
other = Timedelta(other)
except ValueError:
except (ValueError, SystemError):
# catch SystemError to workaround NumPy issue
# https://github.com/numpy/numpy/issues/15502
# failed to parse as timedelta
return NotImplemented

Expand Down
4 changes: 2 additions & 2 deletions pandas/_libs/tslibs/timestamps.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -205,10 +205,10 @@ def integer_op_not_supported(obj):

# GH#30886 using an fstring raises SystemError
int_addsub_msg = (
f"Addition/subtraction of integers and integer-arrays with {cls} is "
"Addition/subtraction of integers and integer-arrays with {cls} is "
"no longer supported. Instead of adding/subtracting `n`, "
"use `n * obj.freq`"
)
).format(cls=cls)
return TypeError(int_addsub_msg)


Expand Down
49 changes: 20 additions & 29 deletions pandas/core/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -8545,51 +8545,42 @@ def blk_func(values):
out[:] = coerce_to_dtypes(out.values, df.dtypes)
return out

if not self._is_homogeneous_type:
# try to avoid self.values call

if filter_type is None and axis == 0 and len(self) > 0:
# operate column-wise

# numeric_only must be None here, as other cases caught above
# require len(self) > 0 bc frame_apply messes up empty prod/sum

# this can end up with a non-reduction
# but not always. if the types are mixed
# with datelike then need to make sure a series

# we only end up here if we have not specified
# numeric_only and yet we have tried a
# column-by-column reduction, where we have mixed type.
# So let's just do what we can
from pandas.core.apply import frame_apply

opa = frame_apply(
self, func=f, result_type="expand", ignore_failures=True
)
result = opa.get_result()
if result.ndim == self.ndim:
result = result.iloc[0].rename(None)
return result

if numeric_only is None:
data = self
values = data.values

try:
result = f(values)

except TypeError:
# e.g. in nanops trying to convert strs to float

# try by-column first
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why does this need to be moved? If it is moved, then the "try by-column first" comment is no longer accurate.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

the "try by-column first" comment is no longer accurate

This comment was there originally before the move in https://github.com/pandas-dev/pandas/pull/32950/files.

why does this need to be moved?

This PR reverts a change that caused a regression. The PR that caused the regression is labelled as a clean-up. This PR is in response to #34730 (comment).

if filter_type is None and axis == 0:
# this can end up with a non-reduction
# but not always. if the types are mixed
# with datelike then need to make sure a series

# we only end up here if we have not specified
# numeric_only and yet we have tried a
# column-by-column reduction, where we have mixed type.
# So let's just do what we can
from pandas.core.apply import frame_apply

opa = frame_apply(
self, func=f, result_type="expand", ignore_failures=True
)
result = opa.get_result()
if result.ndim == self.ndim:
result = result.iloc[0]
return result

# TODO: why doesnt axis matter here?
data = _get_data(axis_matters=False)
labels = data._get_agg_axis(axis)

values = data.values
with np.errstate(all="ignore"):
result = f(values)

else:
if numeric_only:
data = _get_data(axis_matters=True)
Expand Down
9 changes: 9 additions & 0 deletions pandas/tests/arrays/integer/test_function.py
Original file line number Diff line number Diff line change
Expand Up @@ -133,6 +133,15 @@ def test_integer_array_numpy_sum(values, expected):
assert result == expected


def test_mixed_frame_with_integer_sum():
    """Check ``DataFrame.sum`` on a frame mixing a string and an ``Int64`` column.

    Regression test for https://github.com/pandas-dev/pandas/issues/34520
    """
    # One row: a string in column "a", an integer in column "b" that is
    # then cast to the nullable "Int64" extension dtype.
    frame = pd.DataFrame([["a", 1]], columns=list("ab")).astype({"b": "Int64"})

    # Each column holds a single value, so the sum is that value itself.
    expected = pd.Series(["a", 1], index=["a", "b"])

    tm.assert_series_equal(frame.sum(), expected)


# TODO(jreback) - these need testing / are broken

# shift
Expand Down
4 changes: 1 addition & 3 deletions pandas/tests/frame/test_analytics.py
Original file line number Diff line number Diff line change
Expand Up @@ -351,9 +351,7 @@ def kurt(x):
"sum", np.sum, float_frame_with_na, skipna_alternative=np.nansum
)
assert_stat_op_calc("mean", np.mean, float_frame_with_na, check_dates=True)
assert_stat_op_calc(
"product", np.prod, float_frame_with_na, skipna_alternative=np.nanprod
)
assert_stat_op_calc("product", np.prod, float_frame_with_na)

assert_stat_op_calc("mad", mad, float_frame_with_na)
assert_stat_op_calc("var", var, float_frame_with_na)
Expand Down