Skip to content

Commit

Permalink
BUG: DataFrameGroupBy with numeric_only and empty non-numeric data (#…
Browse files Browse the repository at this point in the history
  • Loading branch information
jbrockmendel committed Jun 2, 2021
1 parent 8caf370 commit 6b94e24
Show file tree
Hide file tree
Showing 9 changed files with 95 additions and 44 deletions.
1 change: 1 addition & 0 deletions doc/source/whatsnew/v1.3.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -1061,6 +1061,7 @@ Groupby/resample/rolling
- Bug in :meth:`DataFrameGroupBy.transform` with non-unique columns incorrectly raising ``AttributeError`` (:issue:`41427`)
- Bug in :meth:`Resampler.apply` with non-unique columns incorrectly dropping duplicated columns (:issue:`41445`)
- Bug in :meth:`SeriesGroupBy` aggregations incorrectly returning empty :class:`Series` instead of raising ``TypeError`` on aggregations that are invalid for its dtype, e.g. ``.prod`` with ``datetime64[ns]`` dtype (:issue:`41342`)
- Bug in :class:`DataFrameGroupBy` aggregations incorrectly failing to drop columns with invalid dtypes for that aggregation when there are no valid columns (:issue:`41291`)
- Bug in :meth:`DataFrame.rolling.__iter__` where ``on`` was not assigned to the index of the resulting objects (:issue:`40373`)
- Bug in :meth:`DataFrameGroupBy.transform` and :meth:`DataFrameGroupBy.agg` with ``engine="numba"`` where ``*args`` were being cached with the user passed function (:issue:`41647`)

Expand Down
2 changes: 1 addition & 1 deletion pandas/_libs/groupby.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -516,7 +516,7 @@ def group_add(add_t[:, ::1] out,
val = values[i, j]

# not nan
if val == val:
if not checknull(val):
nobs[lab, j] += 1

if nobs[lab, j] == 1:
Expand Down
17 changes: 3 additions & 14 deletions pandas/core/groupby/generic.py
Original file line number Diff line number Diff line change
Expand Up @@ -67,10 +67,7 @@
validate_func_kwargs,
)
from pandas.core.apply import GroupByApply
from pandas.core.base import (
DataError,
SpecificationError,
)
from pandas.core.base import SpecificationError
import pandas.core.common as com
from pandas.core.construction import create_series_with_explicit_dtype
from pandas.core.frame import DataFrame
Expand Down Expand Up @@ -516,16 +513,12 @@ def _cython_transform(

obj = self._selected_obj

is_numeric = is_numeric_dtype(obj.dtype)
if numeric_only and not is_numeric:
raise DataError("No numeric types to aggregate")

try:
result = self.grouper._cython_operation(
"transform", obj._values, how, axis, **kwargs
)
except (NotImplementedError, TypeError):
raise DataError("No numeric types to aggregate")
except NotImplementedError as err:
raise TypeError(f"{how} is not supported for {obj.dtype} dtype") from err

return obj._constructor(result, index=self.obj.index, name=obj.name)

Expand Down Expand Up @@ -1064,7 +1057,6 @@ def _cython_agg_general(
# Note: we never get here with how="ohlc"; that goes through SeriesGroupBy

data: Manager2D = self._get_data_to_aggregate()
orig = data

if numeric_only:
data = data.get_numeric_data(copy=False)
Expand All @@ -1087,9 +1079,6 @@ def array_func(values: ArrayLike) -> ArrayLike:
# continue and exclude the block
new_mgr = data.grouped_reduce(array_func, ignore_failures=True)

if not len(new_mgr) and len(orig):
# If the original Manager was already empty, no need to raise
raise DataError("No numeric types to aggregate")
if len(new_mgr) < len(data):
warnings.warn(
f"Dropping invalid columns in {type(self).__name__}.{how} "
Expand Down
20 changes: 6 additions & 14 deletions pandas/core/groupby/groupby.py
Original file line number Diff line number Diff line change
Expand Up @@ -1339,20 +1339,12 @@ def _agg_general(

with group_selection_context(self):
# try a cython aggregation if we can
result = None
try:
result = self._cython_agg_general(
how=alias,
alt=npfunc,
numeric_only=numeric_only,
min_count=min_count,
)
except DataError:
pass

# apply a non-cython aggregation
if result is None:
result = self.aggregate(lambda x: npfunc(x, axis=self.axis))
result = self._cython_agg_general(
how=alias,
alt=npfunc,
numeric_only=numeric_only,
min_count=min_count,
)
return result.__finalize__(self.obj, method="groupby")

def _agg_py_fallback(
Expand Down
5 changes: 3 additions & 2 deletions pandas/tests/groupby/aggregate/test_aggregate.py
Original file line number Diff line number Diff line change
Expand Up @@ -128,8 +128,9 @@ def test_groupby_aggregation_multi_level_column():
columns=MultiIndex.from_tuples([("A", 0), ("A", 1), ("B", 0), ("B", 1)]),
)

result = df.groupby(level=1, axis=1).sum()
expected = DataFrame({0: [2.0, 1, 1, 1], 1: [1, 0, 1, 1]})
gb = df.groupby(level=1, axis=1)
result = gb.sum(numeric_only=False)
expected = DataFrame({0: [2.0, True, True, True], 1: [1, 0, 1, 1]})

tm.assert_frame_equal(result, expected)

Expand Down
7 changes: 3 additions & 4 deletions pandas/tests/groupby/aggregate/test_cython.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,6 @@
bdate_range,
)
import pandas._testing as tm
from pandas.core.groupby.groupby import DataError


@pytest.mark.parametrize(
Expand Down Expand Up @@ -98,9 +97,9 @@ def test_cython_agg_nothing_to_agg():

frame = DataFrame({"a": np.random.randint(0, 5, 50), "b": ["foo", "bar"] * 25})

msg = "No numeric types to aggregate"
with pytest.raises(DataError, match=msg):
frame[["b"]].groupby(frame["a"]).mean()
result = frame[["b"]].groupby(frame["a"]).mean()
expected = DataFrame([], index=frame["a"].sort_values().drop_duplicates())
tm.assert_frame_equal(result, expected)


def test_cython_agg_nothing_to_agg_with_dates():
Expand Down
16 changes: 14 additions & 2 deletions pandas/tests/groupby/aggregate/test_other.py
Original file line number Diff line number Diff line change
Expand Up @@ -433,15 +433,22 @@ def test_agg_over_numpy_arrays():
],
columns=["category", "arraydata"],
)
result = df.groupby("category").agg(sum)
gb = df.groupby("category")

expected_data = [[np.array([50, 70, 90])], [np.array([20, 30, 40])]]
expected_index = Index([1, 2], name="category")
expected_column = ["arraydata"]
expected = DataFrame(expected_data, index=expected_index, columns=expected_column)

alt = gb.sum(numeric_only=False)
tm.assert_frame_equal(alt, expected)

result = gb.agg("sum", numeric_only=False)
tm.assert_frame_equal(result, expected)

# FIXME: the original version of this test called `gb.agg(sum)`

This comment has been minimized.

Copy link
@jbrockmendel

jbrockmendel Dec 7, 2021

Author Member

@rhshadrach shot in the dark: any idea how/if to address this?

This comment has been minimized.

Copy link
@rhshadrach

rhshadrach Dec 7, 2021

Member

When there are no args, we intercept sum and replace it with "sum". When passing in a callable, my general expectation is that under the hood, pandas is passing a group to that callable and then combining the results for me. By intercepting some callables and behaving differently, this makes the op more difficult for a user to understand (magical), more difficult to maintain, and I'd hazard a guess can result in some odd behavior in edge cases.

I'd be in favor of deprecating this behavior for sum and np.sum et al.

This comment has been minimized.

Copy link
@rhshadrach

rhshadrach Dec 7, 2021

Member

But if we still want to support this, then we can intercept in groupby.generic._aggregate_frame like we do in core.apply.

This comment has been minimized.

Copy link
@jbrockmendel

jbrockmendel Dec 8, 2021

Author Member

OK. so should we keep this comment as-is, remove it, make an xfail from this case, ...?

This comment has been minimized.

Copy link
@rhshadrach

rhshadrach Dec 9, 2021

Member

If we decide to deprecate the behavior, then I think remove it. If we keep behavior, then we should xfail until the _aggregate_frame is fixed. I'll open an issue for discussing this, and I would keep it as-is until the way forward is agreed on.

# and that raises TypeError if `numeric_only=False` is passed


@pytest.mark.parametrize("as_period", [True, False])
def test_agg_tzaware_non_datetime_result(as_period):
Expand Down Expand Up @@ -524,9 +531,14 @@ def test_sum_uint64_overflow():
)

expected.index.name = 0
result = df.groupby(0).sum()
result = df.groupby(0).sum(numeric_only=False)
tm.assert_frame_equal(result, expected)

# out column is non-numeric, so with numeric_only=True it is dropped
result2 = df.groupby(0).sum(numeric_only=True)
expected2 = expected[[]]
tm.assert_frame_equal(result2, expected2)


@pytest.mark.parametrize(
"structure, expected",
Expand Down
52 changes: 50 additions & 2 deletions pandas/tests/groupby/test_groupby.py
Original file line number Diff line number Diff line change
Expand Up @@ -638,7 +638,7 @@ def test_as_index_select_column():
def test_groupby_as_index_select_column_sum_empty_df():
# GH 35246
df = DataFrame(columns=["A", "B", "C"])
left = df.groupby(by="A", as_index=False)["B"].sum()
left = df.groupby(by="A", as_index=False)["B"].sum(numeric_only=False)
assert type(left) is DataFrame
assert left.to_dict() == {"A": {}, "B": {}}

Expand Down Expand Up @@ -1861,6 +1861,49 @@ def get_result():
get_result()

return
else:
# ie. DataFrameGroupBy
if op in ["prod", "sum"]:
# ops that require more than just ordered-ness
if method != "apply":
# FIXME: apply goes through different code path
if df.dtypes[0].kind == "M":
# GH#41291
# datetime64 -> prod and sum are invalid
result = get_result()

# with numeric_only=True, these are dropped, and we get
# an empty DataFrame back
expected = df.set_index(keys)[[]]
tm.assert_equal(result, expected)
return

elif isinstance(values, Categorical):
# GH#41291
# Categorical doesn't implement sum or prod
result = get_result()

# with numeric_only=True, these are dropped, and we get
# an empty DataFrame back
expected = df.set_index(keys)[[]]
if len(keys) != 1 and op == "prod":
# TODO: why just prod and not sum?
# Categorical is special without 'observed=True'
lev = Categorical([0], dtype=values.dtype)
mi = MultiIndex.from_product([lev, lev], names=["A", "B"])
expected = DataFrame([], columns=[], index=mi)

tm.assert_equal(result, expected)
return

elif df.dtypes[0] == object:
# FIXME: the test is actually wrong here, xref #41341
result = get_result()
# In this case we have list-of-list, will raise TypeError,
# and subsequently be dropped as nuisance columns
expected = df.set_index(keys)[[]]
tm.assert_equal(result, expected)
return

result = get_result()
expected = df.set_index(keys)[columns]
Expand Down Expand Up @@ -2313,12 +2356,17 @@ def test_groupby_all_nan_groups_drop():

def test_groupby_empty_multi_column():
# GH 15106
result = DataFrame(data=[], columns=["A", "B", "C"]).groupby(["A", "B"]).sum()
df = DataFrame(data=[], columns=["A", "B", "C"])
gb = df.groupby(["A", "B"])
result = gb.sum(numeric_only=False)
expected = DataFrame(
[], columns=["C"], index=MultiIndex([[], []], [[], []], names=["A", "B"])
)
tm.assert_frame_equal(result, expected)

result = gb.sum(numeric_only=True)
tm.assert_frame_equal(result, expected[[]])


def test_groupby_filtered_df_std():
# GH 16174
Expand Down
19 changes: 14 additions & 5 deletions pandas/tests/groupby/transform/test_transform.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,6 @@
DataFrameGroupBy,
SeriesGroupBy,
)
from pandas.core.groupby.groupby import DataError


def assert_fp_equal(a, b):
Expand Down Expand Up @@ -741,11 +740,21 @@ def test_cython_transform_frame(op, args, targop):
tm.assert_frame_equal(expected, getattr(gb, op)(*args).sort_index(axis=1))
# individual columns
for c in df:
if c not in ["float", "int", "float_missing"] and op != "shift":
msg = "No numeric types to aggregate"
with pytest.raises(DataError, match=msg):
if (
c not in ["float", "int", "float_missing"]
and op != "shift"
and not (c == "timedelta" and op == "cumsum")
):
msg = "|".join(
[
"does not support .* operations",
".* is not supported for object dtype",
"is not implemented for this dtype",
]
)
with pytest.raises(TypeError, match=msg):
gb[c].transform(op)
with pytest.raises(DataError, match=msg):
with pytest.raises(TypeError, match=msg):
getattr(gb[c], op)()
else:
expected = gb[c].apply(targop)
Expand Down

0 comments on commit 6b94e24

Please sign in to comment.