BUG: DataFrame reductions inconsistent with Series counterparts (#37827)
jbrockmendel authored Nov 14, 2020
1 parent 840c142 · commit 50ae0bf
Showing 4 changed files with 195 additions and 36 deletions.
57 changes: 57 additions & 0 deletions doc/source/whatsnew/v1.2.0.rst
@@ -284,6 +284,63 @@ of columns could result in a larger :class:`Series` result. See (:issue:`37799`)
    In [6]: df[["B", "C"]].all(bool_only=True)
Other :class:`DataFrame` reductions with ``numeric_only=None`` will also avoid
this pathological behavior (:issue:`37827`):

.. ipython:: python

    df = pd.DataFrame({"A": [0, 1, 2], "B": ["a", "b", "c"]}, dtype=object)
*Previous behavior*:

.. code-block:: ipython

    In [3]: df.mean()
    Out[3]: Series([], dtype: float64)

    In [4]: df[["A"]].mean()
    Out[4]:
    A    1.0
    dtype: float64
*New behavior*:

.. ipython:: python

    df.mean()
    df[["A"]].mean()
Moreover, :class:`DataFrame` reductions with ``numeric_only=None`` will now be
consistent with their :class:`Series` counterparts. In particular, for
reductions where the :class:`Series` method raises ``TypeError``, the
:class:`DataFrame` reduction will now consider that column non-numeric
instead of casting to a NumPy dtype, which may have different semantics
(:issue:`36076`, :issue:`28949`, :issue:`21020`).

.. ipython:: python

    ser = pd.Series([0, 1], dtype="category", name="A")
    df = ser.to_frame()
*Previous behavior*:

.. code-block:: ipython

    In [5]: df.any()
    Out[5]:
    A    True
    dtype: bool
*New behavior*:

.. ipython:: python

    df.any()
.. _whatsnew_120.api_breaking.python:

Increased minimum version for Python
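The whatsnew entry above only demonstrates any(); the same Series/DataFrame consistency applies to the other reductions it cites, for example median (GH#21020). A minimal sketch, mirroring the test_median_categorical_dtype_nuisance_column test added further below and assuming a pandas build with this change applied (later releases changed the numeric_only defaults again):

    import pandas as pd

    df = pd.DataFrame({"A": pd.Categorical([1, 2, 2, 2, 3]), "B": [1, 2, 2, 2, 3]})

    # Series.median raises for a Categorical column ...
    try:
        df["A"].median()
    except TypeError as err:
        print(err)  # "... does not implement reduction 'median'"

    # ... so with numeric_only=None the DataFrame reduction now drops "A"
    # instead of coercing it, matching the Series behavior.
    print(df.median())  # only "B" contributes: B    2.0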
32 changes: 5 additions & 27 deletions pandas/core/frame.py
@@ -8765,7 +8765,7 @@ def _get_data() -> DataFrame:
                data = self._get_bool_data()
            return data

-        if numeric_only is not None:
+        if numeric_only is not None or axis == 0:
            # For numeric_only non-None and axis non-None, we know
            # which blocks to use and no try/except is needed.
            # For numeric_only=None only the case with axis==0 and no object
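The comment above captures the crux of the change: with numeric_only=None and axis=0, the reduction is dispatched block by block and any column whose reduction raises TypeError is treated as a nuisance column and dropped. A rough sketch of those semantics only, not of the actual BlockManager code; reduce_dropping_nuisance is a made-up helper name:

    import pandas as pd

    def reduce_dropping_nuisance(df: pd.DataFrame, op: str) -> pd.Series:
        # Column by column: keep a result only if the Series reduction
        # succeeds; a TypeError marks the column as a nuisance column.
        results = {}
        for col in df.columns:
            try:
                results[col] = getattr(df[col], op)()
            except TypeError:
                continue
        return pd.Series(results)

    df = pd.DataFrame({"A": [0, 1, 2], "B": ["a", "b", "c"]}, dtype=object)
    print(reduce_dropping_nuisance(df, "mean"))  # A    1.0  ("B" is dropped)
    print(df.mean())  # with this change (pandas 1.2), the real method agrees;
                      # later releases changed the numeric_only defaults again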
@@ -8790,36 +8790,14 @@ def _get_data() -> DataFrame:
                # GH#35865 careful to cast explicitly to object
                nvs = coerce_to_dtypes(out.values, df.dtypes.iloc[np.sort(indexer)])
                out[:] = np.array(nvs, dtype=object)
+            if axis == 0 and len(self) == 0 and name in ["sum", "prod"]:
+                # Even if we are object dtype, follow numpy and return
+                # float64, see test_apply_funcs_over_empty
+                out = out.astype(np.float64)
            return out

        assert numeric_only is None

-        if not self._is_homogeneous_type or self._mgr.any_extension_types:
-            # try to avoid self.values call
-
-            if filter_type is None and axis == 0:
-                # operate column-wise
-
-                # numeric_only must be None here, as other cases caught above
-
-                # this can end up with a non-reduction
-                # but not always. if the types are mixed
-                # with datelike then need to make sure a series
-
-                # we only end up here if we have not specified
-                # numeric_only and yet we have tried a
-                # column-by-column reduction, where we have mixed type.
-                # So let's just do what we can
-                from pandas.core.apply import frame_apply
-
-                opa = frame_apply(
-                    self, func=func, result_type="expand", ignore_failures=True
-                )
-                result = opa.get_result()
-                if result.ndim == self.ndim:
-                    result = result.iloc[0].rename(None)
-                return result
-
        data = self
        values = data.values
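The four lines added above special-case sum/prod on a length-0 frame: even when the columns are object dtype, the result is cast to float64 to follow NumPy. A small illustration of the intent, assuming a pandas build with this change; the dtype, not the (zero) values, is the point:

    import numpy as np
    import pandas as pd

    # A length-0 frame: with no data, the columns are object dtype.
    df = pd.DataFrame(columns=["A", "B"])

    # NumPy's reduction over an empty float array comes back as float64 ...
    print(np.sum(np.empty((0, 2)), axis=0).dtype)  # float64

    # ... and per the branch above, the empty-frame sum is expected to match.
    print(df.sum().dtype)  # expected: float64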

23 changes: 18 additions & 5 deletions pandas/core/internals/blocks.py
@@ -464,7 +464,9 @@ def _split(self) -> List["Block"]:
            new_blocks.append(nb)
        return new_blocks

-    def split_and_operate(self, mask, f, inplace: bool) -> List["Block"]:
+    def split_and_operate(
+        self, mask, f, inplace: bool, ignore_failures: bool = False
+    ) -> List["Block"]:
        """
        split the block per-column, and apply the callable f
        per-column, return a new block for each. Handle
@@ -474,7 +476,8 @@ def split_and_operate(self, mask, f, inplace: bool) -> List["Block"]:
        ----------
        mask : 2-d boolean mask
        f : callable accepting (1d-mask, 1d values, indexer)
-        inplace : boolean
+        inplace : bool
+        ignore_failures : bool, default False

        Returns
        -------
@@ -513,8 +516,16 @@ def make_a_block(nv, ref_loc):
            v = new_values[i]

            # need a new block
-            if m.any():
-                nv = f(m, v, i)
+            if m.any() or m.size == 0:
+                # Apply our function; we may ignore_failures if this is a
+                # reduction that is dropping nuisance columns GH#37827
+                try:
+                    nv = f(m, v, i)
+                except TypeError:
+                    if ignore_failures:
+                        continue
+                    else:
+                        raise
            else:
                nv = v if inplace else v.copy()

@@ -2459,7 +2470,9 @@ def mask_func(mask, values, inplace):
                    values = values.reshape(1, -1)
                return func(values)

-            return self.split_and_operate(None, mask_func, False)
+            return self.split_and_operate(
+                None, mask_func, False, ignore_failures=ignore_failures
+            )

        try:
            res = func(values)
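For context on the split_and_operate change above: an object block stores several columns in a single 2-D array (one column per row), so the reduction is applied per column, and with ignore_failures=True a column whose reducer raises TypeError is skipped instead of failing the whole block. A standalone sketch of that idea; split_and_reduce is a hypothetical helper, not pandas API:

    import numpy as np

    def split_and_reduce(values, func, ignore_failures=False):
        # Reduce each row (i.e. each column of the original frame) separately,
        # optionally skipping the ones whose reducer raises TypeError.
        kept, results = [], []
        for i in range(values.shape[0]):
            try:
                res = func(values[i])
            except TypeError:
                if ignore_failures:
                    continue
                raise
            kept.append(i)
            results.append(res)
        return kept, results

    block = np.array([[0, 1, 2], ["a", "b", "c"]], dtype=object)
    print(split_and_reduce(block, np.mean, ignore_failures=True))
    # ([0], [1.0]) -- the string column raised TypeError and was dropped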
119 changes: 115 additions & 4 deletions pandas/tests/frame/test_reductions.py
@@ -12,6 +12,7 @@
from pandas import (
    Categorical,
    DataFrame,
+    Index,
    MultiIndex,
    Series,
    Timestamp,
@@ -1083,10 +1084,12 @@ def test_any_all_bool_only(self):
            pytest.param(np.any, {"A": Series([0, 1], dtype="m8[ns]")}, True),
            pytest.param(np.all, {"A": Series([1, 2], dtype="m8[ns]")}, True),
            pytest.param(np.any, {"A": Series([1, 2], dtype="m8[ns]")}, True),
-            (np.all, {"A": Series([0, 1], dtype="category")}, False),
-            (np.any, {"A": Series([0, 1], dtype="category")}, True),
+            # np.all on Categorical raises, so the reduction drops the
+            # column, so all is being done on an empty Series, so is True
+            (np.all, {"A": Series([0, 1], dtype="category")}, True),
+            (np.any, {"A": Series([0, 1], dtype="category")}, False),
            (np.all, {"A": Series([1, 2], dtype="category")}, True),
-            (np.any, {"A": Series([1, 2], dtype="category")}, True),
+            (np.any, {"A": Series([1, 2], dtype="category")}, False),
            # Mix GH#21484
            pytest.param(
                np.all,
@@ -1308,6 +1311,114 @@ def test_frame_any_with_timedelta(self):
        tm.assert_series_equal(result, expected)


class TestNuisanceColumns:
    @pytest.mark.parametrize("method", ["any", "all"])
    def test_any_all_categorical_dtype_nuisance_column(self, method):
        # GH#36076 DataFrame should match Series behavior
        ser = Series([0, 1], dtype="category", name="A")
        df = ser.to_frame()

        # Double-check the Series behavior is to raise
        with pytest.raises(TypeError, match="does not implement reduction"):
            getattr(ser, method)()

        with pytest.raises(TypeError, match="does not implement reduction"):
            getattr(np, method)(ser)

        with pytest.raises(TypeError, match="does not implement reduction"):
            getattr(df, method)(bool_only=False)

        # With bool_only=None, operating on this column raises and is ignored,
        # so we expect an empty result.
        result = getattr(df, method)(bool_only=None)
        expected = Series([], index=Index([]), dtype=bool)
        tm.assert_series_equal(result, expected)

        result = getattr(np, method)(df, axis=0)
        tm.assert_series_equal(result, expected)

    def test_median_categorical_dtype_nuisance_column(self):
        # GH#21020 DataFrame.median should match Series.median
        df = DataFrame({"A": Categorical([1, 2, 2, 2, 3])})
        ser = df["A"]

        # Double-check the Series behavior is to raise
        with pytest.raises(TypeError, match="does not implement reduction"):
            ser.median()

        with pytest.raises(TypeError, match="does not implement reduction"):
            df.median(numeric_only=False)

        result = df.median()
        expected = Series([], index=Index([]), dtype=np.float64)
        tm.assert_series_equal(result, expected)

        # same thing, but with an additional non-categorical column
        df["B"] = df["A"].astype(int)

        with pytest.raises(TypeError, match="does not implement reduction"):
            df.median(numeric_only=False)

        result = df.median()
        expected = Series([2.0], index=["B"])
        tm.assert_series_equal(result, expected)

        # TODO: np.median(df, axis=0) gives np.array([2.0, 2.0]) instead
        # of expected.values

    @pytest.mark.parametrize("method", ["min", "max"])
    def test_min_max_categorical_dtype_non_ordered_nuisance_column(self, method):
        # GH#28949 DataFrame.min should behave like Series.min
        cat = Categorical(["a", "b", "c", "b"], ordered=False)
        ser = Series(cat)
        df = ser.to_frame("A")

        # Double-check the Series behavior
        with pytest.raises(TypeError, match="is not ordered for operation"):
            getattr(ser, method)()

        with pytest.raises(TypeError, match="is not ordered for operation"):
            getattr(np, method)(ser)

        with pytest.raises(TypeError, match="is not ordered for operation"):
            getattr(df, method)(numeric_only=False)

        result = getattr(df, method)()
        expected = Series([], index=Index([]), dtype=np.float64)
        tm.assert_series_equal(result, expected)

        result = getattr(np, method)(df)
        tm.assert_series_equal(result, expected)

        # same thing, but with an additional non-categorical column
        df["B"] = df["A"].astype(object)
        result = getattr(df, method)()
        if method == "min":
            expected = Series(["a"], index=["B"])
        else:
            expected = Series(["c"], index=["B"])
        tm.assert_series_equal(result, expected)

        result = getattr(np, method)(df)
        tm.assert_series_equal(result, expected)

    def test_reduction_object_block_splits_nuisance_columns(self):
        # GH#37827
        df = DataFrame({"A": [0, 1, 2], "B": ["a", "b", "c"]}, dtype=object)

        # We should only exclude "B", not "A"
        result = df.mean()
        expected = Series([1.0], index=["A"])
        tm.assert_series_equal(result, expected)

        # Same behavior but heterogeneous dtype
        df["C"] = df["A"].astype(int) + 4

        result = df.mean()
        expected = Series([1.0, 5.0], index=["A", "C"])
        tm.assert_series_equal(result, expected)


def test_sum_timedelta64_skipna_false():
    # GH#17235
    arr = np.arange(8).astype(np.int64).view("m8[s]").reshape(4, 2)
@@ -1352,6 +1463,6 @@ def test_minmax_extensionarray(method, numeric_only):
df = DataFrame({"Int64": ser})
result = getattr(df, method)(numeric_only=numeric_only)
expected = Series(
[getattr(int64_info, method)], index=pd.Index(["Int64"], dtype="object")
[getattr(int64_info, method)], index=Index(["Int64"], dtype="object")
)
tm.assert_series_equal(result, expected)
