BUG: DataFrame reductions inconsistent with Series counterparts (#37827)
jbrockmendel authored Nov 14, 2020
1 parent 840c142 · commit 50ae0bf
Showing 4 changed files with 195 additions and 36 deletions.
57 changes: 57 additions & 0 deletions doc/source/whatsnew/v1.2.0.rst
@@ -284,6 +284,63 @@ of columns could result in a larger :class:`Series` result. See (:issue:`37799`)
    In [6]: df[["B", "C"]].all(bool_only=True)
Other :class:`DataFrame` reductions with ``numeric_only=None`` will also avoid
this pathological behavior (:issue:`37827`):

.. ipython:: python

    df = pd.DataFrame({"A": [0, 1, 2], "B": ["a", "b", "c"]}, dtype=object)
*Previous behavior*:

.. code-block:: ipython

    In [3]: df.mean()
    Out[3]: Series([], dtype: float64)

    In [4]: df[["A"]].mean()
    Out[4]:
    A    1.0
    dtype: float64
*New behavior*:

.. ipython:: python

    df.mean()
    df[["A"]].mean()
Moreover, :class:`DataFrame` reductions with ``numeric_only=None`` will now be
consistent with their :class:`Series` counterparts. In particular, for
reductions where the :class:`Series` method raises ``TypeError``, the
:class:`DataFrame` reduction will now consider that column non-numeric
instead of casting to a NumPy dtype, which may have different semantics
(:issue:`36076`, :issue:`28949`, :issue:`21020`).

.. ipython:: python

    ser = pd.Series([0, 1], dtype="category", name="A")
    df = ser.to_frame()
*Previous behavior*:

.. code-block:: ipython

    In [5]: df.any()
    Out[5]:
    A    True
    dtype: bool
*New behavior*:

.. ipython:: python

    df.any()
.. _whatsnew_120.api_breaking.python:

Increased minimum version for Python
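The whatsnew entry above only demonstrates any(); the same Series/DataFrame consistency applies to the other reductions it cites, for example median (GH#21020). A minimal sketch, mirroring the test_median_categorical_dtype_nuisance_column test added further below and assuming a pandas build with this change applied (later releases changed the numeric_only defaults again):

    import pandas as pd

    df = pd.DataFrame({"A": pd.Categorical([1, 2, 2, 2, 3]), "B": [1, 2, 2, 2, 3]})

    # Series.median raises for a Categorical column ...
    try:
        df["A"].median()
    except TypeError as err:
        print(err)  # "... does not implement reduction 'median'"

    # ... so with numeric_only=None the DataFrame reduction now drops "A"
    # instead of coercing it, matching the Series behavior.
    print(df.median())  # only "B" contributes: B    2.0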
32 changes: 5 additions & 27 deletions pandas/core/frame.py
@@ -8765,7 +8765,7 @@ def _get_data() -> DataFrame:
                data = self._get_bool_data()
            return data

-        if numeric_only is not None:
+        if numeric_only is not None or axis == 0:
            # For numeric_only non-None and axis non-None, we know
            # which blocks to use and no try/except is needed.
            # For numeric_only=None only the case with axis==0 and no object
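The comment above captures the crux of the change: with numeric_only=None and axis=0, the reduction is dispatched block by block and any column whose reduction raises TypeError is treated as a nuisance column and dropped. A rough sketch of those semantics only, not of the actual BlockManager code; reduce_dropping_nuisance is a made-up helper name:

    import pandas as pd

    def reduce_dropping_nuisance(df: pd.DataFrame, op: str) -> pd.Series:
        # Column by column: keep a result only if the Series reduction
        # succeeds; a TypeError marks the column as a nuisance column.
        results = {}
        for col in df.columns:
            try:
                results[col] = getattr(df[col], op)()
            except TypeError:
                continue
        return pd.Series(results)

    df = pd.DataFrame({"A": [0, 1, 2], "B": ["a", "b", "c"]}, dtype=object)
    print(reduce_dropping_nuisance(df, "mean"))  # A    1.0  ("B" is dropped)
    print(df.mean())  # with this change (pandas 1.2), the real method agrees;
                      # later releases changed the numeric_only defaults again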
@@ -8790,36 +8790,14 @@ def _get_data() -> DataFrame:
                # GH#35865 careful to cast explicitly to object
                nvs = coerce_to_dtypes(out.values, df.dtypes.iloc[np.sort(indexer)])
                out[:] = np.array(nvs, dtype=object)
+            if axis == 0 and len(self) == 0 and name in ["sum", "prod"]:
+                # Even if we are object dtype, follow numpy and return
+                # float64, see test_apply_funcs_over_empty
+                out = out.astype(np.float64)
            return out

        assert numeric_only is None

-        if not self._is_homogeneous_type or self._mgr.any_extension_types:
-            # try to avoid self.values call
-
-            if filter_type is None and axis == 0:
-                # operate column-wise
-
-                # numeric_only must be None here, as other cases caught above
-
-                # this can end up with a non-reduction
-                # but not always. if the types are mixed
-                # with datelike then need to make sure a series
-
-                # we only end up here if we have not specified
-                # numeric_only and yet we have tried a
-                # column-by-column reduction, where we have mixed type.
-                # So let's just do what we can
-                from pandas.core.apply import frame_apply
-
-                opa = frame_apply(
-                    self, func=func, result_type="expand", ignore_failures=True
-                )
-                result = opa.get_result()
-                if result.ndim == self.ndim:
-                    result = result.iloc[0].rename(None)
-                return result
-
        data = self
        values = data.values
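The four lines added above special-case sum/prod on a length-0 frame: even when the columns are object dtype, the result is cast to float64 to follow NumPy. A small illustration of the intent, assuming a pandas build with this change; the dtype, not the (zero) values, is the point:

    import numpy as np
    import pandas as pd

    # A length-0 frame: with no data, the columns are object dtype.
    df = pd.DataFrame(columns=["A", "B"])

    # NumPy's reduction over an empty float array comes back as float64 ...
    print(np.sum(np.empty((0, 2)), axis=0).dtype)  # float64

    # ... and per the branch above, the empty-frame sum is expected to match.
    print(df.sum().dtype)  # expected: float64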

23 changes: 18 additions & 5 deletions pandas/core/internals/blocks.py
@@ -464,7 +464,9 @@ def _split(self) -> List["Block"]:
            new_blocks.append(nb)
        return new_blocks

-    def split_and_operate(self, mask, f, inplace: bool) -> List["Block"]:
+    def split_and_operate(
+        self, mask, f, inplace: bool, ignore_failures: bool = False
+    ) -> List["Block"]:
        """
        split the block per-column, and apply the callable f
        per-column, return a new block for each. Handle
@@ -474,7 +476,8 @@ def split_and_operate(self, mask, f, inplace: bool) -> List["Block"]:
        ----------
        mask : 2-d boolean mask
        f : callable accepting (1d-mask, 1d values, indexer)
-        inplace : boolean
+        inplace : bool
+        ignore_failures : bool, default False

        Returns
        -------
@@ -513,8 +516,16 @@ def make_a_block(nv, ref_loc):
            v = new_values[i]

            # need a new block
-            if m.any():
-                nv = f(m, v, i)
+            if m.any() or m.size == 0:
+                # Apply our function; we may ignore_failures if this is a
+                # reduction that is dropping nuisance columns GH#37827
+                try:
+                    nv = f(m, v, i)
+                except TypeError:
+                    if ignore_failures:
+                        continue
+                    else:
+                        raise
            else:
                nv = v if inplace else v.copy()

@@ -2459,7 +2470,9 @@ def mask_func(mask, values, inplace):
                    values = values.reshape(1, -1)
                return func(values)

-            return self.split_and_operate(None, mask_func, False)
+            return self.split_and_operate(
+                None, mask_func, False, ignore_failures=ignore_failures
+            )

        try:
            res = func(values)
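For context on the split_and_operate change above: an object block stores several columns in a single 2-D array (one column per row), so the reduction is applied per column, and with ignore_failures=True a column whose reducer raises TypeError is skipped instead of failing the whole block. A standalone sketch of that idea; split_and_reduce is a hypothetical helper, not pandas API:

    import numpy as np

    def split_and_reduce(values, func, ignore_failures=False):
        # Reduce each row (i.e. each column of the original frame) separately,
        # optionally skipping the ones whose reducer raises TypeError.
        kept, results = [], []
        for i in range(values.shape[0]):
            try:
                res = func(values[i])
            except TypeError:
                if ignore_failures:
                    continue
                raise
            kept.append(i)
            results.append(res)
        return kept, results

    block = np.array([[0, 1, 2], ["a", "b", "c"]], dtype=object)
    print(split_and_reduce(block, np.mean, ignore_failures=True))
    # ([0], [1.0]) -- the string column raised TypeError and was dropped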
119 changes: 115 additions & 4 deletions pandas/tests/frame/test_reductions.py
@@ -12,6 +12,7 @@
from pandas import (
    Categorical,
    DataFrame,
+    Index,
    MultiIndex,
    Series,
    Timestamp,
@@ -1083,10 +1084,12 @@ def test_any_all_bool_only(self):
            pytest.param(np.any, {"A": Series([0, 1], dtype="m8[ns]")}, True),
            pytest.param(np.all, {"A": Series([1, 2], dtype="m8[ns]")}, True),
            pytest.param(np.any, {"A": Series([1, 2], dtype="m8[ns]")}, True),
-            (np.all, {"A": Series([0, 1], dtype="category")}, False),
-            (np.any, {"A": Series([0, 1], dtype="category")}, True),
+            # np.all on Categorical raises, so the reduction drops the
+            # column, so all is being done on an empty Series, so is True
+            (np.all, {"A": Series([0, 1], dtype="category")}, True),
+            (np.any, {"A": Series([0, 1], dtype="category")}, False),
            (np.all, {"A": Series([1, 2], dtype="category")}, True),
-            (np.any, {"A": Series([1, 2], dtype="category")}, True),
+            (np.any, {"A": Series([1, 2], dtype="category")}, False),
            # Mix GH#21484
            pytest.param(
                np.all,
@@ -1308,6 +1311,114 @@ def test_frame_any_with_timedelta(self):
        tm.assert_series_equal(result, expected)


class TestNuisanceColumns:
    @pytest.mark.parametrize("method", ["any", "all"])
    def test_any_all_categorical_dtype_nuisance_column(self, method):
        # GH#36076 DataFrame should match Series behavior
        ser = Series([0, 1], dtype="category", name="A")
        df = ser.to_frame()

        # Double-check the Series behavior is to raise
        with pytest.raises(TypeError, match="does not implement reduction"):
            getattr(ser, method)()

        with pytest.raises(TypeError, match="does not implement reduction"):
            getattr(np, method)(ser)

        with pytest.raises(TypeError, match="does not implement reduction"):
            getattr(df, method)(bool_only=False)

        # With bool_only=None, operating on this column raises and is ignored,
        # so we expect an empty result.
        result = getattr(df, method)(bool_only=None)
        expected = Series([], index=Index([]), dtype=bool)
        tm.assert_series_equal(result, expected)

        result = getattr(np, method)(df, axis=0)
        tm.assert_series_equal(result, expected)

    def test_median_categorical_dtype_nuisance_column(self):
        # GH#21020 DataFrame.median should match Series.median
        df = DataFrame({"A": Categorical([1, 2, 2, 2, 3])})
        ser = df["A"]

        # Double-check the Series behavior is to raise
        with pytest.raises(TypeError, match="does not implement reduction"):
            ser.median()

        with pytest.raises(TypeError, match="does not implement reduction"):
            df.median(numeric_only=False)

        result = df.median()
        expected = Series([], index=Index([]), dtype=np.float64)
        tm.assert_series_equal(result, expected)

        # same thing, but with an additional non-categorical column
        df["B"] = df["A"].astype(int)

        with pytest.raises(TypeError, match="does not implement reduction"):
            df.median(numeric_only=False)

        result = df.median()
        expected = Series([2.0], index=["B"])
        tm.assert_series_equal(result, expected)

        # TODO: np.median(df, axis=0) gives np.array([2.0, 2.0]) instead
        # of expected.values

    @pytest.mark.parametrize("method", ["min", "max"])
    def test_min_max_categorical_dtype_non_ordered_nuisance_column(self, method):
        # GH#28949 DataFrame.min should behave like Series.min
        cat = Categorical(["a", "b", "c", "b"], ordered=False)
        ser = Series(cat)
        df = ser.to_frame("A")

        # Double-check the Series behavior
        with pytest.raises(TypeError, match="is not ordered for operation"):
            getattr(ser, method)()

        with pytest.raises(TypeError, match="is not ordered for operation"):
            getattr(np, method)(ser)

        with pytest.raises(TypeError, match="is not ordered for operation"):
            getattr(df, method)(numeric_only=False)

        result = getattr(df, method)()
        expected = Series([], index=Index([]), dtype=np.float64)
        tm.assert_series_equal(result, expected)

        result = getattr(np, method)(df)
        tm.assert_series_equal(result, expected)

        # same thing, but with an additional non-categorical column
        df["B"] = df["A"].astype(object)
        result = getattr(df, method)()
        if method == "min":
            expected = Series(["a"], index=["B"])
        else:
            expected = Series(["c"], index=["B"])
        tm.assert_series_equal(result, expected)

        result = getattr(np, method)(df)
        tm.assert_series_equal(result, expected)

    def test_reduction_object_block_splits_nuisance_columns(self):
        # GH#37827
        df = DataFrame({"A": [0, 1, 2], "B": ["a", "b", "c"]}, dtype=object)

        # We should only exclude "B", not "A"
        result = df.mean()
        expected = Series([1.0], index=["A"])
        tm.assert_series_equal(result, expected)

        # Same behavior but heterogeneous dtype
        df["C"] = df["A"].astype(int) + 4

        result = df.mean()
        expected = Series([1.0, 5.0], index=["A", "C"])
        tm.assert_series_equal(result, expected)


def test_sum_timedelta64_skipna_false():
    # GH#17235
    arr = np.arange(8).astype(np.int64).view("m8[s]").reshape(4, 2)
@@ -1352,6 +1463,6 @@ def test_minmax_extensionarray(method, numeric_only):
df = DataFrame({"Int64": ser})
result = getattr(df, method)(numeric_only=numeric_only)
expected = Series(
[getattr(int64_info, method)], index=pd.Index(["Int64"], dtype="object")
[getattr(int64_info, method)], index=Index(["Int64"], dtype="object")
)
tm.assert_series_equal(result, expected)
