From 86db6862ef4d5a74784621c55703a415f1cd4cda Mon Sep 17 00:00:00 2001 From: Brock Date: Fri, 28 Oct 2022 07:46:45 -0700 Subject: [PATCH] DEPR: object-dtype bool_only --- doc/source/whatsnew/v2.0.0.rst | 2 ++ pandas/core/dtypes/inference.py | 42 ------------------------ pandas/core/internals/array_manager.py | 3 +- pandas/core/internals/blocks.py | 3 +- pandas/tests/frame/test_reductions.py | 28 ++++++---------- pandas/tests/internals/test_internals.py | 9 ++--- 6 files changed, 17 insertions(+), 70 deletions(-) diff --git a/doc/source/whatsnew/v2.0.0.rst b/doc/source/whatsnew/v2.0.0.rst index 252c444b2e60c..29436b5095776 100644 --- a/doc/source/whatsnew/v2.0.0.rst +++ b/doc/source/whatsnew/v2.0.0.rst @@ -257,6 +257,8 @@ Removal of prior version deprecations/changes - Changed behavior of :class:`DataFrame` constructor when passed a ``dtype`` (other than int) that the data cannot be cast to; it now raises instead of silently ignoring the dtype (:issue:`41733`) - Changed the behavior of :class:`Series` constructor, it will no longer infer a datetime64 or timedelta64 dtype from string entries (:issue:`41731`) - Changed behavior of :class:`Index` constructor when passed a ``SparseArray`` or ``SparseDtype`` to retain that dtype instead of casting to ``numpy.ndarray`` (:issue:`43930`) +- Changed behavior of :meth:`DataFrame.any` and :meth:`DataFrame.all` with ``bool_only=True``; object-dtype columns with all-bool values will no longer be included. To include such columns, manually cast them to ``bool`` dtype first (:issue:`46188`) +- .. --------------------------------------------------------------------------- .. 
_whatsnew_200.performance: diff --git a/pandas/core/dtypes/inference.py b/pandas/core/dtypes/inference.py index 893e4a9be58ef..de240a39e2951 100644 --- a/pandas/core/dtypes/inference.py +++ b/pandas/core/dtypes/inference.py @@ -6,13 +6,10 @@ from numbers import Number import re from typing import Pattern -import warnings import numpy as np from pandas._libs import lib -from pandas._typing import ArrayLike -from pandas.util._exceptions import find_stack_level is_bool = lib.is_bool @@ -425,42 +422,3 @@ def is_dataclass(item): return is_dataclass(item) and not isinstance(item, type) except ImportError: return False - - -def is_inferred_bool_dtype(arr: ArrayLike) -> bool: - """ - Check if this is a ndarray[bool] or an ndarray[object] of bool objects. - - Parameters - ---------- - arr : np.ndarray or ExtensionArray - - Returns - ------- - bool - - Notes - ----- - This does not include the special treatment is_bool_dtype uses for - Categorical. - """ - if not isinstance(arr, np.ndarray): - return False - - dtype = arr.dtype - if dtype == np.dtype(bool): - return True - elif dtype == np.dtype("object"): - result = lib.is_bool_array(arr) - if result: - # GH#46188 - warnings.warn( - "In a future version, object-dtype columns with all-bool values " - "will not be included in reductions with bool_only=True. 
" - "Explicitly cast to bool dtype instead.", - FutureWarning, - stacklevel=find_stack_level(), - ) - return result - - return False diff --git a/pandas/core/internals/array_manager.py b/pandas/core/internals/array_manager.py index 4aa16257b0802..f6e50d658a580 100644 --- a/pandas/core/internals/array_manager.py +++ b/pandas/core/internals/array_manager.py @@ -52,7 +52,6 @@ ABCDataFrame, ABCSeries, ) -from pandas.core.dtypes.inference import is_inferred_bool_dtype from pandas.core.dtypes.missing import ( array_equals, isna, @@ -488,7 +487,7 @@ def get_bool_data(self: T, copy: bool = False) -> T: copy : bool, default False Whether to copy the blocks """ - return self._get_data_subset(is_inferred_bool_dtype) + return self._get_data_subset(lambda x: x.dtype == np.dtype(bool)) def get_numeric_data(self: T, copy: bool = False) -> T: """ diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index 83c1ca0084724..f0fdd9a58720e 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -70,7 +70,6 @@ ABCPandasArray, ABCSeries, ) -from pandas.core.dtypes.inference import is_inferred_bool_dtype from pandas.core.dtypes.missing import ( is_valid_na_for_dtype, isna, @@ -194,7 +193,7 @@ def is_bool(self) -> bool: """ We can be bool if a) we are bool dtype or b) object dtype with bool objects. 
""" - return is_inferred_bool_dtype(self.values) + return self.values.dtype == np.dtype(bool) @final def external_values(self): diff --git a/pandas/tests/frame/test_reductions.py b/pandas/tests/frame/test_reductions.py index 963ed24cb434b..0ce17a050da82 100644 --- a/pandas/tests/frame/test_reductions.py +++ b/pandas/tests/frame/test_reductions.py @@ -1280,7 +1280,6 @@ def test_any_all_object(self): assert result is False def test_any_all_object_bool_only(self): - msg = "object-dtype columns with all-bool values" df = DataFrame({"A": ["foo", 2], "B": [True, False]}).astype(object) df._consolidate_inplace() @@ -1291,36 +1290,29 @@ def test_any_all_object_bool_only(self): # The underlying bug is in DataFrame._get_bool_data, so we check # that while we're here - with tm.assert_produces_warning(FutureWarning, match=msg): - res = df._get_bool_data() - expected = df[["B", "C"]] + res = df._get_bool_data() + expected = df[["C"]] tm.assert_frame_equal(res, expected) - with tm.assert_produces_warning(FutureWarning, match=msg): - res = df.all(bool_only=True, axis=0) - expected = Series([False, True], index=["B", "C"]) + res = df.all(bool_only=True, axis=0) + expected = Series([True], index=["C"]) tm.assert_series_equal(res, expected) # operating on a subset of columns should not produce a _larger_ Series - with tm.assert_produces_warning(FutureWarning, match=msg): - res = df[["B", "C"]].all(bool_only=True, axis=0) + res = df[["B", "C"]].all(bool_only=True, axis=0) tm.assert_series_equal(res, expected) - with tm.assert_produces_warning(FutureWarning, match=msg): - assert not df.all(bool_only=True, axis=None) + assert df.all(bool_only=True, axis=None) - with tm.assert_produces_warning(FutureWarning, match=msg): - res = df.any(bool_only=True, axis=0) - expected = Series([True, True], index=["B", "C"]) + res = df.any(bool_only=True, axis=0) + expected = Series([True], index=["C"]) tm.assert_series_equal(res, expected) # operating on a subset of columns should not produce a 
_larger_ Series - with tm.assert_produces_warning(FutureWarning, match=msg): - res = df[["B", "C"]].any(bool_only=True, axis=0) + res = df[["C"]].any(bool_only=True, axis=0) tm.assert_series_equal(res, expected) - with tm.assert_produces_warning(FutureWarning, match=msg): - assert df.any(bool_only=True, axis=None) + assert df.any(bool_only=True, axis=None) @pytest.mark.parametrize("method", ["any", "all"]) def test_any_all_level_axis_none_raises(self, method): diff --git a/pandas/tests/internals/test_internals.py b/pandas/tests/internals/test_internals.py index b30b27f5bae1a..b64220d90f9a2 100644 --- a/pandas/tests/internals/test_internals.py +++ b/pandas/tests/internals/test_internals.py @@ -795,7 +795,6 @@ def test_get_numeric_data(self, using_copy_on_write): ) def test_get_bool_data(self, using_copy_on_write): - msg = "object-dtype columns with all-bool values" mgr = create_mgr( "int: int; float: float; complex: complex;" "str: object; bool: bool; obj: object; dt: datetime", @@ -803,9 +802,8 @@ def test_get_bool_data(self, using_copy_on_write): ) mgr.iset(6, np.array([True, False, True], dtype=np.object_)) - with tm.assert_produces_warning(FutureWarning, match=msg): - bools = mgr.get_bool_data() - tm.assert_index_equal(bools.items, Index(["bool", "dt"])) + bools = mgr.get_bool_data() + tm.assert_index_equal(bools.items, Index(["bool"])) tm.assert_almost_equal( mgr.iget(mgr.items.get_loc("bool")).internal_values(), bools.iget(bools.items.get_loc("bool")).internal_values(), @@ -824,8 +822,7 @@ def test_get_bool_data(self, using_copy_on_write): ) # Check sharing - with tm.assert_produces_warning(FutureWarning, match=msg): - bools2 = mgr.get_bool_data(copy=True) + bools2 = mgr.get_bool_data(copy=True) bools2.iset(0, np.array([False, True, False])) if using_copy_on_write: tm.assert_numpy_array_equal(