Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Use new NA scalar in BooleanArray #29961

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion pandas/_libs/missing.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -289,7 +289,8 @@ cdef inline bint is_null_period(v):
def _create_binary_propagating_op(name, divmod=False):

def method(self, other):
if other is C_NA or isinstance(other, str) or isinstance(other, numbers.Number):
if (other is C_NA or isinstance(other, str)
or isinstance(other, (numbers.Number, np.bool_))):
if divmod:
return NA, NA
else:
Expand Down
95 changes: 52 additions & 43 deletions pandas/core/arrays/boolean.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,10 @@
import numbers
from typing import TYPE_CHECKING, Type
from typing import TYPE_CHECKING, Any, Tuple, Type
import warnings

import numpy as np

from pandas._libs import lib
from pandas._libs import lib, missing as libmissing
from pandas.compat import set_function_name

from pandas.core.dtypes.base import ExtensionDtype
Expand Down Expand Up @@ -61,13 +61,13 @@ class BooleanDtype(ExtensionDtype):
@property
def na_value(self) -> "Scalar":
"""
BooleanDtype uses :attr:`numpy.nan` as the missing NA value.
TomAugspurger marked this conversation as resolved.
Show resolved Hide resolved
BooleanDtype uses :attr:`pandas.NA` as the missing NA value.

.. warning::

`na_value` may change in a future release.
"""
return np.nan
return libmissing.NA

@property
def type(self) -> Type:
Expand Down Expand Up @@ -223,7 +223,7 @@ class BooleanArray(ExtensionArray, ExtensionOpsMixin):

>>> pd.array([True, False, None], dtype="boolean")
<BooleanArray>
[True, False, NaN]
[True, False, NA]
Length: 3, dtype: boolean
"""

Expand Down Expand Up @@ -262,17 +262,17 @@ def _from_sequence(cls, scalars, dtype=None, copy: bool = False):
values, mask = coerce_to_array(scalars, copy=copy)
return BooleanArray(values, mask)

def _values_for_factorize(self) -> Tuple[np.ndarray, Any]:
data = self._data.astype("int8")
data[self._mask] = -1
return data, -1

@classmethod
def _from_factorized(cls, values, original: "BooleanArray"):
return cls._from_sequence(values, dtype=original.dtype)

def _formatter(self, boxed=False):
def fmt(x):
if isna(x):
return "NaN"
return str(x)

return fmt
return str

def __getitem__(self, item):
if is_integer(item):
Expand All @@ -281,25 +281,29 @@ def __getitem__(self, item):
return self._data[item]
return type(self)(self._data[item], self._mask[item])

def _coerce_to_ndarray(self, force_bool: bool = False):
def _coerce_to_ndarray(self, dtype=None, na_value: "Scalar" = libmissing.NA):
"""
Coerce to an ndarary of object dtype or bool dtype (if force_bool=True).

Parameters
----------
force_bool : bool, default False
If True, return bool array or raise error if not possible (in
presence of missing values)
dtype : dtype, default object
The numpy dtype to convert to
na_value : scalar, optional
Scalar missing value indicator to use in numpy array. Defaults
to the native missing value indicator of this array (pd.NA).
"""
if force_bool:
if dtype is None:
dtype = object
if is_bool_dtype(dtype):
if not self.isna().any():
return self._data
else:
raise ValueError(
"cannot convert to bool numpy array in presence of missing values"
)
data = self._data.astype(object)
data[self._mask] = self._na_value
data = self._data.astype(dtype)
data[self._mask] = na_value
return data

__array_priority__ = 1000 # higher than ndarray so ops dispatch to us
Expand All @@ -309,15 +313,8 @@ def __array__(self, dtype=None):
the array interface, return my values
We return an object array here to preserve our scalar values
"""
if dtype is not None:
if is_bool_dtype(dtype):
return self._coerce_to_ndarray(force_bool=True)
# TODO can optimize this to not go through object dtype for
# numeric dtypes
arr = self._coerce_to_ndarray()
return arr.astype(dtype, copy=False)
# by default (no dtype specified), return an object array
return self._coerce_to_ndarray()
return self._coerce_to_ndarray(dtype=dtype)

def __arrow_array__(self, type=None):
"""
Expand Down Expand Up @@ -483,8 +480,17 @@ def astype(self, dtype, copy=True):
return IntegerArray(
self._data.astype(dtype.numpy_dtype), self._mask.copy(), copy=False
)
# for integer, error if there are missing values
if is_integer_dtype(dtype):
if self.isna().any():
raise ValueError("cannot convert NA to integer")
# for float dtype, ensure we use np.nan before casting (numpy cannot
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Pending the discussion in #30038.

# deal with pd.NA)
na_value = self._na_value
if is_float_dtype(dtype):
na_value = np.nan
# coerce
data = self._coerce_to_ndarray()
data = self._coerce_to_ndarray(na_value=na_value)
return astype_nansafe(data, dtype, copy=None)

def value_counts(self, dropna=True):
Expand Down Expand Up @@ -594,8 +600,6 @@ def logical_method(self, other):

@classmethod
def _create_comparison_method(cls, op):
op_name = op.__name__

def cmp_method(self, other):

if isinstance(other, (ABCDataFrame, ABCSeries, ABCIndexClass)):
Expand All @@ -617,21 +621,26 @@ def cmp_method(self, other):
if len(self) != len(other):
raise ValueError("Lengths must match to compare")

# numpy will show a DeprecationWarning on invalid elementwise
# comparisons, this will raise in the future
with warnings.catch_warnings():
warnings.filterwarnings("ignore", "elementwise", FutureWarning)
with np.errstate(all="ignore"):
result = op(self._data, other)

# nans propagate
if mask is None:
mask = self._mask
if other is libmissing.NA:
# numpy does not handle pd.NA well as "other" scalar (it returns
# a scalar False instead of an array)
result = np.zeros_like(self._data)
mask = np.ones_like(self._data)
else:
mask = self._mask | mask
# numpy will show a DeprecationWarning on invalid elementwise
# comparisons, this will raise in the future
with warnings.catch_warnings():
warnings.filterwarnings("ignore", "elementwise", FutureWarning)
with np.errstate(all="ignore"):
result = op(self._data, other)

# nans propagate
if mask is None:
mask = self._mask.copy()
else:
mask = self._mask | mask

result[mask] = op_name == "ne"
return BooleanArray(result, np.zeros(len(result), dtype=bool), copy=False)
return BooleanArray(result, mask, copy=False)

name = "__{name}__".format(name=op.__name__)
return set_function_name(cmp_method, name, cls)
Expand All @@ -643,7 +652,7 @@ def _reduce(self, name, skipna=True, **kwargs):
# coerce to a nan-aware float if needed
if mask.any():
data = self._data.astype("float64")
data[mask] = self._na_value
data[mask] = np.nan

op = getattr(nanops, "nan" + name)
result = op(data, axis=0, skipna=skipna, mask=mask, **kwargs)
Expand Down
77 changes: 62 additions & 15 deletions pandas/tests/arrays/test_boolean.py
Original file line number Diff line number Diff line change
Expand Up @@ -101,13 +101,14 @@ def test_to_boolean_array_all_none():
@pytest.mark.parametrize(
"a, b",
[
([True, None], [True, np.nan]),
([None], [np.nan]),
([None, np.nan], [np.nan, np.nan]),
([np.nan, np.nan], [np.nan, np.nan]),
([True, False, None, np.nan, pd.NA], [True, False, None, None, None]),
([True, np.nan], [True, None]),
([True, pd.NA], [True, None]),
([np.nan, np.nan], [None, None]),
(np.array([np.nan, np.nan], dtype=float), [None, None]),
],
)
def test_to_boolean_array_none_is_nan(a, b):
def test_to_boolean_array_missing_indicators(a, b):
result = pd.array(a, dtype="boolean")
expected = pd.array(b, dtype="boolean")
tm.assert_extension_array_equal(result, expected)
Expand Down Expand Up @@ -216,7 +217,7 @@ def test_coerce_to_numpy_array():
# with missing values -> object dtype
arr = pd.array([True, False, None], dtype="boolean")
result = np.array(arr)
expected = np.array([True, False, None], dtype="object")
expected = np.array([True, False, pd.NA], dtype="object")
tm.assert_numpy_array_equal(result, expected)

# also with no missing values -> object dtype
Expand All @@ -238,12 +239,11 @@ def test_coerce_to_numpy_array():
def test_astype():
# with missing values
arr = pd.array([True, False, None], dtype="boolean")
msg = "cannot convert float NaN to"

with pytest.raises(ValueError, match=msg):
with pytest.raises(ValueError, match="cannot convert NA to integer"):
arr.astype("int64")

with pytest.raises(ValueError, match=msg):
with pytest.raises(ValueError, match="cannot convert float NaN to"):
arr.astype("bool")

result = arr.astype("float64")
Expand Down Expand Up @@ -280,6 +280,14 @@ def test_astype_to_integer_array():
tm.assert_extension_array_equal(result, expected)


@pytest.mark.parametrize("na", [None, np.nan, pd.NA])
def test_setitem_missing_values(na):
arr = pd.array([True, False, None], dtype="boolean")
expected = pd.array([True, None, None], dtype="boolean")
arr[1] = na
tm.assert_extension_array_equal(arr, expected)


@pytest.mark.parametrize(
"ufunc", [np.add, np.logical_or, np.logical_and, np.logical_xor]
)
Expand Down Expand Up @@ -406,9 +414,8 @@ def _compare_other(self, data, op_name, other):
# array
result = pd.Series(op(data, other))
expected = pd.Series(op(data._data, other), dtype="boolean")

# fill the nan locations
expected[data._mask] = op_name == "__ne__"
# propagate NAs
expected[data._mask] = pd.NA

tm.assert_series_equal(result, expected)

Expand All @@ -419,9 +426,8 @@ def _compare_other(self, data, op_name, other):
expected = pd.Series(data._data)
expected = op(expected, other)
expected = expected.astype("boolean")

# fill the nan locations
expected[data._mask] = op_name == "__ne__"
# propagate NAs
expected[data._mask] = pd.NA

tm.assert_series_equal(result, expected)

Expand All @@ -438,6 +444,47 @@ def test_compare_array(self, data, all_compare_operators):
other = pd.Series([True] * len(data))
self._compare_other(data, op_name, other)

@pytest.mark.parametrize("other", [True, False, pd.NA])
def test_scalar(self, other, all_compare_operators):
op = self.get_op_from_name(all_compare_operators)
a = pd.array([True, False, None], dtype="boolean")

result = op(a, other)

if other is pd.NA:
expected = pd.array([None, None, None], dtype="boolean")
else:
values = op(a._data, other)
expected = BooleanArray(values, a._mask, copy=True)
tm.assert_extension_array_equal(result, expected)

# ensure we haven't mutated anything inplace
result[0] = None
tm.assert_extension_array_equal(
a, pd.array([True, False, None], dtype="boolean")
)

def test_array(self, all_compare_operators):
op = self.get_op_from_name(all_compare_operators)
a = pd.array([True] * 3 + [False] * 3 + [None] * 3, dtype="boolean")
b = pd.array([True, False, None] * 3, dtype="boolean")

result = op(a, b)

values = op(a._data, b._data)
mask = a._mask | b._mask
expected = BooleanArray(values, mask)
tm.assert_extension_array_equal(result, expected)

# ensure we haven't mutated anything inplace
result[0] = None
tm.assert_extension_array_equal(
a, pd.array([True] * 3 + [False] * 3 + [None] * 3, dtype="boolean")
)
tm.assert_extension_array_equal(
b, pd.array([True, False, None] * 3, dtype="boolean")
)


class TestArithmeticOps(BaseOpsUtil):
def test_error(self, data, all_arithmetic_operators):
Expand Down
14 changes: 11 additions & 3 deletions pandas/tests/extension/test_boolean.py
Original file line number Diff line number Diff line change
Expand Up @@ -60,13 +60,13 @@ def data_missing_for_sorting(dtype):

@pytest.fixture
def na_cmp():
# we are np.nan
return lambda x, y: np.isnan(x) and np.isnan(y)
# we are pd.NA
return lambda x, y: x is pd.NA and y is pd.NA


@pytest.fixture
def na_value():
return np.nan
return pd.NA


@pytest.fixture
Expand Down Expand Up @@ -160,6 +160,14 @@ def check_opname(self, s, op_name, other, exc=None):
def _compare_other(self, s, data, op_name, other):
self.check_opname(s, op_name, other)

@pytest.mark.skip(reason="Tested in tests/arrays/test_boolean.py")
def test_compare_scalar(self, data, all_compare_operators):
pass

@pytest.mark.skip(reason="Tested in tests/arrays/test_boolean.py")
def test_compare_array(self, data, all_compare_operators):
pass


class TestReshaping(base.BaseReshapingTests):
pass
Expand Down
4 changes: 2 additions & 2 deletions pandas/tests/scalar/test_na_scalar.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,15 +48,15 @@ def test_arithmetic_ops(all_arithmetic_functions):

def test_comparison_ops():

for other in [NA, 1, 1.0, "a", np.int64(1), np.nan]:
for other in [NA, 1, 1.0, "a", np.int64(1), np.nan, np.bool_(True)]:
assert (NA == other) is NA
assert (NA != other) is NA
assert (NA > other) is NA
assert (NA >= other) is NA
assert (NA < other) is NA
assert (NA <= other) is NA

if isinstance(other, np.int64):
if isinstance(other, (np.int64, np.bool_)):
# for numpy scalars we get a deprecation warning and False as result
# for equality or error for larger/lesser than
continue
Expand Down