Skip to content

Commit

Permalink
API: Use new NA scalar in BooleanArray (#29961)
Browse files Browse the repository at this point in the history
  • Loading branch information
jorisvandenbossche committed Dec 4, 2019
1 parent 8d5e778 commit e73ed45
Show file tree
Hide file tree
Showing 5 changed files with 129 additions and 64 deletions.
3 changes: 2 additions & 1 deletion pandas/_libs/missing.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -289,7 +289,8 @@ cdef inline bint is_null_period(v):
def _create_binary_propagating_op(name, divmod=False):

def method(self, other):
if other is C_NA or isinstance(other, str) or isinstance(other, numbers.Number):
if (other is C_NA or isinstance(other, str)
or isinstance(other, (numbers.Number, np.bool_))):
if divmod:
return NA, NA
else:
Expand Down
95 changes: 52 additions & 43 deletions pandas/core/arrays/boolean.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,10 @@
import numbers
from typing import TYPE_CHECKING, Type
from typing import TYPE_CHECKING, Any, Tuple, Type
import warnings

import numpy as np

from pandas._libs import lib
from pandas._libs import lib, missing as libmissing
from pandas.compat import set_function_name

from pandas.core.dtypes.base import ExtensionDtype
Expand Down Expand Up @@ -61,13 +61,13 @@ class BooleanDtype(ExtensionDtype):
@property
def na_value(self) -> "Scalar":
"""
BooleanDtype uses :attr:`numpy.nan` as the missing NA value.
BooleanDtype uses :attr:`pandas.NA` as the missing NA value.
.. warning::
`na_value` may change in a future release.
"""
return np.nan
return libmissing.NA

@property
def type(self) -> Type:
Expand Down Expand Up @@ -223,7 +223,7 @@ class BooleanArray(ExtensionArray, ExtensionOpsMixin):
>>> pd.array([True, False, None], dtype="boolean")
<BooleanArray>
[True, False, NaN]
[True, False, NA]
Length: 3, dtype: boolean
"""

Expand Down Expand Up @@ -262,17 +262,17 @@ def _from_sequence(cls, scalars, dtype=None, copy: bool = False):
values, mask = coerce_to_array(scalars, copy=copy)
return BooleanArray(values, mask)

def _values_for_factorize(self) -> Tuple[np.ndarray, Any]:
data = self._data.astype("int8")
data[self._mask] = -1
return data, -1

@classmethod
def _from_factorized(cls, values, original: "BooleanArray"):
return cls._from_sequence(values, dtype=original.dtype)

def _formatter(self, boxed=False):
def fmt(x):
if isna(x):
return "NaN"
return str(x)

return fmt
return str

def __getitem__(self, item):
if is_integer(item):
Expand All @@ -281,25 +281,29 @@ def __getitem__(self, item):
return self._data[item]
return type(self)(self._data[item], self._mask[item])

def _coerce_to_ndarray(self, force_bool: bool = False):
def _coerce_to_ndarray(self, dtype=None, na_value: "Scalar" = libmissing.NA):
"""
Coerce to an ndarary of object dtype or bool dtype (if force_bool=True).
Parameters
----------
force_bool : bool, default False
If True, return bool array or raise error if not possible (in
presence of missing values)
dtype : dtype, default object
The numpy dtype to convert to
na_value : scalar, optional
Scalar missing value indicator to use in numpy array. Defaults
to the native missing value indicator of this array (pd.NA).
"""
if force_bool:
if dtype is None:
dtype = object
if is_bool_dtype(dtype):
if not self.isna().any():
return self._data
else:
raise ValueError(
"cannot convert to bool numpy array in presence of missing values"
)
data = self._data.astype(object)
data[self._mask] = self._na_value
data = self._data.astype(dtype)
data[self._mask] = na_value
return data

__array_priority__ = 1000 # higher than ndarray so ops dispatch to us
Expand All @@ -309,15 +313,8 @@ def __array__(self, dtype=None):
the array interface, return my values
We return an object array here to preserve our scalar values
"""
if dtype is not None:
if is_bool_dtype(dtype):
return self._coerce_to_ndarray(force_bool=True)
# TODO can optimize this to not go through object dtype for
# numeric dtypes
arr = self._coerce_to_ndarray()
return arr.astype(dtype, copy=False)
# by default (no dtype specified), return an object array
return self._coerce_to_ndarray()
return self._coerce_to_ndarray(dtype=dtype)

def __arrow_array__(self, type=None):
"""
Expand Down Expand Up @@ -483,8 +480,17 @@ def astype(self, dtype, copy=True):
return IntegerArray(
self._data.astype(dtype.numpy_dtype), self._mask.copy(), copy=False
)
# for integer, error if there are missing values
if is_integer_dtype(dtype):
if self.isna().any():
raise ValueError("cannot convert NA to integer")
# for float dtype, ensure we use np.nan before casting (numpy cannot
# deal with pd.NA)
na_value = self._na_value
if is_float_dtype(dtype):
na_value = np.nan
# coerce
data = self._coerce_to_ndarray()
data = self._coerce_to_ndarray(na_value=na_value)
return astype_nansafe(data, dtype, copy=None)

def value_counts(self, dropna=True):
Expand Down Expand Up @@ -594,8 +600,6 @@ def logical_method(self, other):

@classmethod
def _create_comparison_method(cls, op):
op_name = op.__name__

def cmp_method(self, other):

if isinstance(other, (ABCDataFrame, ABCSeries, ABCIndexClass)):
Expand All @@ -617,21 +621,26 @@ def cmp_method(self, other):
if len(self) != len(other):
raise ValueError("Lengths must match to compare")

# numpy will show a DeprecationWarning on invalid elementwise
# comparisons, this will raise in the future
with warnings.catch_warnings():
warnings.filterwarnings("ignore", "elementwise", FutureWarning)
with np.errstate(all="ignore"):
result = op(self._data, other)

# nans propagate
if mask is None:
mask = self._mask
if other is libmissing.NA:
# numpy does not handle pd.NA well as "other" scalar (it returns
# a scalar False instead of an array)
result = np.zeros_like(self._data)
mask = np.ones_like(self._data)
else:
mask = self._mask | mask
# numpy will show a DeprecationWarning on invalid elementwise
# comparisons, this will raise in the future
with warnings.catch_warnings():
warnings.filterwarnings("ignore", "elementwise", FutureWarning)
with np.errstate(all="ignore"):
result = op(self._data, other)

# nans propagate
if mask is None:
mask = self._mask.copy()
else:
mask = self._mask | mask

result[mask] = op_name == "ne"
return BooleanArray(result, np.zeros(len(result), dtype=bool), copy=False)
return BooleanArray(result, mask, copy=False)

name = "__{name}__".format(name=op.__name__)
return set_function_name(cmp_method, name, cls)
Expand All @@ -643,7 +652,7 @@ def _reduce(self, name, skipna=True, **kwargs):
# coerce to a nan-aware float if needed
if mask.any():
data = self._data.astype("float64")
data[mask] = self._na_value
data[mask] = np.nan

op = getattr(nanops, "nan" + name)
result = op(data, axis=0, skipna=skipna, mask=mask, **kwargs)
Expand Down
77 changes: 62 additions & 15 deletions pandas/tests/arrays/test_boolean.py
Original file line number Diff line number Diff line change
Expand Up @@ -101,13 +101,14 @@ def test_to_boolean_array_all_none():
@pytest.mark.parametrize(
"a, b",
[
([True, None], [True, np.nan]),
([None], [np.nan]),
([None, np.nan], [np.nan, np.nan]),
([np.nan, np.nan], [np.nan, np.nan]),
([True, False, None, np.nan, pd.NA], [True, False, None, None, None]),
([True, np.nan], [True, None]),
([True, pd.NA], [True, None]),
([np.nan, np.nan], [None, None]),
(np.array([np.nan, np.nan], dtype=float), [None, None]),
],
)
def test_to_boolean_array_none_is_nan(a, b):
def test_to_boolean_array_missing_indicators(a, b):
result = pd.array(a, dtype="boolean")
expected = pd.array(b, dtype="boolean")
tm.assert_extension_array_equal(result, expected)
Expand Down Expand Up @@ -216,7 +217,7 @@ def test_coerce_to_numpy_array():
# with missing values -> object dtype
arr = pd.array([True, False, None], dtype="boolean")
result = np.array(arr)
expected = np.array([True, False, None], dtype="object")
expected = np.array([True, False, pd.NA], dtype="object")
tm.assert_numpy_array_equal(result, expected)

# also with no missing values -> object dtype
Expand All @@ -238,12 +239,11 @@ def test_coerce_to_numpy_array():
def test_astype():
# with missing values
arr = pd.array([True, False, None], dtype="boolean")
msg = "cannot convert float NaN to"

with pytest.raises(ValueError, match=msg):
with pytest.raises(ValueError, match="cannot convert NA to integer"):
arr.astype("int64")

with pytest.raises(ValueError, match=msg):
with pytest.raises(ValueError, match="cannot convert float NaN to"):
arr.astype("bool")

result = arr.astype("float64")
Expand Down Expand Up @@ -280,6 +280,14 @@ def test_astype_to_integer_array():
tm.assert_extension_array_equal(result, expected)


@pytest.mark.parametrize("na", [None, np.nan, pd.NA])
def test_setitem_missing_values(na):
arr = pd.array([True, False, None], dtype="boolean")
expected = pd.array([True, None, None], dtype="boolean")
arr[1] = na
tm.assert_extension_array_equal(arr, expected)


@pytest.mark.parametrize(
"ufunc", [np.add, np.logical_or, np.logical_and, np.logical_xor]
)
Expand Down Expand Up @@ -406,9 +414,8 @@ def _compare_other(self, data, op_name, other):
# array
result = pd.Series(op(data, other))
expected = pd.Series(op(data._data, other), dtype="boolean")

# fill the nan locations
expected[data._mask] = op_name == "__ne__"
# propagate NAs
expected[data._mask] = pd.NA

tm.assert_series_equal(result, expected)

Expand All @@ -419,9 +426,8 @@ def _compare_other(self, data, op_name, other):
expected = pd.Series(data._data)
expected = op(expected, other)
expected = expected.astype("boolean")

# fill the nan locations
expected[data._mask] = op_name == "__ne__"
# propagate NAs
expected[data._mask] = pd.NA

tm.assert_series_equal(result, expected)

Expand All @@ -438,6 +444,47 @@ def test_compare_array(self, data, all_compare_operators):
other = pd.Series([True] * len(data))
self._compare_other(data, op_name, other)

@pytest.mark.parametrize("other", [True, False, pd.NA])
def test_scalar(self, other, all_compare_operators):
op = self.get_op_from_name(all_compare_operators)
a = pd.array([True, False, None], dtype="boolean")

result = op(a, other)

if other is pd.NA:
expected = pd.array([None, None, None], dtype="boolean")
else:
values = op(a._data, other)
expected = BooleanArray(values, a._mask, copy=True)
tm.assert_extension_array_equal(result, expected)

# ensure we haven't mutated anything inplace
result[0] = None
tm.assert_extension_array_equal(
a, pd.array([True, False, None], dtype="boolean")
)

def test_array(self, all_compare_operators):
op = self.get_op_from_name(all_compare_operators)
a = pd.array([True] * 3 + [False] * 3 + [None] * 3, dtype="boolean")
b = pd.array([True, False, None] * 3, dtype="boolean")

result = op(a, b)

values = op(a._data, b._data)
mask = a._mask | b._mask
expected = BooleanArray(values, mask)
tm.assert_extension_array_equal(result, expected)

# ensure we haven't mutated anything inplace
result[0] = None
tm.assert_extension_array_equal(
a, pd.array([True] * 3 + [False] * 3 + [None] * 3, dtype="boolean")
)
tm.assert_extension_array_equal(
b, pd.array([True, False, None] * 3, dtype="boolean")
)


class TestArithmeticOps(BaseOpsUtil):
def test_error(self, data, all_arithmetic_operators):
Expand Down
14 changes: 11 additions & 3 deletions pandas/tests/extension/test_boolean.py
Original file line number Diff line number Diff line change
Expand Up @@ -60,13 +60,13 @@ def data_missing_for_sorting(dtype):

@pytest.fixture
def na_cmp():
# we are np.nan
return lambda x, y: np.isnan(x) and np.isnan(y)
# we are pd.NA
return lambda x, y: x is pd.NA and y is pd.NA


@pytest.fixture
def na_value():
return np.nan
return pd.NA


@pytest.fixture
Expand Down Expand Up @@ -160,6 +160,14 @@ def check_opname(self, s, op_name, other, exc=None):
def _compare_other(self, s, data, op_name, other):
self.check_opname(s, op_name, other)

@pytest.mark.skip(reason="Tested in tests/arrays/test_boolean.py")
def test_compare_scalar(self, data, all_compare_operators):
pass

@pytest.mark.skip(reason="Tested in tests/arrays/test_boolean.py")
def test_compare_array(self, data, all_compare_operators):
pass


class TestReshaping(base.BaseReshapingTests):
pass
Expand Down
4 changes: 2 additions & 2 deletions pandas/tests/scalar/test_na_scalar.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,15 +48,15 @@ def test_arithmetic_ops(all_arithmetic_functions):

def test_comparison_ops():

for other in [NA, 1, 1.0, "a", np.int64(1), np.nan]:
for other in [NA, 1, 1.0, "a", np.int64(1), np.nan, np.bool_(True)]:
assert (NA == other) is NA
assert (NA != other) is NA
assert (NA > other) is NA
assert (NA >= other) is NA
assert (NA < other) is NA
assert (NA <= other) is NA

if isinstance(other, np.int64):
if isinstance(other, (np.int64, np.bool_)):
# for numpy scalars we get a deprecation warning and False as result
# for equality or error for larger/lesser than
continue
Expand Down

0 comments on commit e73ed45

Please sign in to comment.