API: Use new NA scalar in BooleanArray (#29961)

pandas-dev · Dec 4, 2019 · e73ed45 · e73ed45
1 parent 8d5e778
commit e73ed45
Show file tree

Hide file tree

Showing 5 changed files with 129 additions and 64 deletions.
diff --git a/pandas/_libs/missing.pyx b/pandas/_libs/missing.pyx
@@ -289,7 +289,8 @@ cdef inline bint is_null_period(v):
 def _create_binary_propagating_op(name, divmod=False):
 
     def method(self, other):
-        if other is C_NA or isinstance(other, str) or isinstance(other, numbers.Number):
+        if (other is C_NA or isinstance(other, str)
+                or isinstance(other, (numbers.Number, np.bool_))):
             if divmod:
                 return NA, NA
             else:

diff --git a/pandas/core/arrays/boolean.py b/pandas/core/arrays/boolean.py
@@ -1,10 +1,10 @@
 import numbers
-from typing import TYPE_CHECKING, Type
+from typing import TYPE_CHECKING, Any, Tuple, Type
 import warnings
 
 import numpy as np
 
-from pandas._libs import lib
+from pandas._libs import lib, missing as libmissing
 from pandas.compat import set_function_name
 
 from pandas.core.dtypes.base import ExtensionDtype
@@ -61,13 +61,13 @@ class BooleanDtype(ExtensionDtype):
     @property
     def na_value(self) -> "Scalar":
         """
-        BooleanDtype uses :attr:`numpy.nan` as the missing NA value.
+        BooleanDtype uses :attr:`pandas.NA` as the missing NA value.
 
         .. warning::
 
            `na_value` may change in a future release.
         """
-        return np.nan
+        return libmissing.NA
 
     @property
     def type(self) -> Type:
@@ -223,7 +223,7 @@ class BooleanArray(ExtensionArray, ExtensionOpsMixin):
 
     >>> pd.array([True, False, None], dtype="boolean")
     <BooleanArray>
-    [True, False, NaN]
+    [True, False, NA]
     Length: 3, dtype: boolean
     """
 
@@ -262,17 +262,17 @@ def _from_sequence(cls, scalars, dtype=None, copy: bool = False):
         values, mask = coerce_to_array(scalars, copy=copy)
         return BooleanArray(values, mask)
 
+    def _values_for_factorize(self) -> Tuple[np.ndarray, Any]:
+        data = self._data.astype("int8")
+        data[self._mask] = -1
+        return data, -1
+
     @classmethod
     def _from_factorized(cls, values, original: "BooleanArray"):
         return cls._from_sequence(values, dtype=original.dtype)
 
     def _formatter(self, boxed=False):
-        def fmt(x):
-            if isna(x):
-                return "NaN"
-            return str(x)
-
-        return fmt
+        return str
 
     def __getitem__(self, item):
         if is_integer(item):
@@ -281,25 +281,29 @@ def __getitem__(self, item):
             return self._data[item]
         return type(self)(self._data[item], self._mask[item])
 
-    def _coerce_to_ndarray(self, force_bool: bool = False):
+    def _coerce_to_ndarray(self, dtype=None, na_value: "Scalar" = libmissing.NA):
         """
-        Coerce to an ndarary of object dtype or bool dtype (if force_bool=True).
+        Coerce to an ndarray of the requested numpy dtype (object by default).
 
         Parameters
         ----------
-        force_bool : bool, default False
-            If True, return bool array or raise error if not possible (in
-            presence of missing values)
+        dtype : dtype, default object
+            The numpy dtype to convert to
+        na_value : scalar, optional
+            Scalar missing value indicator to use in numpy array. Defaults
+            to the native missing value indicator of this array (pd.NA).
         """
-        if force_bool:
+        if dtype is None:
+            dtype = object
+        if is_bool_dtype(dtype):
             if not self.isna().any():
                 return self._data
             else:
                 raise ValueError(
                     "cannot convert to bool numpy array in presence of missing values"
                 )
-        data = self._data.astype(object)
-        data[self._mask] = self._na_value
+        data = self._data.astype(dtype)
+        data[self._mask] = na_value
         return data
 
     __array_priority__ = 1000  # higher than ndarray so ops dispatch to us
@@ -309,15 +313,8 @@ def __array__(self, dtype=None):
         the array interface, return my values
         We return an object array here to preserve our scalar values
         """
-        if dtype is not None:
-            if is_bool_dtype(dtype):
-                return self._coerce_to_ndarray(force_bool=True)
-            # TODO can optimize this to not go through object dtype for
-            # numeric dtypes
-            arr = self._coerce_to_ndarray()
-            return arr.astype(dtype, copy=False)
         # by default (no dtype specified), return an object array
-        return self._coerce_to_ndarray()
+        return self._coerce_to_ndarray(dtype=dtype)
 
     def __arrow_array__(self, type=None):
         """
@@ -483,8 +480,17 @@ def astype(self, dtype, copy=True):
             return IntegerArray(
                 self._data.astype(dtype.numpy_dtype), self._mask.copy(), copy=False
             )
+        # for integer, error if there are missing values
+        if is_integer_dtype(dtype):
+            if self.isna().any():
+                raise ValueError("cannot convert NA to integer")
+        # for float dtype, ensure we use np.nan before casting (numpy cannot
+        # deal with pd.NA)
+        na_value = self._na_value
+        if is_float_dtype(dtype):
+            na_value = np.nan
         # coerce
-        data = self._coerce_to_ndarray()
+        data = self._coerce_to_ndarray(na_value=na_value)
         return astype_nansafe(data, dtype, copy=None)
 
     def value_counts(self, dropna=True):
@@ -594,8 +600,6 @@ def logical_method(self, other):
 
     @classmethod
     def _create_comparison_method(cls, op):
-        op_name = op.__name__
-
         def cmp_method(self, other):
 
             if isinstance(other, (ABCDataFrame, ABCSeries, ABCIndexClass)):
@@ -617,21 +621,26 @@ def cmp_method(self, other):
                 if len(self) != len(other):
                     raise ValueError("Lengths must match to compare")
 
-            # numpy will show a DeprecationWarning on invalid elementwise
-            # comparisons, this will raise in the future
-            with warnings.catch_warnings():
-                warnings.filterwarnings("ignore", "elementwise", FutureWarning)
-                with np.errstate(all="ignore"):
-                    result = op(self._data, other)
-
-            # nans propagate
-            if mask is None:
-                mask = self._mask
+            if other is libmissing.NA:
+                # numpy does not handle pd.NA well as "other" scalar (it returns
+                # a scalar False instead of an array)
+                result = np.zeros_like(self._data)
+                mask = np.ones_like(self._data)
             else:
-                mask = self._mask | mask
+                # numpy will show a DeprecationWarning on invalid elementwise
+                # comparisons, this will raise in the future
+                with warnings.catch_warnings():
+                    warnings.filterwarnings("ignore", "elementwise", FutureWarning)
+                    with np.errstate(all="ignore"):
+                        result = op(self._data, other)
+
+                # NAs propagate
+                if mask is None:
+                    mask = self._mask.copy()
+                else:
+                    mask = self._mask | mask
 
-            result[mask] = op_name == "ne"
-            return BooleanArray(result, np.zeros(len(result), dtype=bool), copy=False)
+            return BooleanArray(result, mask, copy=False)
 
         name = "__{name}__".format(name=op.__name__)
         return set_function_name(cmp_method, name, cls)
@@ -643,7 +652,7 @@ def _reduce(self, name, skipna=True, **kwargs):
         # coerce to a nan-aware float if needed
         if mask.any():
             data = self._data.astype("float64")
-            data[mask] = self._na_value
+            data[mask] = np.nan
 
         op = getattr(nanops, "nan" + name)
         result = op(data, axis=0, skipna=skipna, mask=mask, **kwargs)

diff --git a/pandas/tests/arrays/test_boolean.py b/pandas/tests/arrays/test_boolean.py
@@ -101,13 +101,14 @@ def test_to_boolean_array_all_none():
 @pytest.mark.parametrize(
     "a, b",
     [
-        ([True, None], [True, np.nan]),
-        ([None], [np.nan]),
-        ([None, np.nan], [np.nan, np.nan]),
-        ([np.nan, np.nan], [np.nan, np.nan]),
+        ([True, False, None, np.nan, pd.NA], [True, False, None, None, None]),
+        ([True, np.nan], [True, None]),
+        ([True, pd.NA], [True, None]),
+        ([np.nan, np.nan], [None, None]),
+        (np.array([np.nan, np.nan], dtype=float), [None, None]),
     ],
 )
-def test_to_boolean_array_none_is_nan(a, b):
+def test_to_boolean_array_missing_indicators(a, b):
     result = pd.array(a, dtype="boolean")
     expected = pd.array(b, dtype="boolean")
     tm.assert_extension_array_equal(result, expected)
@@ -216,7 +217,7 @@ def test_coerce_to_numpy_array():
     # with missing values -> object dtype
     arr = pd.array([True, False, None], dtype="boolean")
     result = np.array(arr)
-    expected = np.array([True, False, None], dtype="object")
+    expected = np.array([True, False, pd.NA], dtype="object")
     tm.assert_numpy_array_equal(result, expected)
 
     # also with no missing values -> object dtype
@@ -238,12 +239,11 @@ def test_coerce_to_numpy_array():
 def test_astype():
     # with missing values
     arr = pd.array([True, False, None], dtype="boolean")
-    msg = "cannot convert float NaN to"
 
-    with pytest.raises(ValueError, match=msg):
+    with pytest.raises(ValueError, match="cannot convert NA to integer"):
         arr.astype("int64")
 
-    with pytest.raises(ValueError, match=msg):
+    with pytest.raises(ValueError, match="cannot convert float NaN to"):
         arr.astype("bool")
 
     result = arr.astype("float64")
@@ -280,6 +280,14 @@ def test_astype_to_integer_array():
     tm.assert_extension_array_equal(result, expected)
 
 
+@pytest.mark.parametrize("na", [None, np.nan, pd.NA])
+def test_setitem_missing_values(na):
+    arr = pd.array([True, False, None], dtype="boolean")
+    expected = pd.array([True, None, None], dtype="boolean")
+    arr[1] = na
+    tm.assert_extension_array_equal(arr, expected)
+
+
 @pytest.mark.parametrize(
     "ufunc", [np.add, np.logical_or, np.logical_and, np.logical_xor]
 )
@@ -406,9 +414,8 @@ def _compare_other(self, data, op_name, other):
         # array
         result = pd.Series(op(data, other))
         expected = pd.Series(op(data._data, other), dtype="boolean")
-
-        # fill the nan locations
-        expected[data._mask] = op_name == "__ne__"
+        # propagate NAs
+        expected[data._mask] = pd.NA
 
         tm.assert_series_equal(result, expected)
 
@@ -419,9 +426,8 @@ def _compare_other(self, data, op_name, other):
         expected = pd.Series(data._data)
         expected = op(expected, other)
         expected = expected.astype("boolean")
-
-        # fill the nan locations
-        expected[data._mask] = op_name == "__ne__"
+        # propagate NAs
+        expected[data._mask] = pd.NA
 
         tm.assert_series_equal(result, expected)
 
@@ -438,6 +444,47 @@ def test_compare_array(self, data, all_compare_operators):
         other = pd.Series([True] * len(data))
         self._compare_other(data, op_name, other)
 
+    @pytest.mark.parametrize("other", [True, False, pd.NA])
+    def test_scalar(self, other, all_compare_operators):
+        op = self.get_op_from_name(all_compare_operators)
+        a = pd.array([True, False, None], dtype="boolean")
+
+        result = op(a, other)
+
+        if other is pd.NA:
+            expected = pd.array([None, None, None], dtype="boolean")
+        else:
+            values = op(a._data, other)
+            expected = BooleanArray(values, a._mask, copy=True)
+        tm.assert_extension_array_equal(result, expected)
+
+        # ensure we haven't mutated anything inplace
+        result[0] = None
+        tm.assert_extension_array_equal(
+            a, pd.array([True, False, None], dtype="boolean")
+        )
+
+    def test_array(self, all_compare_operators):
+        op = self.get_op_from_name(all_compare_operators)
+        a = pd.array([True] * 3 + [False] * 3 + [None] * 3, dtype="boolean")
+        b = pd.array([True, False, None] * 3, dtype="boolean")
+
+        result = op(a, b)
+
+        values = op(a._data, b._data)
+        mask = a._mask | b._mask
+        expected = BooleanArray(values, mask)
+        tm.assert_extension_array_equal(result, expected)
+
+        # ensure we haven't mutated anything inplace
+        result[0] = None
+        tm.assert_extension_array_equal(
+            a, pd.array([True] * 3 + [False] * 3 + [None] * 3, dtype="boolean")
+        )
+        tm.assert_extension_array_equal(
+            b, pd.array([True, False, None] * 3, dtype="boolean")
+        )
+
 
 class TestArithmeticOps(BaseOpsUtil):
     def test_error(self, data, all_arithmetic_operators):

diff --git a/pandas/tests/extension/test_boolean.py b/pandas/tests/extension/test_boolean.py
@@ -60,13 +60,13 @@ def data_missing_for_sorting(dtype):
 
 @pytest.fixture
 def na_cmp():
-    # we are np.nan
-    return lambda x, y: np.isnan(x) and np.isnan(y)
+    # we are pd.NA
+    return lambda x, y: x is pd.NA and y is pd.NA
 
 
 @pytest.fixture
 def na_value():
-    return np.nan
+    return pd.NA
 
 
 @pytest.fixture
@@ -160,6 +160,14 @@ def check_opname(self, s, op_name, other, exc=None):
     def _compare_other(self, s, data, op_name, other):
         self.check_opname(s, op_name, other)
 
+    @pytest.mark.skip(reason="Tested in tests/arrays/test_boolean.py")
+    def test_compare_scalar(self, data, all_compare_operators):
+        pass
+
+    @pytest.mark.skip(reason="Tested in tests/arrays/test_boolean.py")
+    def test_compare_array(self, data, all_compare_operators):
+        pass
+
 
 class TestReshaping(base.BaseReshapingTests):
     pass

diff --git a/pandas/tests/scalar/test_na_scalar.py b/pandas/tests/scalar/test_na_scalar.py
@@ -48,15 +48,15 @@ def test_arithmetic_ops(all_arithmetic_functions):
 
 def test_comparison_ops():
 
-    for other in [NA, 1, 1.0, "a", np.int64(1), np.nan]:
+    for other in [NA, 1, 1.0, "a", np.int64(1), np.nan, np.bool_(True)]:
         assert (NA == other) is NA
         assert (NA != other) is NA
         assert (NA > other) is NA
         assert (NA >= other) is NA
         assert (NA < other) is NA
         assert (NA <= other) is NA
 
-        if isinstance(other, np.int64):
+        if isinstance(other, (np.int64, np.bool_)):
             # for numpy scalars we get a deprecation warning and False as result
             # for equality or error for larger/lesser than
             continue