BUG: DataFrame(floatdata, dtype=inty) does unsafe casting (#41578)

pandas-dev · May 21, 2021 · 6dff995 · 6dff995
1 parent 7424f8a
commit 6dff995
Show file tree

Hide file tree

Showing 5 changed files with 64 additions and 9 deletions.
diff --git a/doc/source/whatsnew/v1.3.0.rst b/doc/source/whatsnew/v1.3.0.rst
@@ -834,6 +834,7 @@ Missing
 - Bug in :func:`isna`, and :meth:`Series.isna`, :meth:`Index.isna`, :meth:`DataFrame.isna` (and the corresponding ``notna`` functions) not recognizing ``Decimal("NaN")`` objects (:issue:`39409`)
 - Bug in :meth:`DataFrame.fillna` not accepting dictionary for ``downcast`` keyword (:issue:`40809`)
 - Bug in :func:`isna` not returning a copy of the mask for nullable types, causing any subsequent mask modification to change the original array (:issue:`40935`)
+- Bug in :class:`DataFrame` construction with float data containing ``NaN`` and an integer ``dtype`` casting the ``NaN`` to a meaningless integer instead of retaining it (:issue:`26919`)
 
 MultiIndex
 ^^^^^^^^^^

diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py
@@ -40,6 +40,7 @@
     DtypeObj,
     Scalar,
 )
+from pandas.errors import IntCastingNaNError
 from pandas.util._exceptions import find_stack_level
 from pandas.util._validators import validate_bool_kwarg
 
@@ -1167,9 +1168,7 @@ def astype_nansafe(
         raise TypeError(f"cannot astype a timedelta from [{arr.dtype}] to [{dtype}]")
 
     elif np.issubdtype(arr.dtype, np.floating) and np.issubdtype(dtype, np.integer):
-
-        if not np.isfinite(arr).all():
-            raise ValueError("Cannot convert non-finite values (NA or inf) to integer")
+        return astype_float_to_int_nansafe(arr, dtype, copy)
 
     elif is_object_dtype(arr):
 
@@ -1207,6 +1206,19 @@ def astype_nansafe(
     return arr.astype(dtype, copy=copy)
 
 
+def astype_float_to_int_nansafe(
+    values: np.ndarray, dtype: np.dtype, copy: bool
+) -> np.ndarray:
+    """
+    astype with a check preventing converting NaN to a meaningless integer value.
+    """
+    if not np.isfinite(values).all():
+        raise IntCastingNaNError(
+            "Cannot convert non-finite values (NA or inf) to integer"
+        )
+    return values.astype(dtype, copy=copy)
+
+
 def astype_array(values: ArrayLike, dtype: DtypeObj, copy: bool = False) -> ArrayLike:
     """
     Cast array (ndarray or ExtensionArray) to the new dtype.
@@ -1946,6 +1958,17 @@ def construct_1d_ndarray_preserving_na(
         ):
             # TODO(numpy#12550): special-case can be removed
             subarr = construct_1d_object_array_from_listlike(list(values))
+        elif (
+            dtype is not None
+            and dtype.kind in ["i", "u"]
+            and isinstance(values, np.ndarray)
+            and values.dtype.kind == "f"
+        ):
+            # error: Argument 2 to "astype_float_to_int_nansafe" has incompatible
+            # type "Union[dtype[Any], ExtensionDtype]"; expected "dtype[Any]"
+            return astype_float_to_int_nansafe(
+                values, dtype, copy=copy  # type: ignore[arg-type]
+            )
         else:
             # error: Argument "dtype" to "array" has incompatible type
             # "Union[dtype[Any], ExtensionDtype, None]"; expected "Union[dtype[Any],

diff --git a/pandas/core/internals/construction.py b/pandas/core/internals/construction.py
@@ -21,6 +21,7 @@
     DtypeObj,
     Manager,
 )
+from pandas.errors import IntCastingNaNError
 
 from pandas.core.dtypes.cast import (
     construct_1d_arraylike_from_scalar,
@@ -315,10 +316,11 @@ def ndarray_to_mgr(
                 values = construct_1d_ndarray_preserving_na(
                     flat, dtype=dtype, copy=False
                 )
-            except Exception as err:
-                # e.g. ValueError when trying to cast object dtype to float64
-                msg = f"failed to cast to '{dtype}' (Exception was: {err})"
-                raise ValueError(msg) from err
+            except IntCastingNaNError:
+                # following Series, we ignore the dtype and retain floating
+                # values instead of casting nans to meaningless ints
+                pass
+
         values = values.reshape(shape)
 
     # _prep_ndarray ensures that values.ndim == 2 at this point

diff --git a/pandas/errors/__init__.py b/pandas/errors/__init__.py
@@ -12,6 +12,15 @@
 )
 
 
+class IntCastingNaNError(ValueError):
+    """
+    Raised when attempting an astype operation on an array with NaN to an integer
+    dtype.
+    """
+
+    pass
+
+
 class NullFrequencyError(ValueError):
     """
     Error raised when a null `freq` attribute is used in an operation

diff --git a/pandas/tests/frame/test_constructors.py b/pandas/tests/frame/test_constructors.py
@@ -66,6 +66,18 @@
 
 
 class TestDataFrameConstructors:
+    def test_construct_ndarray_with_nas_and_int_dtype(self):
+        # GH#26919 match Series by not casting np.nan to meaningless int
+        arr = np.array([[1, np.nan], [2, 3]])
+        df = DataFrame(arr, dtype="i8")
+        assert df.values.dtype == arr.dtype
+        assert isna(df.iloc[0, 1])
+
+        # check this matches Series behavior
+        ser = Series(arr[0], dtype="i8", name=0)
+        expected = df.iloc[0]
+        tm.assert_series_equal(ser, expected)
+
     def test_construct_from_list_of_datetimes(self):
         df = DataFrame([datetime.now(), datetime.now()])
         assert df[0].dtype == np.dtype("M8[ns]")
@@ -851,9 +863,17 @@ def _check_basic_constructor(self, empty):
         assert len(frame.index) == 3
         assert len(frame.columns) == 1
 
-        # cast type
         frame = DataFrame(mat, columns=["A", "B", "C"], index=[1, 2], dtype=np.int64)
-        assert frame.values.dtype == np.int64
+        if empty is np.ones:
+            # passing dtype casts
+            assert frame.values.dtype == np.int64
+        else:
+            # i.e. ma.masked_all
+            # Since we have NaNs, refuse to cast to int dtype, which would take NaN
+            #  to meaningless integers.  This matches Series behavior.  GH#26919
+            assert frame.isna().all().all()
+            assert frame.values.dtype == np.float64
+            assert isna(frame.values).all()
 
         # wrong size axis labels
         msg = r"Shape of passed values is \(2, 3\), indices imply \(1, 3\)"