From f8ae8cd642aae5451521ab23011a24a7e41da5b5 Mon Sep 17 00:00:00 2001 From: Brock Date: Wed, 19 May 2021 18:05:54 -0700 Subject: [PATCH 1/2] BUG: DataFrame(floatdata, dtype=inty) does unsafe casting --- doc/source/whatsnew/v1.3.0.rst | 1 + pandas/core/dtypes/cast.py | 25 ++++++++++++++++++++++--- pandas/core/internals/construction.py | 10 ++++++---- pandas/errors/__init__.py | 9 +++++++++ pandas/tests/frame/test_constructors.py | 24 ++++++++++++++++++++++-- 5 files changed, 60 insertions(+), 9 deletions(-) diff --git a/doc/source/whatsnew/v1.3.0.rst b/doc/source/whatsnew/v1.3.0.rst index 1eb22436204a8..bc74b2722d29c 100644 --- a/doc/source/whatsnew/v1.3.0.rst +++ b/doc/source/whatsnew/v1.3.0.rst @@ -808,6 +808,7 @@ Missing - Bug in :func:`isna`, and :meth:`Series.isna`, :meth:`Index.isna`, :meth:`DataFrame.isna` (and the corresponding ``notna`` functions) not recognizing ``Decimal("NaN")`` objects (:issue:`39409`) - Bug in :meth:`DataFrame.fillna` not accepting dictionary for ``downcast`` keyword (:issue:`40809`) - Bug in :func:`isna` not returning a copy of the mask for nullable types, causing any subsequent mask modification to change the original array (:issue:`40935`) +- Bug in :class:`DataFrame` construction with float data containing ``NaN`` and an integer ``dtype`` casting instead of retaining the ``NaN`` (:issue:`26919`) MultiIndex ^^^^^^^^^^ diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py index 94cffe8fb840d..1e30b4311d20e 100644 --- a/pandas/core/dtypes/cast.py +++ b/pandas/core/dtypes/cast.py @@ -40,6 +40,7 @@ DtypeObj, Scalar, ) +from pandas.errors import IntCastingNaNError from pandas.util._exceptions import find_stack_level from pandas.util._validators import validate_bool_kwarg @@ -1167,9 +1168,7 @@ def astype_nansafe( raise TypeError(f"cannot astype a timedelta from [{arr.dtype}] to [{dtype}]") elif np.issubdtype(arr.dtype, np.floating) and np.issubdtype(dtype, np.integer): - - if not np.isfinite(arr).all(): - raise 
ValueError("Cannot convert non-finite values (NA or inf) to integer") + return astype_float_to_int_nansafe(arr, dtype, copy) elif is_object_dtype(arr): @@ -1207,6 +1206,19 @@ def astype_nansafe( return arr.astype(dtype, copy=copy) +def astype_float_to_int_nansafe( + values: np.ndarray, dtype: np.dtype, copy: bool +) -> np.ndarray: + """ + astype with a check preventing converting NaN to a meaningless integer value. + """ + if not np.isfinite(values).all(): + raise IntCastingNaNError( + "Cannot convert non-finite values (NA or inf) to integer" + ) + return values.astype(dtype, copy=copy) + + def astype_array(values: ArrayLike, dtype: DtypeObj, copy: bool = False) -> ArrayLike: """ Cast array (ndarray or ExtensionArray) to the new dtype. @@ -1946,6 +1958,13 @@ def construct_1d_ndarray_preserving_na( ): # TODO(numpy#12550): special-case can be removed subarr = construct_1d_object_array_from_listlike(list(values)) + elif ( + dtype is not None + and dtype.kind in ["i", "u"] + and isinstance(values, np.ndarray) + and values.dtype.kind == "f" + ): + return astype_float_to_int_nansafe(values, dtype, copy=copy) else: # error: Argument "dtype" to "array" has incompatible type # "Union[dtype[Any], ExtensionDtype, None]"; expected "Union[dtype[Any], diff --git a/pandas/core/internals/construction.py b/pandas/core/internals/construction.py index 06d30d6ed72e8..6baa1072a0396 100644 --- a/pandas/core/internals/construction.py +++ b/pandas/core/internals/construction.py @@ -21,6 +21,7 @@ DtypeObj, Manager, ) +from pandas.errors import IntCastingNaNError from pandas.core.dtypes.cast import ( construct_1d_arraylike_from_scalar, @@ -315,10 +316,11 @@ def ndarray_to_mgr( values = construct_1d_ndarray_preserving_na( flat, dtype=dtype, copy=False ) - except Exception as err: - # e.g.
ValueError when trying to cast object dtype to float64 - msg = f"failed to cast to '{dtype}' (Exception was: {err})" - raise ValueError(msg) from err + except IntCastingNaNError: + # following Series, we ignore the dtype and retain floating + # values instead of casting nans to meaningless ints + pass + values = values.reshape(shape) # _prep_ndarray ensures that values.ndim == 2 at this point diff --git a/pandas/errors/__init__.py b/pandas/errors/__init__.py index a0f6ddfd84d7b..92516a1609f10 100644 --- a/pandas/errors/__init__.py +++ b/pandas/errors/__init__.py @@ -12,6 +12,15 @@ ) +class IntCastingNaNError(ValueError): + """ + raised when attempting an astype operation on an array with NaN to an integer + dtype. + """ + + pass + + class NullFrequencyError(ValueError): """ Error raised when a null `freq` attribute is used in an operation diff --git a/pandas/tests/frame/test_constructors.py b/pandas/tests/frame/test_constructors.py index 6e9991ff17ac3..94cb85a0945f2 100644 --- a/pandas/tests/frame/test_constructors.py +++ b/pandas/tests/frame/test_constructors.py @@ -66,6 +66,18 @@ class TestDataFrameConstructors: + def test_construct_ndarray_with_nas_and_int_dtype(self): + # GH#26919 match Series by not casting np.nan to meaningless int + arr = np.array([[1, np.nan], [2, 3]]) + df = DataFrame(arr, dtype="i8") + assert df.values.dtype == arr.dtype + assert isna(df.iloc[0, 1]) + + # check this matches Series behavior + ser = Series(arr[0], dtype="i8", name=0) + expected = df.iloc[0] + tm.assert_series_equal(ser, expected) + def test_construct_from_list_of_datetimes(self): df = DataFrame([datetime.now(), datetime.now()]) assert df[0].dtype == np.dtype("M8[ns]") @@ -851,9 +863,17 @@ def _check_basic_constructor(self, empty): assert len(frame.index) == 3 assert len(frame.columns) == 1 - # cast type frame = DataFrame(mat, columns=["A", "B", "C"], index=[1, 2], dtype=np.int64) - assert frame.values.dtype == np.int64 + if empty is np.ones: + # passing dtype casts + assert 
frame.values.dtype == np.int64 + else: + # i.e. ma.masked_all + # Since we have NaNs, refuse to cast to int dtype, which would take NaN + # to meaningless integers. This matches Series behavior. GH#26919 + assert frame.isna().all().all() + assert frame.values.dtype == np.float64 + assert isna(frame.values).all() # wrong size axis labels msg = r"Shape of passed values is \(2, 3\), indices imply \(1, 3\)" From 8b40192bd8987c6d1ba5decf3871dddceb957269 Mon Sep 17 00:00:00 2001 From: Brock Date: Wed, 19 May 2021 18:18:00 -0700 Subject: [PATCH 2/2] mypy fixup --- pandas/core/dtypes/cast.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py index 1e30b4311d20e..b4bdb2f9c4b53 100644 --- a/pandas/core/dtypes/cast.py +++ b/pandas/core/dtypes/cast.py @@ -1964,7 +1964,11 @@ def construct_1d_ndarray_preserving_na( and isinstance(values, np.ndarray) and values.dtype.kind == "f" ): - return astype_float_to_int_nansafe(values, dtype, copy=copy) + # Argument 2 to "astype_float_to_int_nansafe" has incompatible + # type "Union[dtype[Any], ExtensionDtype]"; expected "dtype[Any]" + return astype_float_to_int_nansafe( + values, dtype, copy=copy # type: ignore[arg-type] + ) else: # error: Argument "dtype" to "array" has incompatible type # "Union[dtype[Any], ExtensionDtype, None]"; expected "Union[dtype[Any],