Skip to content

Commit

Permalink
BUG: DataFrame(floatdata, dtype=inty) does unsafe casting (#41578)
Browse files Browse the repository at this point in the history
  • Loading branch information
jbrockmendel committed May 21, 2021
1 parent 7424f8a commit 6dff995
Show file tree
Hide file tree
Showing 5 changed files with 64 additions and 9 deletions.
1 change: 1 addition & 0 deletions doc/source/whatsnew/v1.3.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -834,6 +834,7 @@ Missing
- Bug in :func:`isna`, and :meth:`Series.isna`, :meth:`Index.isna`, :meth:`DataFrame.isna` (and the corresponding ``notna`` functions) not recognizing ``Decimal("NaN")`` objects (:issue:`39409`)
- Bug in :meth:`DataFrame.fillna` not accepting dictionary for ``downcast`` keyword (:issue:`40809`)
- Bug in :func:`isna` not returning a copy of the mask for nullable types, causing any subsequent mask modification to change the original array (:issue:`40935`)
- Bug in :class:`DataFrame` construction where float data containing ``NaN`` passed together with an integer ``dtype`` was cast to integer instead of retaining the ``NaN`` values (:issue:`26919`)

MultiIndex
^^^^^^^^^^
Expand Down
29 changes: 26 additions & 3 deletions pandas/core/dtypes/cast.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,7 @@
DtypeObj,
Scalar,
)
from pandas.errors import IntCastingNaNError
from pandas.util._exceptions import find_stack_level
from pandas.util._validators import validate_bool_kwarg

Expand Down Expand Up @@ -1167,9 +1168,7 @@ def astype_nansafe(
raise TypeError(f"cannot astype a timedelta from [{arr.dtype}] to [{dtype}]")

elif np.issubdtype(arr.dtype, np.floating) and np.issubdtype(dtype, np.integer):

if not np.isfinite(arr).all():
raise ValueError("Cannot convert non-finite values (NA or inf) to integer")
return astype_float_to_int_nansafe(arr, dtype, copy)

elif is_object_dtype(arr):

Expand Down Expand Up @@ -1207,6 +1206,19 @@ def astype_nansafe(
return arr.astype(dtype, copy=copy)


def astype_float_to_int_nansafe(
    values: np.ndarray, dtype: np.dtype, copy: bool
) -> np.ndarray:
    """
    Cast a float ndarray to an integer dtype, refusing non-finite input.

    Parameters
    ----------
    values : np.ndarray
        Array to convert; checked with ``np.isfinite`` before casting.
    dtype : np.dtype
        Target dtype handed to ``ndarray.astype``.
    copy : bool
        Forwarded unchanged to ``ndarray.astype``.

    Returns
    -------
    np.ndarray
        The result of ``values.astype(dtype, copy=copy)``.

    Raises
    ------
    IntCastingNaNError
        If any element of ``values`` is NaN or infinite, since such values
        have no meaningful integer representation.
    """
    # Only cast when every element is finite; an empty array trivially is.
    every_value_finite = np.isfinite(values).all()
    if every_value_finite:
        return values.astype(dtype, copy=copy)

    raise IntCastingNaNError("Cannot convert non-finite values (NA or inf) to integer")


def astype_array(values: ArrayLike, dtype: DtypeObj, copy: bool = False) -> ArrayLike:
"""
Cast array (ndarray or ExtensionArray) to the new dtype.
Expand Down Expand Up @@ -1946,6 +1958,17 @@ def construct_1d_ndarray_preserving_na(
):
# TODO(numpy#12550): special-case can be removed
subarr = construct_1d_object_array_from_listlike(list(values))
elif (
dtype is not None
and dtype.kind in ["i", "u"]
and isinstance(values, np.ndarray)
and values.dtype.kind == "f"
):
# Argument 2 to "astype_float_to_int_nansafe" has incompatible
# type "Union[dtype[Any], ExtensionDtype]"; expected "dtype[Any]"
return astype_float_to_int_nansafe(
values, dtype, copy=copy # type: ignore[arg-type]
)
else:
# error: Argument "dtype" to "array" has incompatible type
# "Union[dtype[Any], ExtensionDtype, None]"; expected "Union[dtype[Any],
Expand Down
10 changes: 6 additions & 4 deletions pandas/core/internals/construction.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@
DtypeObj,
Manager,
)
from pandas.errors import IntCastingNaNError

from pandas.core.dtypes.cast import (
construct_1d_arraylike_from_scalar,
Expand Down Expand Up @@ -315,10 +316,11 @@ def ndarray_to_mgr(
values = construct_1d_ndarray_preserving_na(
flat, dtype=dtype, copy=False
)
except Exception as err:
# e.g. ValueError when trying to cast object dtype to float64
msg = f"failed to cast to '{dtype}' (Exception was: {err})"
raise ValueError(msg) from err
except IntCastingNaNError:
# following Series, we ignore the dtype and retain floating
# values instead of casting nans to meaningless ints
pass

values = values.reshape(shape)

# _prep_ndarray ensures that values.ndim == 2 at this point
Expand Down
9 changes: 9 additions & 0 deletions pandas/errors/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,15 @@
)


class IntCastingNaNError(ValueError):
    """
    Exception raised when an ``astype`` to an integer dtype is attempted on an
    array that contains NaN.

    Subclasses ``ValueError``, so handlers written for ``ValueError`` also
    catch it.
    """


class NullFrequencyError(ValueError):
"""
Error raised when a null `freq` attribute is used in an operation
Expand Down
24 changes: 22 additions & 2 deletions pandas/tests/frame/test_constructors.py
Original file line number Diff line number Diff line change
Expand Up @@ -66,6 +66,18 @@


class TestDataFrameConstructors:
def test_construct_ndarray_with_nas_and_int_dtype(self):
    """
    A float ndarray holding NaN keeps its float dtype when an integer dtype
    is requested, and the first row matches the equivalent Series.
    """
    # GH#26919 match Series by not casting np.nan to meaningless int
    data = np.array([[1, np.nan], [2, 3]])
    frame = DataFrame(data, dtype="i8")

    # the requested "i8" is ignored: dtype is unchanged and the NaN survives
    assert frame.values.dtype == data.dtype
    assert isna(frame.iloc[0, 1])

    # check this matches Series behavior
    first_row = frame.iloc[0]
    from_series = Series(data[0], dtype="i8", name=0)
    tm.assert_series_equal(from_series, first_row)

def test_construct_from_list_of_datetimes(self):
df = DataFrame([datetime.now(), datetime.now()])
assert df[0].dtype == np.dtype("M8[ns]")
Expand Down Expand Up @@ -851,9 +863,17 @@ def _check_basic_constructor(self, empty):
assert len(frame.index) == 3
assert len(frame.columns) == 1

# cast type
frame = DataFrame(mat, columns=["A", "B", "C"], index=[1, 2], dtype=np.int64)
assert frame.values.dtype == np.int64
if empty is np.ones:
# passing dtype casts
assert frame.values.dtype == np.int64
else:
# i.e. ma.masked_all
# Since we have NaNs, refuse to cast to int dtype, which would take NaN
# to meaningless integers. This matches Series behavior. GH#26919
assert frame.isna().all().all()
assert frame.values.dtype == np.float64
assert isna(frame.values).all()

# wrong size axis labels
msg = r"Shape of passed values is \(2, 3\), indices imply \(1, 3\)"
Expand Down

0 comments on commit 6dff995

Please sign in to comment.