Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

BUG: DataFrame(floatdata, dtype=inty) does unsafe casting #41578

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions doc/source/whatsnew/v1.3.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -808,6 +808,7 @@ Missing
- Bug in :func:`isna`, and :meth:`Series.isna`, :meth:`Index.isna`, :meth:`DataFrame.isna` (and the corresponding ``notna`` functions) not recognizing ``Decimal("NaN")`` objects (:issue:`39409`)
- Bug in :meth:`DataFrame.fillna` not accepting dictionary for ``downcast`` keyword (:issue:`40809`)
- Bug in :func:`isna` not returning a copy of the mask for nullable types, causing any subsequent mask modification to change the original array (:issue:`40935`)
- Bug in :class:`DataFrame` construction with float data containing ``NaN`` and an integer ``dtype`` casting the ``NaN`` to a meaningless integer instead of retaining the floating values (:issue:`26919`)

MultiIndex
^^^^^^^^^^
Expand Down
29 changes: 26 additions & 3 deletions pandas/core/dtypes/cast.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,7 @@
DtypeObj,
Scalar,
)
from pandas.errors import IntCastingNaNError
from pandas.util._exceptions import find_stack_level
from pandas.util._validators import validate_bool_kwarg

Expand Down Expand Up @@ -1167,9 +1168,7 @@ def astype_nansafe(
raise TypeError(f"cannot astype a timedelta from [{arr.dtype}] to [{dtype}]")

elif np.issubdtype(arr.dtype, np.floating) and np.issubdtype(dtype, np.integer):

if not np.isfinite(arr).all():
raise ValueError("Cannot convert non-finite values (NA or inf) to integer")
return astype_float_to_int_nansafe(arr, dtype, copy)

elif is_object_dtype(arr):

Expand Down Expand Up @@ -1207,6 +1206,19 @@ def astype_nansafe(
return arr.astype(dtype, copy=copy)


def astype_float_to_int_nansafe(
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

this is already done in astype_nansafe and IIRC in Index constructor as well. can you consolidate.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

this is already done in astype_nansafe

yes, this changes the call in astype_nansafe to use this new function, so that we can use this lighter-weight func in construct_1d_ndarray_preserving_na

and IIRC in Index constructor as well. can you consolidate.

similar but not identical (we also do it a third way for IntegerArray). some combination of #40489 and #40110 i think will make sharing feasible

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

kk great. ok, happy to merge here and do follow-ups.

values: np.ndarray, dtype: np.dtype, copy: bool
) -> np.ndarray:
"""
astype with a check preventing conversion of non-finite values (NaN or inf) to a meaningless integer value.
"""
if not np.isfinite(values).all():
raise IntCastingNaNError(
"Cannot convert non-finite values (NA or inf) to integer"
)
return values.astype(dtype, copy=copy)


def astype_array(values: ArrayLike, dtype: DtypeObj, copy: bool = False) -> ArrayLike:
"""
Cast array (ndarray or ExtensionArray) to the new dtype.
Expand Down Expand Up @@ -1946,6 +1958,17 @@ def construct_1d_ndarray_preserving_na(
):
# TODO(numpy#12550): special-case can be removed
subarr = construct_1d_object_array_from_listlike(list(values))
elif (
dtype is not None
and dtype.kind in ["i", "u"]
and isinstance(values, np.ndarray)
and values.dtype.kind == "f"
):
# Argument 2 to "astype_float_to_int_nansafe" has incompatible
# type "Union[dtype[Any], ExtensionDtype]"; expected "dtype[Any]"
return astype_float_to_int_nansafe(
values, dtype, copy=copy # type: ignore[arg-type]
)
else:
# error: Argument "dtype" to "array" has incompatible type
# "Union[dtype[Any], ExtensionDtype, None]"; expected "Union[dtype[Any],
Expand Down
10 changes: 6 additions & 4 deletions pandas/core/internals/construction.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@
DtypeObj,
Manager,
)
from pandas.errors import IntCastingNaNError

from pandas.core.dtypes.cast import (
construct_1d_arraylike_from_scalar,
Expand Down Expand Up @@ -315,10 +316,11 @@ def ndarray_to_mgr(
values = construct_1d_ndarray_preserving_na(
flat, dtype=dtype, copy=False
)
except Exception as err:
# e.g. ValueError when trying to cast object dtype to float64
msg = f"failed to cast to '{dtype}' (Exception was: {err})"
raise ValueError(msg) from err
except IntCastingNaNError:
# following Series, we ignore the dtype and retain floating
# values instead of casting nans to meaningless ints
pass

values = values.reshape(shape)

# _prep_ndarray ensures that values.ndim == 2 at this point
Expand Down
9 changes: 9 additions & 0 deletions pandas/errors/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,15 @@
)


class IntCastingNaNError(ValueError):
    """
    Raised when attempting an astype operation on an array containing
    non-finite values (NaN or inf) to an integer dtype.

    Subclasses ``ValueError``, so handlers that catch ``ValueError`` also
    catch this exception.
    """

    pass


class NullFrequencyError(ValueError):
"""
Error raised when a null `freq` attribute is used in an operation
Expand Down
24 changes: 22 additions & 2 deletions pandas/tests/frame/test_constructors.py
Original file line number Diff line number Diff line change
Expand Up @@ -66,6 +66,18 @@


class TestDataFrameConstructors:
def test_construct_ndarray_with_nas_and_int_dtype(self):
    """
    Constructing a DataFrame from a float ndarray that contains NaN while
    requesting an integer dtype keeps the float values instead of casting.
    """
    # GH#26919 match Series by not casting np.nan to meaningless int
    arr = np.array([[1, np.nan], [2, 3]])
    df = DataFrame(arr, dtype="i8")
    # the requested "i8" is not applied: the result keeps the input's dtype
    assert df.values.dtype == arr.dtype
    assert isna(df.iloc[0, 1])

    # check this matches Series behavior
    ser = Series(arr[0], dtype="i8", name=0)
    expected = df.iloc[0]
    tm.assert_series_equal(ser, expected)

def test_construct_from_list_of_datetimes(self):
df = DataFrame([datetime.now(), datetime.now()])
assert df[0].dtype == np.dtype("M8[ns]")
Expand Down Expand Up @@ -851,9 +863,17 @@ def _check_basic_constructor(self, empty):
assert len(frame.index) == 3
assert len(frame.columns) == 1

# cast type
frame = DataFrame(mat, columns=["A", "B", "C"], index=[1, 2], dtype=np.int64)
assert frame.values.dtype == np.int64
if empty is np.ones:
# passing dtype casts
assert frame.values.dtype == np.int64
else:
# i.e. ma.masked_all
# Since we have NaNs, refuse to cast to int dtype, which would take NaN
# to meaningless integers. This matches Series behavior. GH#26919
assert frame.isna().all().all()
assert frame.values.dtype == np.float64
assert isna(frame.values).all()

# wrong size axis labels
msg = r"Shape of passed values is \(2, 3\), indices imply \(1, 3\)"
Expand Down