Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

DEPR: stop inferring dt64/td64 from strings in Series constructor #49319

Merged
merged 2 commits into from
Oct 26, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions doc/source/whatsnew/v2.0.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -232,6 +232,7 @@ Removal of prior version deprecations/changes
- Removed the ``display.column_space`` option in favor of ``df.to_string(col_space=...)`` (:issue:`47280`)
- Removed the deprecated method ``mad`` from pandas classes (:issue:`11787`)
- Removed the deprecated method ``tshift`` from pandas classes (:issue:`11631`)
- Changed the behavior of the :class:`Series` constructor: it will no longer infer a datetime64 or timedelta64 dtype from string entries (:issue:`41731`)
- Changed behavior of :class:`Index` constructor when passed a ``SparseArray`` or ``SparseDtype`` to retain that dtype instead of casting to ``numpy.ndarray`` (:issue:`43930`)

.. ---------------------------------------------------------------------------
Expand Down
2 changes: 1 addition & 1 deletion pandas/_libs/lib.pyi
Original file line number Diff line number Diff line change
Expand Up @@ -158,7 +158,7 @@ def ensure_string_array(
) -> npt.NDArray[np.object_]: ...
def infer_datetimelike_array(
arr: npt.NDArray[np.object_],
) -> tuple[str, bool]: ...
) -> str: ...
def convert_nans_to_NA(
arr: npt.NDArray[np.object_],
) -> npt.NDArray[np.object_]: ...
Expand Down
50 changes: 15 additions & 35 deletions pandas/_libs/lib.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -95,7 +95,6 @@ from pandas._libs.util cimport (
is_nan,
)

from pandas._libs.tslib import array_to_datetime
from pandas._libs.tslibs import (
OutOfBoundsDatetime,
OutOfBoundsTimedelta,
Expand Down Expand Up @@ -1583,25 +1582,19 @@ def infer_datetimelike_array(arr: ndarray[object]) -> tuple[str, bool]:
Returns
-------
str: {datetime, timedelta, date, nat, mixed}
bool
"""
cdef:
Py_ssize_t i, n = len(arr)
bint seen_timedelta = False, seen_date = False, seen_datetime = False
bint seen_tz_aware = False, seen_tz_naive = False
bint seen_nat = False, seen_str = False
bint seen_nat = False
bint seen_period = False, seen_interval = False
list objs = []
object v

for i in range(n):
v = arr[i]
if isinstance(v, str):
objs.append(v)
seen_str = True

if len(objs) == 3:
break
return "mixed"

elif v is None or util.is_nan(v):
# nan or None
Expand All @@ -1619,7 +1612,7 @@ def infer_datetimelike_array(arr: ndarray[object]) -> tuple[str, bool]:
seen_tz_aware = True

if seen_tz_naive and seen_tz_aware:
return "mixed", seen_str
return "mixed"
elif util.is_datetime64_object(v):
# np.datetime64
seen_datetime = True
Expand All @@ -1635,43 +1628,30 @@ def infer_datetimelike_array(arr: ndarray[object]) -> tuple[str, bool]:
seen_interval = True
break
else:
return "mixed", seen_str
return "mixed"

if seen_period:
if is_period_array(arr):
return "period", seen_str
return "mixed", seen_str
return "period"
return "mixed"

if seen_interval:
if is_interval_array(arr):
return "interval", seen_str
return "mixed", seen_str
return "interval"
return "mixed"

if seen_date and not (seen_datetime or seen_timedelta):
return "date", seen_str
return "date"
elif seen_datetime and not seen_timedelta:
return "datetime", seen_str
return "datetime"
elif seen_timedelta and not seen_datetime:
return "timedelta", seen_str
return "timedelta"
elif seen_datetime and seen_timedelta:
return "mixed"
elif seen_nat:
return "nat", seen_str
return "nat"

# short-circuit by trying to
# actually convert these strings
# this is for performance as we don't need to try
# convert *every* string array
if len(objs):
try:
# require_iso8601 as in maybe_infer_to_datetimelike
array_to_datetime(objs, errors="raise", require_iso8601=True)
return "datetime", seen_str
except (ValueError, TypeError):
pass

# we are *not* going to infer from strings
# for timedelta as too much ambiguity

return "mixed", seen_str
return "mixed"


cdef inline bint is_timedelta(object o):
Expand Down
15 changes: 4 additions & 11 deletions pandas/core/dtypes/cast.py
Original file line number Diff line number Diff line change
Expand Up @@ -1264,7 +1264,9 @@ def try_timedelta(v: np.ndarray) -> np.ndarray:
else:
return td_values.reshape(shape)

inferred_type, seen_str = lib.infer_datetimelike_array(ensure_object(v))
# TODO: can we just do lib.maybe_convert_objects for this entire function?
inferred_type = lib.infer_datetimelike_array(ensure_object(v))

if inferred_type in ["period", "interval"]:
# Incompatible return value type (got "Union[ExtensionArray, ndarray]",
# expected "Union[ndarray, DatetimeArray, TimedeltaArray, PeriodArray,
Expand All @@ -1280,14 +1282,14 @@ def try_timedelta(v: np.ndarray) -> np.ndarray:
elif inferred_type == "timedelta":
value = try_timedelta(v)
elif inferred_type == "nat":
# only reached if we have at least 1 NaT and the rest (NaT or None or np.nan)

# if all NaT, return as datetime
if isna(v).all():
# error: Incompatible types in assignment (expression has type
# "ExtensionArray", variable has type "Union[ndarray, List[Any]]")
value = try_datetime(v) # type: ignore[assignment]
else:

# We have at least a NaT and a string
# try timedelta first to avoid spurious datetime conversions
# e.g. '00:00:01' is a timedelta but technically is also a datetime
Expand All @@ -1300,15 +1302,6 @@ def try_timedelta(v: np.ndarray) -> np.ndarray:
# "ExtensionArray", variable has type "Union[ndarray, List[Any]]")
value = try_datetime(v) # type: ignore[assignment]

if value.dtype.kind in ["m", "M"] and seen_str:
# TODO(2.0): enforcing this deprecation should close GH#40111
warnings.warn(
f"Inferring {value.dtype} from data containing strings is deprecated "
"and will be removed in a future version. To retain the old behavior "
f"explicitly pass Series(data, dtype={value.dtype})",
FutureWarning,
stacklevel=find_stack_level(),
)
return value


Expand Down
2 changes: 1 addition & 1 deletion pandas/io/parsers/base_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -775,7 +775,7 @@ def _infer_types(
result = BooleanArray(result, bool_mask)
elif result.dtype == np.object_ and use_nullable_dtypes:
# read_excel sends array of datetime objects
inferred_type, _ = lib.infer_datetimelike_array(result)
inferred_type = lib.infer_datetimelike_array(result)
if inferred_type != "datetime":
result = StringDtype().construct_array_type()._from_sequence(values)

Expand Down
3 changes: 1 addition & 2 deletions pandas/tests/apply/test_series_apply.py
Original file line number Diff line number Diff line change
Expand Up @@ -859,8 +859,7 @@ def test_apply_to_timedelta():
list_of_strings = ["00:00:01", np.nan, pd.NaT, pd.NaT]

a = pd.to_timedelta(list_of_strings)
with tm.assert_produces_warning(FutureWarning, match="Inferring timedelta64"):
ser = Series(list_of_strings)
ser = Series(list_of_strings)
b = ser.apply(pd.to_timedelta)
tm.assert_series_equal(Series(a), b)

Expand Down
10 changes: 5 additions & 5 deletions pandas/tests/dtypes/test_inference.py
Original file line number Diff line number Diff line change
Expand Up @@ -1346,7 +1346,7 @@ def test_infer_dtype_period_with_na(self, na_value):
],
)
def test_infer_datetimelike_array_datetime(self, data):
assert lib.infer_datetimelike_array(data) == ("datetime", False)
assert lib.infer_datetimelike_array(data) == "datetime"

@pytest.mark.parametrize(
"data",
Expand All @@ -1358,11 +1358,11 @@ def test_infer_datetimelike_array_datetime(self, data):
],
)
def test_infer_datetimelike_array_timedelta(self, data):
assert lib.infer_datetimelike_array(data) == ("timedelta", False)
assert lib.infer_datetimelike_array(data) == "timedelta"

def test_infer_datetimelike_array_date(self):
arr = [date(2017, 6, 12), date(2017, 3, 11)]
assert lib.infer_datetimelike_array(arr) == ("date", False)
assert lib.infer_datetimelike_array(arr) == "date"

@pytest.mark.parametrize(
"data",
Expand All @@ -1377,7 +1377,7 @@ def test_infer_datetimelike_array_date(self):
],
)
def test_infer_datetimelike_array_mixed(self, data):
assert lib.infer_datetimelike_array(data)[0] == "mixed"
assert lib.infer_datetimelike_array(data) == "mixed"

@pytest.mark.parametrize(
"first, expected",
Expand All @@ -1395,7 +1395,7 @@ def test_infer_datetimelike_array_mixed(self, data):
@pytest.mark.parametrize("second", [None, np.nan])
def test_infer_datetimelike_array_nan_nat_like(self, first, second, expected):
first.append(second)
assert lib.infer_datetimelike_array(first) == (expected, False)
assert lib.infer_datetimelike_array(first) == expected

def test_infer_dtype_all_nan_nat_like(self):
arr = np.array([np.nan, np.nan])
Expand Down
44 changes: 21 additions & 23 deletions pandas/tests/resample/test_time_grouper.py
Original file line number Diff line number Diff line change
Expand Up @@ -321,29 +321,27 @@ def test_groupby_resample_interpolate():
.interpolate(method="linear")
)

msg = "containing strings is deprecated"
with tm.assert_produces_warning(FutureWarning, match=msg):
expected_ind = pd.MultiIndex.from_tuples(
[
(50, "2018-01-07"),
(50, Timestamp("2018-01-08")),
(50, Timestamp("2018-01-09")),
(50, Timestamp("2018-01-10")),
(50, Timestamp("2018-01-11")),
(50, Timestamp("2018-01-12")),
(50, Timestamp("2018-01-13")),
(50, Timestamp("2018-01-14")),
(50, Timestamp("2018-01-15")),
(50, Timestamp("2018-01-16")),
(50, Timestamp("2018-01-17")),
(50, Timestamp("2018-01-18")),
(50, Timestamp("2018-01-19")),
(50, Timestamp("2018-01-20")),
(50, Timestamp("2018-01-21")),
(60, Timestamp("2018-01-14")),
],
names=["volume", "week_starting"],
)
expected_ind = pd.MultiIndex.from_tuples(
[
(50, Timestamp("2018-01-07")),
(50, Timestamp("2018-01-08")),
(50, Timestamp("2018-01-09")),
(50, Timestamp("2018-01-10")),
(50, Timestamp("2018-01-11")),
(50, Timestamp("2018-01-12")),
(50, Timestamp("2018-01-13")),
(50, Timestamp("2018-01-14")),
(50, Timestamp("2018-01-15")),
(50, Timestamp("2018-01-16")),
(50, Timestamp("2018-01-17")),
(50, Timestamp("2018-01-18")),
(50, Timestamp("2018-01-19")),
(50, Timestamp("2018-01-20")),
(50, Timestamp("2018-01-21")),
(60, Timestamp("2018-01-14")),
],
names=["volume", "week_starting"],
)

expected = DataFrame(
data={
Expand Down
4 changes: 1 addition & 3 deletions pandas/tests/series/methods/test_combine_first.py
Original file line number Diff line number Diff line change
Expand Up @@ -79,9 +79,7 @@ def test_combine_first_dt64(self):
s1 = Series([np.NaN, "2011"])
rs = s0.combine_first(s1)

msg = "containing strings is deprecated"
with tm.assert_produces_warning(FutureWarning, match=msg):
xp = Series([datetime(2010, 1, 1), "2011"])
xp = Series([datetime(2010, 1, 1), "2011"], dtype="datetime64[ns]")

tm.assert_series_equal(rs, xp)

Expand Down
5 changes: 1 addition & 4 deletions pandas/tests/series/methods/test_fillna.py
Original file line number Diff line number Diff line change
Expand Up @@ -365,10 +365,7 @@ def test_datetime64_fillna(self):
def test_datetime64_fillna_backfill(self):
# GH#6587
# make sure that we are treating as integer when filling
msg = "containing strings is deprecated"
with tm.assert_produces_warning(FutureWarning, match=msg):
# this also tests inference of a datetime-like with NaT's
ser = Series([NaT, NaT, "2013-08-05 15:30:00.000001"])
ser = Series([NaT, NaT, "2013-08-05 15:30:00.000001"], dtype="M8[ns]")

expected = Series(
[
Expand Down
48 changes: 20 additions & 28 deletions pandas/tests/series/test_constructors.py
Original file line number Diff line number Diff line change
Expand Up @@ -1018,24 +1018,20 @@ def test_constructor_dtype_datetime64_7(self):
assert series1.dtype == object

def test_constructor_dtype_datetime64_6(self):
# these will correctly infer a datetime
msg = "containing strings is deprecated"
# as of 2.0, these no longer infer datetime64 based on the strings,
# matching the Index behavior

with tm.assert_produces_warning(FutureWarning, match=msg):
ser = Series([None, NaT, "2013-08-05 15:30:00.000001"])
assert ser.dtype == "datetime64[ns]"
ser = Series([None, NaT, "2013-08-05 15:30:00.000001"])
assert ser.dtype == object

with tm.assert_produces_warning(FutureWarning, match=msg):
ser = Series([np.nan, NaT, "2013-08-05 15:30:00.000001"])
assert ser.dtype == "datetime64[ns]"
ser = Series([np.nan, NaT, "2013-08-05 15:30:00.000001"])
assert ser.dtype == object

with tm.assert_produces_warning(FutureWarning, match=msg):
ser = Series([NaT, None, "2013-08-05 15:30:00.000001"])
assert ser.dtype == "datetime64[ns]"
ser = Series([NaT, None, "2013-08-05 15:30:00.000001"])
assert ser.dtype == object

with tm.assert_produces_warning(FutureWarning, match=msg):
ser = Series([NaT, np.nan, "2013-08-05 15:30:00.000001"])
assert ser.dtype == "datetime64[ns]"
ser = Series([NaT, np.nan, "2013-08-05 15:30:00.000001"])
assert ser.dtype == object

def test_constructor_dtype_datetime64_5(self):
# tz-aware (UTC and other tz's)
Expand Down Expand Up @@ -1517,23 +1513,19 @@ def test_constructor_dtype_timedelta64(self):
td = Series([timedelta(days=i) for i in range(3)] + ["foo"])
assert td.dtype == "object"

# these will correctly infer a timedelta
msg = "containing strings is deprecated"
with tm.assert_produces_warning(FutureWarning, match=msg):
ser = Series([None, NaT, "1 Day"])
assert ser.dtype == "timedelta64[ns]"
# as of 2.0, these no longer infer timedelta64 based on the strings,
# matching Index behavior
ser = Series([None, NaT, "1 Day"])
assert ser.dtype == object

with tm.assert_produces_warning(FutureWarning, match=msg):
ser = Series([np.nan, NaT, "1 Day"])
assert ser.dtype == "timedelta64[ns]"
ser = Series([np.nan, NaT, "1 Day"])
assert ser.dtype == object

with tm.assert_produces_warning(FutureWarning, match=msg):
ser = Series([NaT, None, "1 Day"])
assert ser.dtype == "timedelta64[ns]"
ser = Series([NaT, None, "1 Day"])
assert ser.dtype == object

with tm.assert_produces_warning(FutureWarning, match=msg):
ser = Series([NaT, np.nan, "1 Day"])
assert ser.dtype == "timedelta64[ns]"
ser = Series([NaT, np.nan, "1 Day"])
assert ser.dtype == object

# GH 16406
def test_constructor_mixed_tz(self):
Expand Down
4 changes: 1 addition & 3 deletions pandas/tests/tools/test_to_timedelta.py
Original file line number Diff line number Diff line change
Expand Up @@ -207,9 +207,7 @@ def test_to_timedelta_on_missing_values(self):
)
tm.assert_series_equal(actual, expected)

with tm.assert_produces_warning(FutureWarning, match="Inferring timedelta64"):
ser = Series(["00:00:01", pd.NaT])
assert ser.dtype == "m8[ns]"
ser = Series(["00:00:01", pd.NaT], dtype="m8[ns]")
actual = to_timedelta(ser)
tm.assert_series_equal(actual, expected)

Expand Down