From 4cc2220d5551609632823e7ac66448843c6d5f83 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Wed, 26 Oct 2022 10:00:02 -0700 Subject: [PATCH] DEPR: stop inferring dt64/td64 from strings in Series constructor (#49319) * DEPR: stop inferring dt64/td64 from strings in Series constructor * update pyi --- doc/source/whatsnew/v2.0.0.rst | 1 + pandas/_libs/lib.pyi | 2 +- pandas/_libs/lib.pyx | 50 ++++++------------- pandas/core/dtypes/cast.py | 15 ++---- pandas/io/parsers/base_parser.py | 2 +- pandas/tests/apply/test_series_apply.py | 3 +- pandas/tests/dtypes/test_inference.py | 10 ++-- pandas/tests/resample/test_time_grouper.py | 44 ++++++++-------- .../series/methods/test_combine_first.py | 4 +- pandas/tests/series/methods/test_fillna.py | 5 +- pandas/tests/series/test_constructors.py | 48 ++++++++---------- pandas/tests/tools/test_to_timedelta.py | 4 +- 12 files changed, 72 insertions(+), 116 deletions(-) diff --git a/doc/source/whatsnew/v2.0.0.rst b/doc/source/whatsnew/v2.0.0.rst index c068b2733cc7a..3cdeff9ad4ca0 100644 --- a/doc/source/whatsnew/v2.0.0.rst +++ b/doc/source/whatsnew/v2.0.0.rst @@ -238,6 +238,7 @@ Removal of prior version deprecations/changes - Removed the ``display.column_space`` option in favor of ``df.to_string(col_space=...)`` (:issue:`47280`) - Removed the deprecated method ``mad`` from pandas classes (:issue:`11787`) - Removed the deprecated method ``tshift`` from pandas classes (:issue:`11631`) +- Changed the behavior of :class:`Series` constructor, it will no longer infer a datetime64 or timedelta64 dtype from string entries (:issue:`41731`) - Changed behavior of :class:`Index` constructor when passed a ``SparseArray`` or ``SparseDtype`` to retain that dtype instead of casting to ``numpy.ndarray`` (:issue:`43930`) .. 
--------------------------------------------------------------------------- diff --git a/pandas/_libs/lib.pyi b/pandas/_libs/lib.pyi index 244d1dbe4730e..6d7f895f7f730 100644 --- a/pandas/_libs/lib.pyi +++ b/pandas/_libs/lib.pyi @@ -158,7 +158,7 @@ def ensure_string_array( ) -> npt.NDArray[np.object_]: ... def infer_datetimelike_array( arr: npt.NDArray[np.object_], -) -> tuple[str, bool]: ... +) -> str: ... def convert_nans_to_NA( arr: npt.NDArray[np.object_], ) -> npt.NDArray[np.object_]: ... diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index 4151ba927adf0..188b531b2b469 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -95,7 +95,6 @@ from pandas._libs.util cimport ( is_nan, ) -from pandas._libs.tslib import array_to_datetime from pandas._libs.tslibs import ( OutOfBoundsDatetime, OutOfBoundsTimedelta, @@ -1583,25 +1582,19 @@ def infer_datetimelike_array(arr: ndarray[object]) -> tuple[str, bool]: Returns ------- str: {datetime, timedelta, date, nat, mixed} - bool """ cdef: Py_ssize_t i, n = len(arr) bint seen_timedelta = False, seen_date = False, seen_datetime = False bint seen_tz_aware = False, seen_tz_naive = False - bint seen_nat = False, seen_str = False + bint seen_nat = False bint seen_period = False, seen_interval = False - list objs = [] object v for i in range(n): v = arr[i] if isinstance(v, str): - objs.append(v) - seen_str = True - - if len(objs) == 3: - break + return "mixed" elif v is None or util.is_nan(v): # nan or None @@ -1619,7 +1612,7 @@ def infer_datetimelike_array(arr: ndarray[object]) -> tuple[str, bool]: seen_tz_aware = True if seen_tz_naive and seen_tz_aware: - return "mixed", seen_str + return "mixed" elif util.is_datetime64_object(v): # np.datetime64 seen_datetime = True @@ -1635,43 +1628,30 @@ def infer_datetimelike_array(arr: ndarray[object]) -> tuple[str, bool]: seen_interval = True break else: - return "mixed", seen_str + return "mixed" if seen_period: if is_period_array(arr): - return "period", seen_str - 
return "mixed", seen_str + return "period" + return "mixed" if seen_interval: if is_interval_array(arr): - return "interval", seen_str - return "mixed", seen_str + return "interval" + return "mixed" if seen_date and not (seen_datetime or seen_timedelta): - return "date", seen_str + return "date" elif seen_datetime and not seen_timedelta: - return "datetime", seen_str + return "datetime" elif seen_timedelta and not seen_datetime: - return "timedelta", seen_str + return "timedelta" + elif seen_datetime and seen_timedelta: + return "mixed" elif seen_nat: - return "nat", seen_str + return "nat" - # short-circuit by trying to - # actually convert these strings - # this is for performance as we don't need to try - # convert *every* string array - if len(objs): - try: - # require_iso8601 as in maybe_infer_to_datetimelike - array_to_datetime(objs, errors="raise", require_iso8601=True) - return "datetime", seen_str - except (ValueError, TypeError): - pass - - # we are *not* going to infer from strings - # for timedelta as too much ambiguity - - return "mixed", seen_str + return "mixed" cdef inline bint is_timedelta(object o): diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py index ec313f91d2721..cabd85aed1bbe 100644 --- a/pandas/core/dtypes/cast.py +++ b/pandas/core/dtypes/cast.py @@ -1264,7 +1264,9 @@ def try_timedelta(v: np.ndarray) -> np.ndarray: else: return td_values.reshape(shape) - inferred_type, seen_str = lib.infer_datetimelike_array(ensure_object(v)) + # TODO: can we just do lib.maybe_convert_objects for this entire function? 
+ inferred_type = lib.infer_datetimelike_array(ensure_object(v)) + if inferred_type in ["period", "interval"]: # Incompatible return value type (got "Union[ExtensionArray, ndarray]", # expected "Union[ndarray, DatetimeArray, TimedeltaArray, PeriodArray, @@ -1280,6 +1282,7 @@ def try_timedelta(v: np.ndarray) -> np.ndarray: elif inferred_type == "timedelta": value = try_timedelta(v) elif inferred_type == "nat": + # only reached if we have at least 1 NaT and the rest (NaT or None or np.nan) # if all NaT, return as datetime if isna(v).all(): @@ -1287,7 +1290,6 @@ def try_timedelta(v: np.ndarray) -> np.ndarray: # "ExtensionArray", variable has type "Union[ndarray, List[Any]]") value = try_datetime(v) # type: ignore[assignment] else: - # We have at least a NaT and a string # try timedelta first to avoid spurious datetime conversions # e.g. '00:00:01' is a timedelta but technically is also a datetime @@ -1300,15 +1302,6 @@ def try_timedelta(v: np.ndarray) -> np.ndarray: # "ExtensionArray", variable has type "Union[ndarray, List[Any]]") value = try_datetime(v) # type: ignore[assignment] - if value.dtype.kind in ["m", "M"] and seen_str: - # TODO(2.0): enforcing this deprecation should close GH#40111 - warnings.warn( - f"Inferring {value.dtype} from data containing strings is deprecated " - "and will be removed in a future version. 
To retain the old behavior " - f"explicitly pass Series(data, dtype={value.dtype})", - FutureWarning, - stacklevel=find_stack_level(), - ) return value diff --git a/pandas/io/parsers/base_parser.py b/pandas/io/parsers/base_parser.py index 44773f13276c0..5b54ac56d48c8 100644 --- a/pandas/io/parsers/base_parser.py +++ b/pandas/io/parsers/base_parser.py @@ -775,7 +775,7 @@ def _infer_types( result = BooleanArray(result, bool_mask) elif result.dtype == np.object_ and use_nullable_dtypes: # read_excel sends array of datetime objects - inferred_type, _ = lib.infer_datetimelike_array(result) + inferred_type = lib.infer_datetimelike_array(result) if inferred_type != "datetime": result = StringDtype().construct_array_type()._from_sequence(values) diff --git a/pandas/tests/apply/test_series_apply.py b/pandas/tests/apply/test_series_apply.py index 5221c41ce35d5..b67af8c521090 100644 --- a/pandas/tests/apply/test_series_apply.py +++ b/pandas/tests/apply/test_series_apply.py @@ -859,8 +859,7 @@ def test_apply_to_timedelta(): list_of_strings = ["00:00:01", np.nan, pd.NaT, pd.NaT] a = pd.to_timedelta(list_of_strings) - with tm.assert_produces_warning(FutureWarning, match="Inferring timedelta64"): - ser = Series(list_of_strings) + ser = Series(list_of_strings) b = ser.apply(pd.to_timedelta) tm.assert_series_equal(Series(a), b) diff --git a/pandas/tests/dtypes/test_inference.py b/pandas/tests/dtypes/test_inference.py index 56c97ac7a4dc5..e1d16fed73a88 100644 --- a/pandas/tests/dtypes/test_inference.py +++ b/pandas/tests/dtypes/test_inference.py @@ -1346,7 +1346,7 @@ def test_infer_dtype_period_with_na(self, na_value): ], ) def test_infer_datetimelike_array_datetime(self, data): - assert lib.infer_datetimelike_array(data) == ("datetime", False) + assert lib.infer_datetimelike_array(data) == "datetime" @pytest.mark.parametrize( "data", @@ -1358,11 +1358,11 @@ def test_infer_datetimelike_array_datetime(self, data): ], ) def test_infer_datetimelike_array_timedelta(self, data): - assert 
lib.infer_datetimelike_array(data) == ("timedelta", False) + assert lib.infer_datetimelike_array(data) == "timedelta" def test_infer_datetimelike_array_date(self): arr = [date(2017, 6, 12), date(2017, 3, 11)] - assert lib.infer_datetimelike_array(arr) == ("date", False) + assert lib.infer_datetimelike_array(arr) == "date" @pytest.mark.parametrize( "data", @@ -1377,7 +1377,7 @@ def test_infer_datetimelike_array_date(self): ], ) def test_infer_datetimelike_array_mixed(self, data): - assert lib.infer_datetimelike_array(data)[0] == "mixed" + assert lib.infer_datetimelike_array(data) == "mixed" @pytest.mark.parametrize( "first, expected", @@ -1395,7 +1395,7 @@ def test_infer_datetimelike_array_mixed(self, data): @pytest.mark.parametrize("second", [None, np.nan]) def test_infer_datetimelike_array_nan_nat_like(self, first, second, expected): first.append(second) - assert lib.infer_datetimelike_array(first) == (expected, False) + assert lib.infer_datetimelike_array(first) == expected def test_infer_dtype_all_nan_nat_like(self): arr = np.array([np.nan, np.nan]) diff --git a/pandas/tests/resample/test_time_grouper.py b/pandas/tests/resample/test_time_grouper.py index 4498f11d77313..7ec3c81de235c 100644 --- a/pandas/tests/resample/test_time_grouper.py +++ b/pandas/tests/resample/test_time_grouper.py @@ -321,29 +321,27 @@ def test_groupby_resample_interpolate(): .interpolate(method="linear") ) - msg = "containing strings is deprecated" - with tm.assert_produces_warning(FutureWarning, match=msg): - expected_ind = pd.MultiIndex.from_tuples( - [ - (50, "2018-01-07"), - (50, Timestamp("2018-01-08")), - (50, Timestamp("2018-01-09")), - (50, Timestamp("2018-01-10")), - (50, Timestamp("2018-01-11")), - (50, Timestamp("2018-01-12")), - (50, Timestamp("2018-01-13")), - (50, Timestamp("2018-01-14")), - (50, Timestamp("2018-01-15")), - (50, Timestamp("2018-01-16")), - (50, Timestamp("2018-01-17")), - (50, Timestamp("2018-01-18")), - (50, Timestamp("2018-01-19")), - (50, 
Timestamp("2018-01-20")), - (50, Timestamp("2018-01-21")), - (60, Timestamp("2018-01-14")), - ], - names=["volume", "week_starting"], - ) + expected_ind = pd.MultiIndex.from_tuples( + [ + (50, Timestamp("2018-01-07")), + (50, Timestamp("2018-01-08")), + (50, Timestamp("2018-01-09")), + (50, Timestamp("2018-01-10")), + (50, Timestamp("2018-01-11")), + (50, Timestamp("2018-01-12")), + (50, Timestamp("2018-01-13")), + (50, Timestamp("2018-01-14")), + (50, Timestamp("2018-01-15")), + (50, Timestamp("2018-01-16")), + (50, Timestamp("2018-01-17")), + (50, Timestamp("2018-01-18")), + (50, Timestamp("2018-01-19")), + (50, Timestamp("2018-01-20")), + (50, Timestamp("2018-01-21")), + (60, Timestamp("2018-01-14")), + ], + names=["volume", "week_starting"], + ) expected = DataFrame( data={ diff --git a/pandas/tests/series/methods/test_combine_first.py b/pandas/tests/series/methods/test_combine_first.py index b838797b5f9b9..1d104b12ce7d2 100644 --- a/pandas/tests/series/methods/test_combine_first.py +++ b/pandas/tests/series/methods/test_combine_first.py @@ -79,9 +79,7 @@ def test_combine_first_dt64(self): s1 = Series([np.NaN, "2011"]) rs = s0.combine_first(s1) - msg = "containing strings is deprecated" - with tm.assert_produces_warning(FutureWarning, match=msg): - xp = Series([datetime(2010, 1, 1), "2011"]) + xp = Series([datetime(2010, 1, 1), "2011"], dtype="datetime64[ns]") tm.assert_series_equal(rs, xp) diff --git a/pandas/tests/series/methods/test_fillna.py b/pandas/tests/series/methods/test_fillna.py index 26416c7a2b483..409a3b231fa95 100644 --- a/pandas/tests/series/methods/test_fillna.py +++ b/pandas/tests/series/methods/test_fillna.py @@ -365,10 +365,7 @@ def test_datetime64_fillna(self): def test_datetime64_fillna_backfill(self): # GH#6587 # make sure that we are treating as integer when filling - msg = "containing strings is deprecated" - with tm.assert_produces_warning(FutureWarning, match=msg): - # this also tests inference of a datetime-like with NaT's - ser = 
Series([NaT, NaT, "2013-08-05 15:30:00.000001"]) + ser = Series([NaT, NaT, "2013-08-05 15:30:00.000001"], dtype="M8[ns]") expected = Series( [ diff --git a/pandas/tests/series/test_constructors.py b/pandas/tests/series/test_constructors.py index 35ebd152f447c..eccf6c9c92ea1 100644 --- a/pandas/tests/series/test_constructors.py +++ b/pandas/tests/series/test_constructors.py @@ -1018,24 +1018,20 @@ def test_constructor_dtype_datetime64_7(self): assert series1.dtype == object def test_constructor_dtype_datetime64_6(self): - # these will correctly infer a datetime - msg = "containing strings is deprecated" + # as of 2.0, these no longer infer datetime64 based on the strings, + # matching the Index behavior - with tm.assert_produces_warning(FutureWarning, match=msg): - ser = Series([None, NaT, "2013-08-05 15:30:00.000001"]) - assert ser.dtype == "datetime64[ns]" + ser = Series([None, NaT, "2013-08-05 15:30:00.000001"]) + assert ser.dtype == object - with tm.assert_produces_warning(FutureWarning, match=msg): - ser = Series([np.nan, NaT, "2013-08-05 15:30:00.000001"]) - assert ser.dtype == "datetime64[ns]" + ser = Series([np.nan, NaT, "2013-08-05 15:30:00.000001"]) + assert ser.dtype == object - with tm.assert_produces_warning(FutureWarning, match=msg): - ser = Series([NaT, None, "2013-08-05 15:30:00.000001"]) - assert ser.dtype == "datetime64[ns]" + ser = Series([NaT, None, "2013-08-05 15:30:00.000001"]) + assert ser.dtype == object - with tm.assert_produces_warning(FutureWarning, match=msg): - ser = Series([NaT, np.nan, "2013-08-05 15:30:00.000001"]) - assert ser.dtype == "datetime64[ns]" + ser = Series([NaT, np.nan, "2013-08-05 15:30:00.000001"]) + assert ser.dtype == object def test_constructor_dtype_datetime64_5(self): # tz-aware (UTC and other tz's) @@ -1517,23 +1513,19 @@ def test_constructor_dtype_timedelta64(self): td = Series([timedelta(days=i) for i in range(3)] + ["foo"]) assert td.dtype == "object" - # these will correctly infer a timedelta - msg = 
"containing strings is deprecated" - with tm.assert_produces_warning(FutureWarning, match=msg): - ser = Series([None, NaT, "1 Day"]) - assert ser.dtype == "timedelta64[ns]" + # as of 2.0, these no longer infer timedelta64 based on the strings, + # matching Index behavior + ser = Series([None, NaT, "1 Day"]) + assert ser.dtype == object - with tm.assert_produces_warning(FutureWarning, match=msg): - ser = Series([np.nan, NaT, "1 Day"]) - assert ser.dtype == "timedelta64[ns]" + ser = Series([np.nan, NaT, "1 Day"]) + assert ser.dtype == object - with tm.assert_produces_warning(FutureWarning, match=msg): - ser = Series([NaT, None, "1 Day"]) - assert ser.dtype == "timedelta64[ns]" + ser = Series([NaT, None, "1 Day"]) + assert ser.dtype == object - with tm.assert_produces_warning(FutureWarning, match=msg): - ser = Series([NaT, np.nan, "1 Day"]) - assert ser.dtype == "timedelta64[ns]" + ser = Series([NaT, np.nan, "1 Day"]) + assert ser.dtype == object # GH 16406 def test_constructor_mixed_tz(self): diff --git a/pandas/tests/tools/test_to_timedelta.py b/pandas/tests/tools/test_to_timedelta.py index fd808328ef386..60d54a48965df 100644 --- a/pandas/tests/tools/test_to_timedelta.py +++ b/pandas/tests/tools/test_to_timedelta.py @@ -207,9 +207,7 @@ def test_to_timedelta_on_missing_values(self): ) tm.assert_series_equal(actual, expected) - with tm.assert_produces_warning(FutureWarning, match="Inferring timedelta64"): - ser = Series(["00:00:01", pd.NaT]) - assert ser.dtype == "m8[ns]" + ser = Series(["00:00:01", pd.NaT], dtype="m8[ns]") actual = to_timedelta(ser) tm.assert_series_equal(actual, expected)