From 2ea036f0491417786a662c320e504d7eb9aba83c Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Fri, 31 May 2024 11:40:04 -0700 Subject: [PATCH] ENH/WIP: resolution inference in pd.to_datetime, DatetimeIndex (#55901) * ENH: read_stata return non-nano * GH ref * move whatsnew * remove outdated whatsnew * ENH: read_stata return non-nano * avoid Series.view * dont go through Series * TST: dt64 units * BUG: cut with non-nano * BUG: round with non-nanosecond raising OverflowError * woops * BUG: cut with non-nano * TST: parametrize tests over dt64 unit * xfail non-nano * revert * BUG: mixed-type mixed-timezone/awareness * commit so i can unstash something else i hope * ENH: infer resolution in to_datetime, DatetimeIndex * revert commented-out * revert commented-out * revert commented-out * remove commented-out * remove comment * revert unnecessary * revert unnecessary * fix window tests * Fix resample tests * restore comment * revert unnecessary * remove no-longer necessary * revert no-longer-necessary * revert no-longer-necessary * update tests * revert no-longer-necessary * update tests * revert bits * update tests * cleanup * revert * revert * parametrize over unit * update tests * update tests * revert no-longer-needed * revert no-longer-necessary * revert no-longer-necessary * revert no-longer-necessary * revert no-longer-necessary * Revert no-longer-necessary * update test * update test * simplify * update tests * update tests * update tests * revert no-longer-necessary * post-merge fixup * revert no-longer-necessary * update tests * update test * update tests * update tests * remove commented-out * revert no-longer-necessary * as_unit->astype * cleanup * merge fixup * revert bit * revert no-longer-necessary, xfail * update multithread test * update tests * update doctest * update tests * update doctests * update tests * update db tests * troubleshoot db tests * update test * troubleshoot sql tests * update test * update tests * mypy fixup * Update test * kludge test * update test * update for min-version tests * fix adbc check * troubleshoot minimum version deps * troubleshoot * troubleshoot * troubleshoot * whatsnew * update abdc-driver-postgresql minimum version * update doctest * fix doc example * troubleshoot test_api_custom_dateparsing_error * troubleshoot * troubleshoot * troubleshoot * troubleshoot * troubleshoot * troubleshoot * update exp instead of object cast * revert accidental * simplify test --- doc/source/whatsnew/v3.0.0.rst | 63 ++++++ pandas/_libs/lib.pyx | 10 +- pandas/_libs/tslib.pyx | 17 +- pandas/_libs/tslibs/strptime.pyx | 6 +- pandas/core/algorithms.py | 8 +- pandas/core/arrays/datetimelike.py | 12 +- pandas/core/arrays/datetimes.py | 35 ++-- pandas/core/base.py | 2 +- pandas/core/dtypes/cast.py | 2 +- pandas/core/dtypes/dtypes.py | 2 +- pandas/core/dtypes/missing.py | 4 +- pandas/core/frame.py | 2 +- pandas/core/generic.py | 10 +- pandas/core/groupby/generic.py | 4 +- pandas/core/indexes/base.py | 2 +- pandas/core/indexes/datetimes.py | 5 +- pandas/core/series.py | 5 +- pandas/core/tools/datetimes.py | 16 +- pandas/tests/arithmetic/test_period.py | 7 +- .../tests/arrays/categorical/test_missing.py | 2 +- pandas/tests/arrays/test_array.py | 10 +- pandas/tests/base/test_constructors.py | 6 +- pandas/tests/base/test_conversion.py | 4 +- pandas/tests/dtypes/test_inference.py | 6 +- pandas/tests/extension/test_arrow.py | 2 +- .../frame/constructors/test_from_records.py | 2 +- pandas/tests/frame/indexing/test_setitem.py | 3 +- .../tests/frame/methods/test_combine_first.py | 25 ++- .../tests/frame/methods/test_infer_objects.py | 2 +- pandas/tests/frame/methods/test_map.py | 7 +- pandas/tests/frame/methods/test_to_csv.py | 41 +++- pandas/tests/frame/test_constructors.py | 61 ++++-- pandas/tests/groupby/test_apply.py | 4 +- pandas/tests/groupby/test_groupby.py | 2 +- .../indexes/datetimes/test_constructors.py | 62 +++--- .../indexes/datetimes/test_date_range.py | 4 +- pandas/tests/indexes/test_base.py | 2 +- pandas/tests/indexes/test_index_new.py | 11 +- pandas/tests/indexing/test_coercion.py | 20 +- pandas/tests/indexing/test_loc.py | 7 +- pandas/tests/indexing/test_partial.py | 2 +- pandas/tests/interchange/test_impl.py | 3 +- pandas/tests/io/excel/test_readers.py | 9 +- pandas/tests/io/excel/test_writers.py | 16 +- pandas/tests/io/json/test_pandas.py | 17 +- .../io/parser/common/test_common_basic.py | 9 +- pandas/tests/io/parser/common/test_index.py | 3 +- pandas/tests/io/parser/test_multi_thread.py | 9 +- pandas/tests/io/parser/test_parse_dates.py | 58 ++++-- pandas/tests/io/parser/test_read_fwf.py | 4 +- pandas/tests/io/parser/test_skiprows.py | 8 +- .../io/parser/usecols/test_parse_dates.py | 2 +- pandas/tests/io/pytables/test_store.py | 12 +- pandas/tests/io/test_fsspec.py | 16 +- pandas/tests/io/test_gcs.py | 6 +- pandas/tests/io/test_html.py | 14 +- pandas/tests/io/test_orc.py | 2 + pandas/tests/io/test_parquet.py | 44 +++- pandas/tests/io/test_sql.py | 52 +++-- pandas/tests/io/test_stata.py | 23 +- pandas/tests/resample/test_base.py | 3 +- pandas/tests/resample/test_time_grouper.py | 12 +- .../reshape/concat/test_append_common.py | 4 +- pandas/tests/reshape/concat/test_datetimes.py | 4 +- pandas/tests/reshape/test_cut.py | 52 +++-- pandas/tests/reshape/test_qcut.py | 4 +- pandas/tests/scalar/test_nat.py | 6 +- .../series/methods/test_combine_first.py | 2 +- pandas/tests/series/methods/test_fillna.py | 2 +- pandas/tests/series/methods/test_to_csv.py | 5 +- pandas/tests/series/test_constructors.py | 34 ++- pandas/tests/test_algos.py | 1 + pandas/tests/tools/test_to_datetime.py | 197 +++++++++++------- pandas/tests/tools/test_to_timedelta.py | 2 +- pandas/tests/tseries/holiday/test_calendar.py | 4 +- pandas/tests/tslibs/test_array_to_datetime.py | 50 +++-- pandas/tests/util/test_hashing.py | 16 +- 77 files changed, 745 insertions(+), 457 deletions(-) diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index 6a6abcf2d48fe..865996bdf8892 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -124,6 +124,69 @@ notable_bug_fix2 Backwards incompatible API changes ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +.. _whatsnew_300.api_breaking.datetime_resolution_inference: + +Datetime resolution inference +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Converting a sequence of strings, ``datetime`` objects, or ``np.datetime64`` objects to +a ``datetime64`` dtype now performs inference on the appropriate resolution (AKA unit) for the output dtype. This affects :class:`Series`, :class:`DataFrame`, :class:`Index`, :class:`DatetimeIndex`, and :func:`to_datetime`. + +Previously, these would always give nanosecond resolution: + +.. code-block:: ipython + + In [1]: dt = pd.Timestamp("2024-03-22 11:36").to_pydatetime() + In [2]: pd.to_datetime([dt]).dtype + Out[2]: dtype('>> pd.unique(pd.Series([pd.Timestamp("20160101"), pd.Timestamp("20160101")])) - array(['2016-01-01T00:00:00.000000000'], dtype='datetime64[ns]') + array(['2016-01-01T00:00:00'], dtype='datetime64[s]') >>> pd.unique( ... pd.Series( ... [ ... pd.Timestamp("20160101", tz="US/Eastern"), ... pd.Timestamp("20160101", tz="US/Eastern"), - ... ] + ... ], + ... dtype="M8[ns, US/Eastern]", ... ) ... ) @@ -365,7 +366,8 @@ def unique(values): ... [ ... pd.Timestamp("20160101", tz="US/Eastern"), ... pd.Timestamp("20160101", tz="US/Eastern"), - ... ] + ... ], + ... dtype="M8[ns, US/Eastern]", ... ) ... ) DatetimeIndex(['2016-01-01 00:00:00-05:00'], diff --git a/pandas/core/arrays/datetimelike.py b/pandas/core/arrays/datetimelike.py index 925858a20ce41..673001337767b 100644 --- a/pandas/core/arrays/datetimelike.py +++ b/pandas/core/arrays/datetimelike.py @@ -1849,11 +1849,11 @@ def strftime(self, date_format: str) -> npt.NDArray[np.object_]: >>> rng_tz.floor("2h", ambiguous=False) DatetimeIndex(['2021-10-31 02:00:00+01:00'], - dtype='datetime64[ns, Europe/Amsterdam]', freq=None) + dtype='datetime64[s, Europe/Amsterdam]', freq=None) >>> rng_tz.floor("2h", ambiguous=True) DatetimeIndex(['2021-10-31 02:00:00+02:00'], - dtype='datetime64[ns, Europe/Amsterdam]', freq=None) + dtype='datetime64[s, Europe/Amsterdam]', freq=None) """ _floor_example = """>>> rng.floor('h') @@ -1876,11 +1876,11 @@ def strftime(self, date_format: str) -> npt.NDArray[np.object_]: >>> rng_tz.floor("2h", ambiguous=False) DatetimeIndex(['2021-10-31 02:00:00+01:00'], - dtype='datetime64[ns, Europe/Amsterdam]', freq=None) + dtype='datetime64[s, Europe/Amsterdam]', freq=None) >>> rng_tz.floor("2h", ambiguous=True) DatetimeIndex(['2021-10-31 02:00:00+02:00'], - dtype='datetime64[ns, Europe/Amsterdam]', freq=None) + dtype='datetime64[s, Europe/Amsterdam]', freq=None) """ _ceil_example = """>>> rng.ceil('h') @@ -1903,11 +1903,11 @@ def strftime(self, date_format: str) -> npt.NDArray[np.object_]: >>> rng_tz.ceil("h", ambiguous=False) DatetimeIndex(['2021-10-31 02:00:00+01:00'], - dtype='datetime64[ns, Europe/Amsterdam]', freq=None) + dtype='datetime64[s, Europe/Amsterdam]', freq=None) >>> rng_tz.ceil("h", ambiguous=True) DatetimeIndex(['2021-10-31 02:00:00+02:00'], - dtype='datetime64[ns, Europe/Amsterdam]', freq=None) + dtype='datetime64[s, Europe/Amsterdam]', freq=None) """ diff --git a/pandas/core/arrays/datetimes.py b/pandas/core/arrays/datetimes.py index b075e3d299ed0..bbbf7a9b4a63a 100644 --- a/pandas/core/arrays/datetimes.py +++ b/pandas/core/arrays/datetimes.py @@ -218,7 +218,7 @@ class DatetimeArray(dtl.TimelikeOps, dtl.DatelikeOps): # type: ignore[misc] ... ) ['2023-01-01 00:00:00', '2023-01-02 00:00:00'] - Length: 2, dtype: datetime64[ns] + Length: 2, dtype: datetime64[s] """ _typ = "datetimearray" @@ -613,7 +613,7 @@ def tz(self) -> tzinfo | None: >>> s 0 2020-01-01 10:00:00+00:00 1 2020-02-01 11:00:00+00:00 - dtype: datetime64[ns, UTC] + dtype: datetime64[s, UTC] >>> s.dt.tz datetime.timezone.utc @@ -1047,7 +1047,7 @@ def tz_localize( 4 2018-10-28 02:30:00+01:00 5 2018-10-28 03:00:00+01:00 6 2018-10-28 03:30:00+01:00 - dtype: datetime64[ns, CET] + dtype: datetime64[s, CET] In some cases, inferring the DST is impossible. In such cases, you can pass an ndarray to the ambiguous parameter to set the DST explicitly @@ -1059,14 +1059,14 @@ def tz_localize( 0 2018-10-28 01:20:00+02:00 1 2018-10-28 02:36:00+02:00 2 2018-10-28 03:46:00+01:00 - dtype: datetime64[ns, CET] + dtype: datetime64[s, CET] If the DST transition causes nonexistent times, you can shift these dates forward or backwards with a timedelta object or `'shift_forward'` or `'shift_backwards'`. >>> s = pd.to_datetime(pd.Series(['2015-03-29 02:30:00', - ... '2015-03-29 03:30:00'])) + ... '2015-03-29 03:30:00'], dtype="M8[ns]")) >>> s.dt.tz_localize('Europe/Warsaw', nonexistent='shift_forward') 0 2015-03-29 03:00:00+02:00 1 2015-03-29 03:30:00+02:00 @@ -1427,7 +1427,7 @@ def time(self) -> npt.NDArray[np.object_]: >>> s 0 2020-01-01 10:00:00+00:00 1 2020-02-01 11:00:00+00:00 - dtype: datetime64[ns, UTC] + dtype: datetime64[s, UTC] >>> s.dt.time 0 10:00:00 1 11:00:00 @@ -1470,7 +1470,7 @@ def timetz(self) -> npt.NDArray[np.object_]: >>> s 0 2020-01-01 10:00:00+00:00 1 2020-02-01 11:00:00+00:00 - dtype: datetime64[ns, UTC] + dtype: datetime64[s, UTC] >>> s.dt.timetz 0 10:00:00+00:00 1 11:00:00+00:00 @@ -1512,7 +1512,7 @@ def date(self) -> npt.NDArray[np.object_]: >>> s 0 2020-01-01 10:00:00+00:00 1 2020-02-01 11:00:00+00:00 - dtype: datetime64[ns, UTC] + dtype: datetime64[s, UTC] >>> s.dt.date 0 2020-01-01 1 2020-02-01 @@ -1861,7 +1861,7 @@ def isocalendar(self) -> DataFrame: >>> s 0 2020-01-01 10:00:00+00:00 1 2020-02-01 11:00:00+00:00 - dtype: datetime64[ns, UTC] + dtype: datetime64[s, UTC] >>> s.dt.dayofyear 0 1 1 32 @@ -1897,7 +1897,7 @@ def isocalendar(self) -> DataFrame: >>> s 0 2020-01-01 10:00:00+00:00 1 2020-04-01 11:00:00+00:00 - dtype: datetime64[ns, UTC] + dtype: datetime64[s, UTC] >>> s.dt.quarter 0 1 1 2 @@ -1933,7 +1933,7 @@ def isocalendar(self) -> DataFrame: >>> s 0 2020-01-01 10:00:00+00:00 1 2020-02-01 11:00:00+00:00 - dtype: datetime64[ns, UTC] + dtype: datetime64[s, UTC] >>> s.dt.daysinmonth 0 31 1 29 @@ -2372,9 +2372,9 @@ def _sequence_to_dt64( data, copy = maybe_convert_dtype(data, copy, tz=tz) data_dtype = getattr(data, "dtype", None) - if out_unit is None: - out_unit = "ns" - out_dtype = np.dtype(f"M8[{out_unit}]") + out_dtype = DT64NS_DTYPE + if out_unit is not None: + out_dtype = np.dtype(f"M8[{out_unit}]") if data_dtype == object or is_string_dtype(data_dtype): # TODO: We do not have tests specific to string-dtypes, @@ -2400,7 +2400,7 @@ def _sequence_to_dt64( dayfirst=dayfirst, yearfirst=yearfirst, allow_object=False, - out_unit=out_unit or "ns", + out_unit=out_unit, ) copy = False if tz and inferred_tz: @@ -2508,7 +2508,7 @@ def objects_to_datetime64( utc: bool = False, errors: DateTimeErrorChoices = "raise", allow_object: bool = False, - out_unit: str = "ns", + out_unit: str | None = None, ) -> tuple[np.ndarray, tzinfo | None]: """ Convert data to array of timestamps. @@ -2524,7 +2524,8 @@ def objects_to_datetime64( allow_object : bool Whether to return an object-dtype ndarray instead of raising if the data contains more than one timezone. - out_unit : str, default "ns" + out_unit : str or None, default None + None indicates we should do resolution inference. Returns ------- diff --git a/pandas/core/base.py b/pandas/core/base.py index 5cdbde8c64c47..b784dc8b03292 100644 --- a/pandas/core/base.py +++ b/pandas/core/base.py @@ -1334,7 +1334,7 @@ def factorize( 0 2000-03-11 1 2000-03-12 2 2000-03-13 - dtype: datetime64[ns] + dtype: datetime64[s] >>> ser.searchsorted('3/14/2000') 3 diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py index 08adb580ff08f..662b8c5791e51 100644 --- a/pandas/core/dtypes/cast.py +++ b/pandas/core/dtypes/cast.py @@ -1193,7 +1193,7 @@ def maybe_infer_to_datetimelike( # numpy would have done it for us. convert_numeric=False, convert_non_numeric=True, - dtype_if_all_nat=np.dtype("M8[ns]"), + dtype_if_all_nat=np.dtype("M8[s]"), ) diff --git a/pandas/core/dtypes/dtypes.py b/pandas/core/dtypes/dtypes.py index 45814ca77b70f..5213be8b69016 100644 --- a/pandas/core/dtypes/dtypes.py +++ b/pandas/core/dtypes/dtypes.py @@ -205,7 +205,7 @@ class CategoricalDtype(PandasExtensionDtype, ExtensionDtype): by providing an empty index. As follows, >>> pd.CategoricalDtype(pd.DatetimeIndex([])).categories.dtype - dtype(' bool | npt.NDArray[np.bool_] | NDFrame: >>> index = pd.DatetimeIndex(["2017-07-05", "2017-07-06", None, "2017-07-08"]) >>> index DatetimeIndex(['2017-07-05', '2017-07-06', 'NaT', '2017-07-08'], - dtype='datetime64[ns]', freq=None) + dtype='datetime64[s]', freq=None) >>> pd.isna(index) array([False, False, True, False]) @@ -362,7 +362,7 @@ def notna(obj: object) -> bool | npt.NDArray[np.bool_] | NDFrame: >>> index = pd.DatetimeIndex(["2017-07-05", "2017-07-06", None, "2017-07-08"]) >>> index DatetimeIndex(['2017-07-05', '2017-07-06', 'NaT', '2017-07-08'], - dtype='datetime64[ns]', freq=None) + dtype='datetime64[s]', freq=None) >>> pd.notna(index) array([ True, True, False, True]) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 912b4353acacf..97a4e414608b8 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -13286,7 +13286,7 @@ def to_period( >>> idx DatetimeIndex(['2001-03-31', '2002-05-31', '2003-08-31'], - dtype='datetime64[ns]', freq=None) + dtype='datetime64[s]', freq=None) >>> idx.to_period("M") PeriodIndex(['2001-03', '2002-05', '2003-08'], dtype='period[M]') diff --git a/pandas/core/generic.py b/pandas/core/generic.py index ca60ca9b48a14..22eecdc95934f 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -3209,7 +3209,7 @@ class (index) object 32B 'bird' 'bird' 'mammal' 'mammal' Dimensions: (date: 2, animal: 2) Coordinates: - * date (date) datetime64[ns] 2018-01-01 2018-01-02 + * date (date) datetime64[s] 2018-01-01 2018-01-02 * animal (animal) object 'falcon' 'parrot' Data variables: speed (date, animal) int64 350 18 361 15 @@ -6194,7 +6194,7 @@ def dtypes(self): >>> df.dtypes float float64 int int64 - datetime datetime64[ns] + datetime datetime64[s] string object dtype: object """ @@ -10653,10 +10653,10 @@ def tz_localize( dates forward or backward with a timedelta object or `'shift_forward'` or `'shift_backward'`. - >>> s = pd.Series( - ... range(2), - ... index=pd.DatetimeIndex(["2015-03-29 02:30:00", "2015-03-29 03:30:00"]), + >>> dti = pd.DatetimeIndex( + ... ["2015-03-29 02:30:00", "2015-03-29 03:30:00"], dtype="M8[ns]" ... ) + >>> s = pd.Series(range(2), index=dti) >>> s.tz_localize("Europe/Warsaw", nonexistent="shift_forward") 2015-03-29 03:00:00+02:00 0 2015-03-29 03:30:00+02:00 1 diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index a20577e8d3df9..0c4f22f736d4a 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -1206,7 +1206,7 @@ def idxmin(self, skipna: bool = True) -> Series: >>> ser.groupby(["a", "a", "b", "b"]).idxmin() a 2023-01-01 b 2023-02-01 - dtype: datetime64[ns] + dtype: datetime64[s] """ return self._idxmax_idxmin("idxmin", skipna=skipna) @@ -1259,7 +1259,7 @@ def idxmax(self, skipna: bool = True) -> Series: >>> ser.groupby(["a", "a", "b", "b"]).idxmax() a 2023-01-15 b 2023-02-15 - dtype: datetime64[ns] + dtype: datetime64[s] """ return self._idxmax_idxmin("idxmax", skipna=skipna) diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 6a3fb8bc851df..56030a15dc143 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -2634,7 +2634,7 @@ def isna(self) -> npt.NDArray[np.bool_]: ... ) >>> idx DatetimeIndex(['1940-04-25', 'NaT', 'NaT', 'NaT'], - dtype='datetime64[ns]', freq=None) + dtype='datetime64[s]', freq=None) >>> idx.isna() array([False, True, True, True]) """ diff --git a/pandas/core/indexes/datetimes.py b/pandas/core/indexes/datetimes.py index 78f04f57029b1..930bc7a95bd14 100644 --- a/pandas/core/indexes/datetimes.py +++ b/pandas/core/indexes/datetimes.py @@ -242,7 +242,7 @@ class DatetimeIndex(DatetimeTimedeltaMixin): >>> idx = pd.DatetimeIndex(["1/1/2020 10:00:00+00:00", "2/1/2020 11:00:00+00:00"]) >>> idx DatetimeIndex(['2020-01-01 10:00:00+00:00', '2020-02-01 11:00:00+00:00'], - dtype='datetime64[ns, UTC]', freq=None) + dtype='datetime64[s, UTC]', freq=None) """ _typ = "datetimeindex" @@ -473,7 +473,8 @@ def snap(self, freq: Frequency = "S") -> DatetimeIndex: Examples -------- >>> idx = pd.DatetimeIndex( - ... ["2023-01-01", "2023-01-02", "2023-02-01", "2023-02-02"] + ... ["2023-01-01", "2023-01-02", "2023-02-01", "2023-02-02"], + ... dtype="M8[ns]", ... ) >>> idx DatetimeIndex(['2023-01-01', '2023-01-02', '2023-02-01', '2023-02-02'], diff --git a/pandas/core/series.py b/pandas/core/series.py index c49eef49f7393..f67c0753fa9df 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -2060,14 +2060,14 @@ def unique(self) -> ArrayLike: >>> pd.Series([pd.Timestamp("2016-01-01") for _ in range(3)]).unique() ['2016-01-01 00:00:00'] - Length: 1, dtype: datetime64[ns] + Length: 1, dtype: datetime64[s] >>> pd.Series( ... [pd.Timestamp("2016-01-01", tz="US/Eastern") for _ in range(3)] ... ).unique() ['2016-01-01 00:00:00-05:00'] - Length: 1, dtype: datetime64[ns, US/Eastern] + Length: 1, dtype: datetime64[s, US/Eastern] An Categorical will return categories in the order of appearance and with the same dtype. @@ -3175,6 +3175,7 @@ def combine_first(self, other) -> Series: other = other.reindex(keep_other) if this.dtype.kind == "M" and other.dtype.kind != "M": + # TODO: try to match resos? other = to_datetime(other) combined = concat([this, other]) combined = combined.reindex(new_index) diff --git a/pandas/core/tools/datetimes.py b/pandas/core/tools/datetimes.py index b01cdb335ec46..c116ef015ae16 100644 --- a/pandas/core/tools/datetimes.py +++ b/pandas/core/tools/datetimes.py @@ -29,6 +29,7 @@ timezones as libtimezones, ) from pandas._libs.tslibs.conversion import cast_from_unit_vectorized +from pandas._libs.tslibs.dtypes import NpyDatetimeUnit from pandas._libs.tslibs.parsing import ( DateParseError, guess_datetime_format, @@ -524,6 +525,7 @@ def _to_datetime_with_unit(arg, unit, name, utc: bool, errors: str) -> Index: utc=utc, errors=errors, unit_for_numerics=unit, + creso=NpyDatetimeUnit.NPY_FR_ns.value, ) result = DatetimeIndex(arr, name=name) @@ -873,7 +875,7 @@ def to_datetime( >>> pd.to_datetime(df) 0 2015-02-04 1 2016-03-05 - dtype: datetime64[ns] + dtype: datetime64[s] Using a unix epoch time @@ -903,7 +905,7 @@ def to_datetime( Passing ``errors='coerce'`` will force an out-of-bounds date to :const:`NaT`, in addition to forcing non-dates (or non-parseable dates) to :const:`NaT`. - >>> pd.to_datetime("13000101", format="%Y%m%d", errors="coerce") + >>> pd.to_datetime("invalid for Ymd", format="%Y%m%d", errors="coerce") NaT .. _to_datetime_tz_examples: @@ -916,14 +918,14 @@ def to_datetime( >>> pd.to_datetime(["2018-10-26 12:00:00", "2018-10-26 13:00:15"]) DatetimeIndex(['2018-10-26 12:00:00', '2018-10-26 13:00:15'], - dtype='datetime64[ns]', freq=None) + dtype='datetime64[s]', freq=None) - Timezone-aware inputs *with constant time offset* are converted to timezone-aware :class:`DatetimeIndex`: >>> pd.to_datetime(["2018-10-26 12:00 -0500", "2018-10-26 13:00 -0500"]) DatetimeIndex(['2018-10-26 12:00:00-05:00', '2018-10-26 13:00:00-05:00'], - dtype='datetime64[ns, UTC-05:00]', freq=None) + dtype='datetime64[s, UTC-05:00]', freq=None) - However, timezone-aware inputs *with mixed time offsets* (for example issued from a timezone with daylight savings, such as Europe/Paris) @@ -965,21 +967,21 @@ def to_datetime( >>> pd.to_datetime(["2018-10-26 12:00", "2018-10-26 13:00"], utc=True) DatetimeIndex(['2018-10-26 12:00:00+00:00', '2018-10-26 13:00:00+00:00'], - dtype='datetime64[ns, UTC]', freq=None) + dtype='datetime64[s, UTC]', freq=None) - Timezone-aware inputs are *converted* to UTC (the output represents the exact same datetime, but viewed from the UTC time offset `+00:00`). >>> pd.to_datetime(["2018-10-26 12:00 -0530", "2018-10-26 12:00 -0500"], utc=True) DatetimeIndex(['2018-10-26 17:30:00+00:00', '2018-10-26 17:00:00+00:00'], - dtype='datetime64[ns, UTC]', freq=None) + dtype='datetime64[s, UTC]', freq=None) - Inputs can contain both string or datetime, the above rules still apply >>> pd.to_datetime(["2018-10-26 12:00", datetime(2020, 1, 1, 18)], utc=True) DatetimeIndex(['2018-10-26 12:00:00+00:00', '2020-01-01 18:00:00+00:00'], - dtype='datetime64[ns, UTC]', freq=None) + dtype='datetime64[us, UTC]', freq=None) """ if exact is not lib.no_default and format in {"mixed", "ISO8601"}: raise ValueError("Cannot use 'exact' when 'format' is 'mixed' or 'ISO8601'") diff --git a/pandas/tests/arithmetic/test_period.py b/pandas/tests/arithmetic/test_period.py index 18f1993c198df..539df9d61a7b2 100644 --- a/pandas/tests/arithmetic/test_period.py +++ b/pandas/tests/arithmetic/test_period.py @@ -1361,7 +1361,12 @@ def test_period_add_timestamp_raises(self, box_with_array): arr + ts with pytest.raises(TypeError, match=msg): ts + arr - msg = "cannot add PeriodArray and DatetimeArray" + if box_with_array is pd.DataFrame: + # TODO: before implementing resolution-inference we got the same + # message with DataFrame and non-DataFrame. Why did that change? + msg = "cannot add PeriodArray and Timestamp" + else: + msg = "cannot add PeriodArray and DatetimeArray" with pytest.raises(TypeError, match=msg): arr + Series([ts]) with pytest.raises(TypeError, match=msg): diff --git a/pandas/tests/arrays/categorical/test_missing.py b/pandas/tests/arrays/categorical/test_missing.py index 9d4b78ce9944e..e3cb9664e19f2 100644 --- a/pandas/tests/arrays/categorical/test_missing.py +++ b/pandas/tests/arrays/categorical/test_missing.py @@ -121,7 +121,7 @@ def test_compare_categorical_with_missing(self, a1, a2, categories): @pytest.mark.parametrize( "na_value, dtype", [ - (pd.NaT, "datetime64[ns]"), + (pd.NaT, "datetime64[s]"), (None, "float64"), (np.nan, "float64"), (pd.NA, "float64"), diff --git a/pandas/tests/arrays/test_array.py b/pandas/tests/arrays/test_array.py index 857509e18fa8e..97d57163ed079 100644 --- a/pandas/tests/arrays/test_array.py +++ b/pandas/tests/arrays/test_array.py @@ -125,7 +125,7 @@ def test_dt64_array(dtype_unit): ( pd.DatetimeIndex(["2000", "2001"]), None, - DatetimeArray._from_sequence(["2000", "2001"], dtype="M8[ns]"), + DatetimeArray._from_sequence(["2000", "2001"], dtype="M8[s]"), ), ( ["2000", "2001"], @@ -301,11 +301,11 @@ def test_array_copy(): # datetime ( [pd.Timestamp("2000"), pd.Timestamp("2001")], - DatetimeArray._from_sequence(["2000", "2001"], dtype="M8[ns]"), + DatetimeArray._from_sequence(["2000", "2001"], dtype="M8[s]"), ), ( [datetime.datetime(2000, 1, 1), datetime.datetime(2001, 1, 1)], - DatetimeArray._from_sequence(["2000", "2001"], dtype="M8[ns]"), + DatetimeArray._from_sequence(["2000", "2001"], dtype="M8[us]"), ), ( np.array([1, 2], dtype="M8[ns]"), @@ -321,7 +321,7 @@ def test_array_copy(): ( [pd.Timestamp("2000", tz="CET"), pd.Timestamp("2001", tz="CET")], DatetimeArray._from_sequence( - ["2000", "2001"], dtype=pd.DatetimeTZDtype(tz="CET", unit="ns") + ["2000", "2001"], dtype=pd.DatetimeTZDtype(tz="CET", unit="s") ), ), ( @@ -330,7 +330,7 @@ def test_array_copy(): datetime.datetime(2001, 1, 1, tzinfo=cet), ], DatetimeArray._from_sequence( - ["2000", "2001"], dtype=pd.DatetimeTZDtype(tz=cet, unit="ns") + ["2000", "2001"], dtype=pd.DatetimeTZDtype(tz=cet, unit="us") ), ), # timedelta diff --git a/pandas/tests/base/test_constructors.py b/pandas/tests/base/test_constructors.py index f3ac60f672ee1..c4b02423f8cf0 100644 --- a/pandas/tests/base/test_constructors.py +++ b/pandas/tests/base/test_constructors.py @@ -146,10 +146,12 @@ def test_constructor_datetime_outofbound( # No dtype specified (dtype inference) # datetime64[non-ns] raise error, other cases result in object dtype # and preserve original data - if a.dtype.kind == "M": + result = constructor(a) + if a.dtype.kind == "M" or isinstance(a[0], np.datetime64): # Can't fit in nanosecond bounds -> get the nearest supported unit - result = constructor(a) assert result.dtype == "M8[s]" + elif isinstance(a[0], datetime): + assert result.dtype == "M8[us]", result.dtype else: result = constructor(a) if using_infer_string and "object-string" in request.node.callspec.id: diff --git a/pandas/tests/base/test_conversion.py b/pandas/tests/base/test_conversion.py index 6c0df49b0a93a..dd6bf3c7521f8 100644 --- a/pandas/tests/base/test_conversion.py +++ b/pandas/tests/base/test_conversion.py @@ -412,7 +412,7 @@ def test_to_numpy_dtype(as_series): [Timestamp("2000"), Timestamp("2000"), pd.NaT], None, Timestamp("2000"), - [np.datetime64("2000-01-01T00:00:00.000000000")] * 3, + [np.datetime64("2000-01-01T00:00:00", "s")] * 3, ), ], ) @@ -454,7 +454,7 @@ def test_to_numpy_na_value_numpy_dtype( [(0, Timestamp("2021")), (0, Timestamp("2022")), (1, Timestamp("2000"))], None, Timestamp("2000"), - [np.datetime64("2000-01-01T00:00:00.000000000")] * 3, + [np.datetime64("2000-01-01T00:00:00", "s")] * 3, ), ], ) diff --git a/pandas/tests/dtypes/test_inference.py b/pandas/tests/dtypes/test_inference.py index f4282c9c7ac3a..db18cd4aef14e 100644 --- a/pandas/tests/dtypes/test_inference.py +++ b/pandas/tests/dtypes/test_inference.py @@ -830,7 +830,11 @@ def test_maybe_convert_objects_datetime_overflow_safe(self, dtype): out = lib.maybe_convert_objects(arr, convert_non_numeric=True) # no OutOfBoundsDatetime/OutOfBoundsTimedeltas - tm.assert_numpy_array_equal(out, arr) + if dtype == "datetime64[ns]": + expected = np.array(["2363-10-04"], dtype="M8[us]") + else: + expected = arr + tm.assert_numpy_array_equal(out, expected) def test_maybe_convert_objects_mixed_datetimes(self): ts = Timestamp("now") diff --git a/pandas/tests/extension/test_arrow.py b/pandas/tests/extension/test_arrow.py index 7d31fe6085c3a..5926d23b44dd0 100644 --- a/pandas/tests/extension/test_arrow.py +++ b/pandas/tests/extension/test_arrow.py @@ -3445,7 +3445,7 @@ def test_arrow_floor_division_large_divisor(dtype): def test_string_to_datetime_parsing_cast(): # GH 56266 string_dates = ["2020-01-01 04:30:00", "2020-01-02 00:00:00", "2020-01-03 00:00:00"] - result = pd.Series(string_dates, dtype="timestamp[ns][pyarrow]") + result = pd.Series(string_dates, dtype="timestamp[s][pyarrow]") expected = pd.Series( ArrowExtensionArray(pa.array(pd.to_datetime(string_dates), from_pandas=True)) ) diff --git a/pandas/tests/frame/constructors/test_from_records.py b/pandas/tests/frame/constructors/test_from_records.py index 66fc234e79b4d..35e143fcedf7b 100644 --- a/pandas/tests/frame/constructors/test_from_records.py +++ b/pandas/tests/frame/constructors/test_from_records.py @@ -39,7 +39,7 @@ def test_from_records_with_datetimes(self): expected = DataFrame({"EXPIRY": [datetime(2005, 3, 1, 0, 0), None]}) arrdata = [np.array([datetime(2005, 3, 1, 0, 0), None])] - dtypes = [("EXPIRY", " None: ) with tm.assert_produces_warning(FutureWarning, match=msg): result = date_range("2010-01-01", periods=2, freq="m") - expected = DatetimeIndex(["2010-01-31", "2010-02-28"], freq="ME") + expected = DatetimeIndex( + ["2010-01-31", "2010-02-28"], dtype="M8[ns]", freq="ME" + ) tm.assert_index_equal(result, expected) def test_date_range_bday(self): diff --git a/pandas/tests/indexes/test_base.py b/pandas/tests/indexes/test_base.py index 2e94961b673f8..bd38e6c2ff333 100644 --- a/pandas/tests/indexes/test_base.py +++ b/pandas/tests/indexes/test_base.py @@ -186,7 +186,7 @@ def test_constructor_int_dtype_nan(self): "klass,dtype,na_val", [ (Index, np.float64, np.nan), - (DatetimeIndex, "datetime64[ns]", pd.NaT), + (DatetimeIndex, "datetime64[s]", pd.NaT), ], ) def test_index_ctor_infer_nan_nat(self, klass, dtype, na_val): diff --git a/pandas/tests/indexes/test_index_new.py b/pandas/tests/indexes/test_index_new.py index b544ebac43ece..4a31ae88a757a 100644 --- a/pandas/tests/indexes/test_index_new.py +++ b/pandas/tests/indexes/test_index_new.py @@ -61,16 +61,16 @@ def test_infer_nat(self, val): values = [NaT, val] idx = Index(values) - assert idx.dtype == "datetime64[ns]" and idx.isna().all() + assert idx.dtype == "datetime64[s]" and idx.isna().all() idx = Index(values[::-1]) - assert idx.dtype == "datetime64[ns]" and idx.isna().all() + assert idx.dtype == "datetime64[s]" and idx.isna().all() idx = Index(np.array(values, dtype=object)) - assert idx.dtype == "datetime64[ns]" and idx.isna().all() + assert idx.dtype == "datetime64[s]" and idx.isna().all() idx = Index(np.array(values, dtype=object)[::-1]) - assert idx.dtype == "datetime64[ns]" and idx.isna().all() + assert idx.dtype == "datetime64[s]" and idx.isna().all() @pytest.mark.parametrize("na_value", [None, np.nan]) @pytest.mark.parametrize("vtype", [list, tuple, iter]) @@ -138,6 +138,9 @@ def test_constructor_infer_nat_dt_like( ) expected = klass([NaT, NaT]) + if dtype[0] == "d": + # we infer all-NaT as second resolution + expected = expected.astype("M8[ns]") assert expected.dtype == dtype data = [ctor] data.insert(pos, nulls_fixture) diff --git a/pandas/tests/indexing/test_coercion.py b/pandas/tests/indexing/test_coercion.py index d4bc0341e732e..84cd0d3b08b7b 100644 --- a/pandas/tests/indexing/test_coercion.py +++ b/pandas/tests/indexing/test_coercion.py @@ -598,7 +598,7 @@ def test_fillna_complex128(self, index_or_series, fill_val, fill_dtype): @pytest.mark.parametrize( "fill_val,fill_dtype", [ - (pd.Timestamp("2012-01-01"), "datetime64[ns]"), + (pd.Timestamp("2012-01-01"), "datetime64[s]"), (pd.Timestamp("2012-01-01", tz="US/Eastern"), object), (1, object), ("x", object), @@ -615,7 +615,7 @@ def test_fillna_datetime(self, index_or_series, fill_val, fill_dtype): pd.Timestamp("2011-01-04"), ] ) - assert obj.dtype == "datetime64[ns]" + assert obj.dtype == "datetime64[s]" exp = klass( [ @@ -630,10 +630,10 @@ def test_fillna_datetime(self, index_or_series, fill_val, fill_dtype): @pytest.mark.parametrize( "fill_val,fill_dtype", [ - (pd.Timestamp("2012-01-01", tz="US/Eastern"), "datetime64[ns, US/Eastern]"), + (pd.Timestamp("2012-01-01", tz="US/Eastern"), "datetime64[s, US/Eastern]"), (pd.Timestamp("2012-01-01"), object), # pre-2.0 with a mismatched tz we would get object result - (pd.Timestamp("2012-01-01", tz="Asia/Tokyo"), "datetime64[ns, US/Eastern]"), + (pd.Timestamp("2012-01-01", tz="Asia/Tokyo"), "datetime64[s, US/Eastern]"), (1, object), ("x", object), ], @@ -650,7 +650,7 @@ def test_fillna_datetime64tz(self, index_or_series, fill_val, fill_dtype): pd.Timestamp("2011-01-04", tz=tz), ] ) - assert obj.dtype == "datetime64[ns, US/Eastern]" + assert obj.dtype == "datetime64[s, US/Eastern]" if getattr(fill_val, "tz", None) is None: fv = fill_val @@ -830,6 +830,7 @@ def replacer(self, how, from_key, to_key): def test_replace_series(self, how, to_key, from_key, replacer): index = pd.Index([3, 4], name="xxx") obj = pd.Series(self.rep[from_key], index=index, name="yyy") + obj = obj.astype(from_key) assert obj.dtype == from_key if from_key.startswith("datetime") and to_key.startswith("datetime"): @@ -850,7 +851,6 @@ def test_replace_series(self, how, to_key, from_key, replacer): else: exp = pd.Series(self.rep[to_key], index=index, name="yyy") - assert exp.dtype == to_key result = obj.replace(replacer) tm.assert_series_equal(result, exp, check_dtype=False) @@ -867,7 +867,7 @@ def test_replace_series_datetime_tz( self, how, to_key, from_key, replacer, using_infer_string ): index = pd.Index([3, 4], name="xyz") - obj = pd.Series(self.rep[from_key], index=index, name="yyy") + obj = pd.Series(self.rep[from_key], index=index, name="yyy").dt.as_unit("ns") assert obj.dtype == from_key exp = pd.Series(self.rep[to_key], index=index, name="yyy") @@ -891,7 +891,7 @@ def test_replace_series_datetime_tz( ) def test_replace_series_datetime_datetime(self, how, to_key, from_key, replacer): index = pd.Index([3, 4], name="xyz") - obj = pd.Series(self.rep[from_key], index=index, name="yyy") + obj = pd.Series(self.rep[from_key], index=index, name="yyy").dt.as_unit("ns") assert obj.dtype == from_key exp = pd.Series(self.rep[to_key], index=index, name="yyy") @@ -900,8 +900,8 @@ def test_replace_series_datetime_datetime(self, how, to_key, from_key, replacer) ): # with mismatched tzs, we retain the original dtype as of 2.0 exp = exp.astype(obj.dtype) - else: - assert exp.dtype == to_key + elif to_key == from_key: + exp = exp.dt.as_unit("ns") result = obj.replace(replacer) tm.assert_series_equal(result, exp, check_dtype=False) diff --git a/pandas/tests/indexing/test_loc.py b/pandas/tests/indexing/test_loc.py index 01dab14c7e528..16f3e0fd0c229 100644 --- a/pandas/tests/indexing/test_loc.py +++ b/pandas/tests/indexing/test_loc.py @@ -711,7 +711,7 @@ def test_loc_modify_datetime(self): {"date": [1485264372711, 1485265925110, 1540215845888, 1540282121025]} ) - df["date_dt"] = to_datetime(df["date"], unit="ms", cache=True) + df["date_dt"] = to_datetime(df["date"], unit="ms", cache=True).dt.as_unit("ms") df.loc[:, "date_dt_cp"] = df.loc[:, "date_dt"] df.loc[[2, 3], "date_dt_cp"] = df.loc[[2, 3], "date_dt"] @@ -865,6 +865,7 @@ def test_loc_setitem_frame_multiples(self): "val": Series([0, 1, 0, 1, 2], dtype=np.int64), } ) + expected["date"] = expected["date"].astype("M8[ns]") rhs = df.loc[0:2] rhs.index = df.index[2:5] df.loc[2:4] = rhs @@ -1814,7 +1815,7 @@ def test_loc_getitem_datetime_string_with_datetimeindex(self): result = df.loc[["2010-01-01", "2010-01-05"], ["a", "b"]] expected = DataFrame( {"a": [0, 4], "b": [0, 4]}, - index=DatetimeIndex(["2010-01-01", "2010-01-05"]), + index=DatetimeIndex(["2010-01-01", "2010-01-05"]).as_unit("ns"), ) tm.assert_frame_equal(result, expected) @@ -2082,7 +2083,7 @@ def test_setitem_with_expansion(self): expected = Series([v[0].tz_convert("UTC"), df.loc[1, "time"]], name="time") tm.assert_series_equal(df2.time, expected) - v = df.loc[df.new_col == "new", "time"] + Timedelta("1s") + v = df.loc[df.new_col == "new", "time"] + Timedelta("1s").as_unit("s") df.loc[df.new_col == "new", "time"] = v tm.assert_series_equal(df.loc[df.new_col == "new", "time"], v) diff --git a/pandas/tests/indexing/test_partial.py b/pandas/tests/indexing/test_partial.py index b0a041ed5b69c..4d232d5ed1312 100644 --- a/pandas/tests/indexing/test_partial.py +++ b/pandas/tests/indexing/test_partial.py @@ -580,7 +580,7 @@ def test_partial_set_invalid(self): ], ), ( - date_range(start="2000", periods=20, freq="D"), + date_range(start="2000", periods=20, freq="D", unit="s"), ["2000-01-04", "2000-01-08", "2000-01-12"], [ Timestamp("2000-01-04"), diff --git a/pandas/tests/interchange/test_impl.py b/pandas/tests/interchange/test_impl.py index 60e05c2c65124..64eca6ac643ca 100644 --- a/pandas/tests/interchange/test_impl.py +++ b/pandas/tests/interchange/test_impl.py @@ -603,7 +603,8 @@ def test_empty_dataframe(): ), ( pd.Series( - [datetime(2022, 1, 1), datetime(2022, 1, 2), datetime(2022, 1, 3)] + [datetime(2022, 1, 1), datetime(2022, 1, 2), datetime(2022, 1, 3)], + dtype="M8[ns]", ), (DtypeKind.DATETIME, 64, "tsn:", "="), (DtypeKind.INT, 64, ArrowCTypes.INT64, "="), diff --git a/pandas/tests/io/excel/test_readers.py b/pandas/tests/io/excel/test_readers.py index f0a72ba6163fa..6d6c3ad6b77a7 100644 --- a/pandas/tests/io/excel/test_readers.py +++ b/pandas/tests/io/excel/test_readers.py @@ -141,10 +141,13 @@ def df_ref(datapath): def get_exp_unit(read_ext: str, engine: str | None) -> str: - return "ns" + unit = "us" + if (read_ext == ".ods") ^ (engine == "calamine"): + unit = "s" + return unit -def adjust_expected(expected: DataFrame, read_ext: str, engine: str) -> None: +def adjust_expected(expected: DataFrame, read_ext: str, engine: str | None) -> None: expected.index.name = None unit = get_exp_unit(read_ext, engine) # error: "Index" has no attribute "as_unit" @@ -1117,7 +1120,6 @@ def test_read_excel_multiindex_blank_after_name( mi = MultiIndex.from_product([["foo", "bar"], ["a", "b"]], names=["c1", "c2"]) unit = get_exp_unit(read_ext, engine) - expected = DataFrame( [ [1, 2.5, pd.Timestamp("2015-01-01"), True], @@ -1675,6 +1677,7 @@ def test_read_datetime_multiindex(self, request, engine, read_ext): actual = pd.read_excel(excel, header=[0, 1], index_col=0, engine=engine) unit = get_exp_unit(read_ext, engine) + dti = pd.DatetimeIndex(["2020-02-29", "2020-03-01"], dtype=f"M8[{unit}]") expected_column_index = MultiIndex.from_arrays( [dti[:1], dti[1:]], diff --git a/pandas/tests/io/excel/test_writers.py b/pandas/tests/io/excel/test_writers.py index 859152db84b7d..744fe20e4995d 100644 --- a/pandas/tests/io/excel/test_writers.py +++ b/pandas/tests/io/excel/test_writers.py @@ -37,7 +37,9 @@ def get_exp_unit(path: str) -> str: - return "ns" + if path.endswith(".ods"): + return "s" + return "us" @pytest.fixture @@ -293,12 +295,15 @@ def test_read_excel_parse_dates(self, tmp_excel): tm.assert_frame_equal(df2, res) res = pd.read_excel(tmp_excel, parse_dates=["date_strings"], index_col=0) - tm.assert_frame_equal(df, res) + expected = df[:] + expected["date_strings"] = expected["date_strings"].astype("M8[s]") + tm.assert_frame_equal(res, expected) res = pd.read_excel( tmp_excel, parse_dates=["date_strings"], date_format="%m/%d/%Y", index_col=0 ) - tm.assert_frame_equal(df, res) + expected["date_strings"] = expected["date_strings"].astype("M8[s]") + tm.assert_frame_equal(expected, res) def test_multiindex_interval_datetimes(self, tmp_excel): # GH 30986 @@ -547,6 +552,7 @@ def test_sheets(self, frame, tmp_excel): columns=Index(list("ABCD")), index=date_range("2000-01-01", periods=5, freq="B"), ) + index = pd.DatetimeIndex(np.asarray(tsframe.index), freq=None) tsframe.index = index @@ -695,7 +701,6 @@ def test_excel_date_datetime_format(self, ext, tmp_excel, tmp_path): # # Excel output format strings unit = get_exp_unit(tmp_excel) - df = DataFrame( [ [date(2014, 1, 31), date(1999, 9, 24)], @@ -732,6 +737,9 @@ def test_excel_date_datetime_format(self, ext, tmp_excel, tmp_path): with ExcelFile(filename2) as reader2: rs2 = pd.read_excel(reader2, sheet_name="test1", index_col=0) + # TODO: why do we get different units? + rs2 = rs2.astype(f"M8[{unit}]") + tm.assert_frame_equal(rs1, rs2) # Since the reader returns a datetime object for dates, diff --git a/pandas/tests/io/json/test_pandas.py b/pandas/tests/io/json/test_pandas.py index c4065ea01988f..b53957a7e77d1 100644 --- a/pandas/tests/io/json/test_pandas.py +++ b/pandas/tests/io/json/test_pandas.py @@ -133,7 +133,13 @@ def test_frame_non_unique_index_raises(self, orient): [[Timestamp("20130101"), 3.5], [Timestamp("20130102"), 4.5]], ], ) - def test_frame_non_unique_columns(self, orient, data): + def test_frame_non_unique_columns(self, orient, data, request): + if isinstance(data[0][0], Timestamp) and orient == "split": + mark = pytest.mark.xfail( + reason="GH#55827 non-nanosecond dt64 fails to round-trip" + ) + request.applymarker(mark) + df = DataFrame(data, index=[1, 2], columns=["x", "x"]) expected_warning = None @@ -141,7 +147,7 @@ def test_frame_non_unique_columns(self, orient, data): "The default 'epoch' date format is deprecated and will be removed " "in a future version, please use 'iso' date format instead." ) - if df.iloc[:, 0].dtype == "datetime64[ns]": + if df.iloc[:, 0].dtype == "datetime64[s]": expected_warning = FutureWarning with tm.assert_produces_warning(expected_warning, match=msg): @@ -150,7 +156,7 @@ def test_frame_non_unique_columns(self, orient, data): ) if orient == "values": expected = DataFrame(data) - if expected.iloc[:, 0].dtype == "datetime64[ns]": + if expected.iloc[:, 0].dtype == "datetime64[s]": # orient == "values" by default will write Timestamp objects out # in milliseconds; these are internally stored in nanosecond, # so divide to get where we need @@ -856,6 +862,10 @@ def test_date_index_and_values(self, date_format, as_object, date_typ): data.append("a") ser = Series(data, index=data) + if not as_object: + ser = ser.astype("M8[ns]") + if isinstance(ser.index, DatetimeIndex): + ser.index = ser.index.as_unit("ns") expected_warning = None if date_format == "epoch": @@ -897,6 +907,7 @@ def test_convert_dates_infer(self, infer_word): expected = DataFrame( [[1, Timestamp("2002-11-08")], [2, pd.NaT]], columns=["id", infer_word] ) + expected[infer_word] = expected[infer_word].astype("M8[ns]") result = read_json(StringIO(ujson_dumps(data)))[["id", infer_word]] tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/io/parser/common/test_common_basic.py b/pandas/tests/io/parser/common/test_common_basic.py index df76b46cc6a7b..b665cfba8bdc0 100644 --- a/pandas/tests/io/parser/common/test_common_basic.py +++ b/pandas/tests/io/parser/common/test_common_basic.py @@ -40,9 +40,7 @@ def test_read_csv_local(all_parsers, csv1): fname = prefix + str(os.path.abspath(csv1)) result = parser.read_csv(fname, index_col=0, parse_dates=True) - # TODO: make unit check more specific - if parser.engine == "pyarrow": - result.index = result.index.as_unit("ns") + expected = DataFrame( [ [0.980269, 3.685731, -0.364216805298, -1.159738], @@ -64,6 +62,7 @@ def test_read_csv_local(all_parsers, csv1): datetime(2000, 1, 10), datetime(2000, 1, 11), ], + dtype="M8[s]", name="index", ), ) @@ -144,9 +143,6 @@ def test_read_csv_low_memory_no_rows_with_index(all_parsers): def test_read_csv_dataframe(all_parsers, csv1): parser = all_parsers result = parser.read_csv(csv1, index_col=0, parse_dates=True) - # TODO: make unit check more specific - if parser.engine == "pyarrow": - result.index = result.index.as_unit("ns") expected = DataFrame( [ [0.980269, 3.685731, -0.364216805298, -1.159738], @@ -168,6 +164,7 @@ def test_read_csv_dataframe(all_parsers, csv1): datetime(2000, 1, 10), datetime(2000, 1, 11), ], + dtype="M8[s]", name="index", ), ) diff --git a/pandas/tests/io/parser/common/test_index.py b/pandas/tests/io/parser/common/test_index.py index 2fcc80f58ae30..4cfc12cdc46aa 100644 --- a/pandas/tests/io/parser/common/test_index.py +++ b/pandas/tests/io/parser/common/test_index.py @@ -260,7 +260,8 @@ def test_read_csv_no_index_name(all_parsers, csv_dir_path): datetime(2000, 1, 5), datetime(2000, 1, 6), datetime(2000, 1, 7), - ] + ], + dtype="M8[s]", ), ) tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/io/parser/test_multi_thread.py b/pandas/tests/io/parser/test_multi_thread.py index 649a1324686a7..348c19ac0f0c6 100644 --- a/pandas/tests/io/parser/test_multi_thread.py +++ b/pandas/tests/io/parser/test_multi_thread.py @@ -152,7 +152,8 @@ def test_multi_thread_path_multipart_read_csv(all_parsers): with tm.ensure_clean(file_name) as path: df.to_csv(path) - final_dataframe = _generate_multi_thread_dataframe( - parser, path, num_rows, num_tasks - ) - tm.assert_frame_equal(df, final_dataframe) + result = _generate_multi_thread_dataframe(parser, path, num_rows, num_tasks) + + expected = df[:] + expected["date"] = expected["date"].astype("M8[s]") + tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/io/parser/test_parse_dates.py b/pandas/tests/io/parser/test_parse_dates.py index 3bb3d793606e1..e9c6c0f5e32d7 100644 --- a/pandas/tests/io/parser/test_parse_dates.py +++ b/pandas/tests/io/parser/test_parse_dates.py @@ -62,6 +62,7 @@ def test_date_col_as_index_col(all_parsers): datetime(1999, 1, 27, 21, 0), datetime(1999, 1, 27, 22, 0), ], + dtype="M8[s]", name="X1", ) expected = DataFrame( @@ -90,7 +91,7 @@ def test_nat_parse(all_parsers): df = DataFrame( { "A": np.arange(10, dtype="float64"), - "B": Timestamp("20010101").as_unit("ns"), + "B": Timestamp("20010101"), } ) df.iloc[3:6, :] = np.nan @@ -126,7 +127,7 @@ def test_parse_dates_string(all_parsers): parser = all_parsers result = parser.read_csv(StringIO(data), index_col="date", parse_dates=["date"]) # freq doesn't round-trip - index = date_range("1/1/2009", periods=3, name="date")._with_freq(None) + index = date_range("1/1/2009", periods=3, name="date", unit="s")._with_freq(None) expected = DataFrame( {"A": ["a", "b", "c"], "B": [1, 3, 4], "C": [2, 4, 5]}, index=index @@ -143,6 +144,8 @@ def test_parse_dates_column_list(all_parsers, parse_dates): expected = DataFrame( {"a": [datetime(2010, 1, 1)], "b": [1], "c": [datetime(2010, 2, 15)]} ) + expected["a"] = expected["a"].astype("M8[s]") + expected["c"] = expected["c"].astype("M8[s]") expected = expected.set_index(["a", "b"]) result = parser.read_csv( @@ -166,9 +169,10 @@ def test_multi_index_parse_dates(all_parsers, index_col): 20090103,three,c,4,5 """ parser = all_parsers + dti = date_range("2009-01-01", periods=3, freq="D", unit="s") index = MultiIndex.from_product( [ - (datetime(2009, 1, 1), datetime(2009, 1, 2), datetime(2009, 1, 3)), + dti, ("one", "two", "three"), ], names=["index1", "index2"], @@ -209,9 +213,6 @@ def test_parse_tz_aware(all_parsers): data = "Date,x\n2012-06-13T01:39:00Z,0.5" result = parser.read_csv(StringIO(data), index_col=0, parse_dates=True) - # TODO: make unit check more specific - if parser.engine == "pyarrow": - result.index = result.index.as_unit("ns") expected = DataFrame( {"x": [0.5]}, index=Index([Timestamp("2012-06-13 01:39:00+00:00")], name="Date") ) @@ -302,6 +303,7 @@ def test_parse_dates_empty_string(all_parsers): expected = DataFrame( [[datetime(2012, 1, 1), 1], [pd.NaT, 2]], columns=["Date", "test"] ) + expected["Date"] = expected["Date"].astype("M8[s]") tm.assert_frame_equal(result, expected) @@ -312,18 +314,22 @@ def test_parse_dates_empty_string(all_parsers): ( "a\n04.15.2016", {"parse_dates": ["a"]}, - DataFrame([datetime(2016, 4, 15)], columns=["a"]), + DataFrame([datetime(2016, 4, 15)], columns=["a"], dtype="M8[s]"), ), ( "a\n04.15.2016", {"parse_dates": True, "index_col": 0}, - DataFrame(index=DatetimeIndex(["2016-04-15"], name="a"), columns=[]), + DataFrame( + index=DatetimeIndex(["2016-04-15"], dtype="M8[s]", name="a"), columns=[] + ), ), ( "a,b\n04.15.2016,09.16.2013", {"parse_dates": ["a", "b"]}, DataFrame( - [[datetime(2016, 4, 15), datetime(2013, 9, 16)]], columns=["a", "b"] + [[datetime(2016, 4, 15), datetime(2013, 9, 16)]], + dtype="M8[s]", + columns=["a", "b"], ), ), ( @@ -331,7 +337,13 @@ def test_parse_dates_empty_string(all_parsers): {"parse_dates": True, "index_col": [0, 1]}, DataFrame( index=MultiIndex.from_tuples( - [(datetime(2016, 4, 15), datetime(2013, 9, 16))], names=["a", "b"] + [ + ( + Timestamp(2016, 4, 15).as_unit("s"), + Timestamp(2013, 9, 16).as_unit("s"), + ) + ], + names=["a", "b"], ), columns=[], ), @@ -399,6 +411,7 @@ def test_parse_timezone(all_parsers): end="2018-01-04 09:05:00", freq="1min", tz=timezone(timedelta(minutes=540)), + unit="s", )._with_freq(None) expected_data = {"dt": dti, "val": [23350, 23400, 23400, 23400, 23400]} @@ -437,7 +450,7 @@ def test_parse_delimited_date_swap_no_warning( all_parsers, date_string, dayfirst, expected, request ): parser = all_parsers - expected = DataFrame({0: [expected]}, dtype="datetime64[ns]") + expected = DataFrame({0: [expected]}, dtype="datetime64[s]") if parser.engine == "pyarrow": if not dayfirst: # "CSV parse error: Empty CSV file or block" @@ -470,7 +483,7 @@ def test_parse_delimited_date_swap_with_warning( all_parsers, date_string, dayfirst, expected ): parser = all_parsers - expected = DataFrame({0: [expected]}, dtype="datetime64[ns]") + expected = DataFrame({0: [expected]}, dtype="datetime64[s]") warning_msg = ( "Parsing dates in .* format when dayfirst=.* was specified. " "Pass `dayfirst=.*` or specify a format to silence this warning." @@ -555,9 +568,7 @@ def test_date_parser_multiindex_columns(all_parsers): 1,2 2019-12-31,6""" result = parser.read_csv(StringIO(data), parse_dates=[("a", "1")], header=[0, 1]) - expected = DataFrame( - {("a", "1"): Timestamp("2019-12-31").as_unit("ns"), ("b", "2"): [6]} - ) + expected = DataFrame({("a", "1"): Timestamp("2019-12-31"), ("b", "2"): [6]}) tm.assert_frame_equal(result, expected) @@ -591,6 +602,7 @@ def test_date_parser_usecols_thousands(all_parsers): thousands="-", ) expected = DataFrame({"B": [3, 4], "C": [Timestamp("20-09-2001 01:00:00")] * 2}) + expected["C"] = expected["C"].astype("M8[s]") tm.assert_frame_equal(result, expected) @@ -600,7 +612,7 @@ def test_dayfirst_warnings(): # CASE 1: valid input input = "date\n31/12/2014\n10/03/2011" expected = DatetimeIndex( - ["2014-12-31", "2011-03-10"], dtype="datetime64[ns]", freq=None, name="date" + ["2014-12-31", "2011-03-10"], dtype="datetime64[s]", freq=None, name="date" ) warning_msg = ( "Parsing dates in .* format when dayfirst=.* was specified. " @@ -661,7 +673,7 @@ def test_dayfirst_warnings_no_leading_zero(date_string, dayfirst): # GH47880 initial_value = f"date\n{date_string}" expected = DatetimeIndex( - ["2014-01-31"], dtype="datetime64[ns]", freq=None, name="date" + ["2014-01-31"], dtype="datetime64[s]", freq=None, name="date" ) warning_msg = ( "Parsing dates in .* format when dayfirst=.* was specified. " @@ -716,7 +728,8 @@ def test_replace_nans_before_parsing_dates(all_parsers): pd.NaT, Timestamp("2017-09-09"), ] - } + }, + dtype="M8[s]", ) tm.assert_frame_equal(result, expected) @@ -731,6 +744,7 @@ def test_parse_dates_and_string_dtype(all_parsers): result = parser.read_csv(StringIO(data), dtype="string", parse_dates=["b"]) expected = DataFrame({"a": ["1"], "b": [Timestamp("2019-12-31")]}) expected["a"] = expected["a"].astype("string") + expected["b"] = expected["b"].astype("M8[s]") tm.assert_frame_equal(result, expected) @@ -750,7 +764,7 @@ def test_parse_dot_separated_dates(all_parsers): else: expected_index = DatetimeIndex( ["2003-03-27 14:55:00", "2003-08-03 15:20:00"], - dtype="datetime64[ns]", + dtype="datetime64[ms]", name="a", ) warn = UserWarning @@ -783,7 +797,8 @@ def test_parse_dates_dict_format(all_parsers): { "a": [Timestamp("2019-12-31"), Timestamp("2020-12-31")], "b": [Timestamp("2019-12-31"), Timestamp("2020-12-31")], - } + }, + dtype="M8[s]", ) tm.assert_frame_equal(result, expected) @@ -816,9 +831,6 @@ def test_parse_dates_arrow_engine(all_parsers): 2000-01-01 00:00:01,1""" result = parser.read_csv(StringIO(data), parse_dates=["a"]) - # TODO: make unit check more specific - if parser.engine == "pyarrow": - result["a"] = result["a"].dt.as_unit("ns") expected = DataFrame( { "a": [ diff --git a/pandas/tests/io/parser/test_read_fwf.py b/pandas/tests/io/parser/test_read_fwf.py index 0a9f6bd83e0d9..45d630c545565 100644 --- a/pandas/tests/io/parser/test_read_fwf.py +++ b/pandas/tests/io/parser/test_read_fwf.py @@ -298,7 +298,8 @@ def test_fwf_regression(): "2009-06-13 20:40:00", "2009-06-13 20:50:00", "2009-06-13 21:00:00", - ] + ], + dtype="M8[us]", ), columns=["SST", "T010", "T020", "T030", "T060", "T080", "T100"], ) @@ -311,6 +312,7 @@ def test_fwf_regression(): parse_dates=True, date_format="%Y%j%H%M%S", ) + expected.index = expected.index.astype("M8[s]") tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/io/parser/test_skiprows.py b/pandas/tests/io/parser/test_skiprows.py index 17a806d05fe28..99642ee4befc6 100644 --- a/pandas/tests/io/parser/test_skiprows.py +++ b/pandas/tests/io/parser/test_skiprows.py @@ -42,7 +42,9 @@ def test_skip_rows_bug(all_parsers, skiprows): StringIO(text), skiprows=skiprows, header=None, index_col=0, parse_dates=True ) index = Index( - [datetime(2000, 1, 1), datetime(2000, 1, 2), datetime(2000, 1, 3)], name=0 + [datetime(2000, 1, 1), datetime(2000, 1, 2), datetime(2000, 1, 3)], + dtype="M8[s]", + name=0, ) expected = DataFrame( @@ -85,7 +87,9 @@ def test_skip_rows_blank(all_parsers): StringIO(text), skiprows=6, header=None, index_col=0, parse_dates=True ) index = Index( - [datetime(2000, 1, 1), datetime(2000, 1, 2), datetime(2000, 1, 3)], name=0 + [datetime(2000, 1, 1), datetime(2000, 1, 2), datetime(2000, 1, 3)], + dtype="M8[s]", + name=0, ) expected = DataFrame( diff --git a/pandas/tests/io/parser/usecols/test_parse_dates.py b/pandas/tests/io/parser/usecols/test_parse_dates.py index 0cf3fe894c916..cc54f2487aa60 100644 --- a/pandas/tests/io/parser/usecols/test_parse_dates.py +++ b/pandas/tests/io/parser/usecols/test_parse_dates.py @@ -70,7 +70,7 @@ def test_usecols_with_parse_dates3(all_parsers): parse_dates = [0] cols = { - "a": Timestamp("2016-09-21").as_unit("ns"), + "a": Timestamp("2016-09-21"), "b": [1], "c": [1], "d": [2], diff --git a/pandas/tests/io/pytables/test_store.py b/pandas/tests/io/pytables/test_store.py index 471f7b8958ee4..3ce30e313cc30 100644 --- a/pandas/tests/io/pytables/test_store.py +++ b/pandas/tests/io/pytables/test_store.py @@ -613,10 +613,14 @@ def test_store_index_name(setup_path): @pytest.mark.parametrize("table_format", ["table", "fixed"]) def test_store_index_name_numpy_str(tmp_path, table_format, setup_path, unit, tz): # GH #13492 - idx = DatetimeIndex( - [dt.date(2000, 1, 1), dt.date(2000, 1, 2)], - name="cols\u05d2", - ).tz_localize(tz) + idx = ( + DatetimeIndex( + [dt.date(2000, 1, 1), dt.date(2000, 1, 2)], + name="cols\u05d2", + ) + .tz_localize(tz) + .as_unit(unit) + ) idx1 = ( DatetimeIndex( [dt.date(2010, 1, 1), dt.date(2010, 1, 2)], diff --git a/pandas/tests/io/test_fsspec.py b/pandas/tests/io/test_fsspec.py index f6fb032b9d51a..c609ae999d47d 100644 --- a/pandas/tests/io/test_fsspec.py +++ b/pandas/tests/io/test_fsspec.py @@ -72,7 +72,9 @@ def test_read_csv(cleared_fs, df1): w.write(text) df2 = read_csv("memory://test/test.csv", parse_dates=["dt"]) - tm.assert_frame_equal(df1, df2) + expected = df1.copy() + expected["dt"] = expected["dt"].astype("M8[s]") + tm.assert_frame_equal(df2, expected) def test_reasonable_error(monkeypatch, cleared_fs): @@ -95,7 +97,9 @@ def test_to_csv(cleared_fs, df1): df2 = read_csv("memory://test/test.csv", parse_dates=["dt"], index_col=0) - tm.assert_frame_equal(df1, df2) + expected = df1.copy() + expected["dt"] = expected["dt"].astype("M8[s]") + tm.assert_frame_equal(df2, expected) def test_to_excel(cleared_fs, df1): @@ -106,7 +110,9 @@ def test_to_excel(cleared_fs, df1): df2 = read_excel(path, parse_dates=["dt"], index_col=0) - tm.assert_frame_equal(df1, df2) + expected = df1.copy() + expected["dt"] = expected["dt"].astype("M8[s]") + tm.assert_frame_equal(df2, expected) @pytest.mark.parametrize("binary_mode", [False, True]) @@ -128,7 +134,9 @@ def test_to_csv_fsspec_object(cleared_fs, binary_mode, df1): ) assert not fsspec_object.closed - tm.assert_frame_equal(df1, df2) + expected = df1.copy() + expected["dt"] = expected["dt"].astype("M8[s]") + tm.assert_frame_equal(df2, expected) def test_csv_options(fsspectest): diff --git a/pandas/tests/io/test_gcs.py b/pandas/tests/io/test_gcs.py index 4b2be41d0c9f9..17b89c9f31616 100644 --- a/pandas/tests/io/test_gcs.py +++ b/pandas/tests/io/test_gcs.py @@ -107,7 +107,11 @@ def from_uri(path): df1.to_markdown(path) df2 = df1 - tm.assert_frame_equal(df1, df2) + expected = df1[:] + if format in ["csv", "excel"]: + expected["dt"] = expected["dt"].dt.as_unit("s") + + tm.assert_frame_equal(df2, expected) def assert_equal_zip_safe(result: bytes, expected: bytes, compression: str): diff --git a/pandas/tests/io/test_html.py b/pandas/tests/io/test_html.py index 594c1d02b94cc..dfc9b4156ecab 100644 --- a/pandas/tests/io/test_html.py +++ b/pandas/tests/io/test_html.py @@ -1044,11 +1044,15 @@ def test_header_inferred_from_rows_with_only_th(self, flavor_read_html): def test_parse_dates_list(self, flavor_read_html): df = DataFrame({"date": date_range("1/1/2001", periods=10)}) - expected = df.to_html() - res = flavor_read_html(StringIO(expected), parse_dates=[1], index_col=0) - tm.assert_frame_equal(df, res[0]) - res = flavor_read_html(StringIO(expected), parse_dates=["date"], index_col=0) - tm.assert_frame_equal(df, res[0]) + + expected = df[:] + expected["date"] = expected["date"].dt.as_unit("s") + + str_df = df.to_html() + res = flavor_read_html(StringIO(str_df), parse_dates=[1], index_col=0) + tm.assert_frame_equal(expected, res[0]) + res = flavor_read_html(StringIO(str_df), parse_dates=["date"], index_col=0) + tm.assert_frame_equal(expected, res[0]) def test_wikipedia_states_table(self, datapath, flavor_read_html): data = datapath("io", "data", "html", "wikipedia_states.html") diff --git a/pandas/tests/io/test_orc.py b/pandas/tests/io/test_orc.py index de6d46492e916..c7d9300c0a638 100644 --- a/pandas/tests/io/test_orc.py +++ b/pandas/tests/io/test_orc.py @@ -321,6 +321,8 @@ def test_orc_dtype_backend_pyarrow(): ], } ) + # FIXME: without casting to ns we do not round-trip correctly + df["datetime_with_nat"] = df["datetime_with_nat"].astype("M8[ns]") bytes_data = df.copy().to_orc() result = read_orc(BytesIO(bytes_data), dtype_backend="pyarrow") diff --git a/pandas/tests/io/test_parquet.py b/pandas/tests/io/test_parquet.py index 2860b3a6483af..35275f3c23bef 100644 --- a/pandas/tests/io/test_parquet.py +++ b/pandas/tests/io/test_parquet.py @@ -670,6 +670,7 @@ def test_read_empty_array(self, pa, dtype): class TestParquetPyArrow(Base): + @pytest.mark.xfail(reason="datetime_with_nat unit doesn't round-trip") def test_basic(self, pa, df_full): df = df_full pytest.importorskip("pyarrow", "11.0.0") @@ -706,6 +707,14 @@ def test_to_bytes_without_path_or_buf_provided(self, pa, df_full): expected = df_full.copy() expected.loc[1, "string_with_nan"] = None + if pa_version_under11p0: + expected["datetime_with_nat"] = expected["datetime_with_nat"].astype( + "M8[ns]" + ) + else: + expected["datetime_with_nat"] = expected["datetime_with_nat"].astype( + "M8[ms]" + ) tm.assert_frame_equal(res, expected) def test_duplicate_columns(self, pa): @@ -961,7 +970,11 @@ def test_timezone_aware_index(self, request, pa, timezone_aware_date_list): # they both implement datetime.tzinfo # they both wrap datetime.timedelta() # this use-case sets the resolution to 1 minute - check_round_trip(df, pa, check_dtype=False) + + expected = df[:] + if pa_version_under11p0: + expected.index = expected.index.as_unit("ns") + check_round_trip(df, pa, check_dtype=False, expected=expected) def test_filter_row_groups(self, pa): # https://github.com/pandas-dev/pandas/issues/26551 @@ -988,13 +1001,14 @@ def test_read_dtype_backend_pyarrow_config(self, pa, df_full): if pa_version_under13p0: # pyarrow infers datetimes as us instead of ns expected["datetime"] = expected["datetime"].astype("timestamp[us][pyarrow]") - expected["datetime_with_nat"] = expected["datetime_with_nat"].astype( - "timestamp[us][pyarrow]" - ) expected["datetime_tz"] = expected["datetime_tz"].astype( pd.ArrowDtype(pyarrow.timestamp(unit="us", tz="Europe/Brussels")) ) + expected["datetime_with_nat"] = expected["datetime_with_nat"].astype( + "timestamp[ms][pyarrow]" + ) + check_round_trip( df, engine=pa, @@ -1018,6 +1032,7 @@ def test_read_dtype_backend_pyarrow_config_index(self, pa): expected=expected, ) + @pytest.mark.xfail(reason="pa.pandas_compat passes 'datetime64' to .astype") def test_columns_dtypes_not_invalid(self, pa): df = pd.DataFrame({"string": list("abc"), "int": list(range(1, 4))}) @@ -1107,9 +1122,11 @@ def test_infer_string_large_string_type(self, tmp_path, pa): # df.to_parquet(tmp_path / "test.parquet") # result = read_parquet(tmp_path / "test.parquet") # assert result["strings"].dtype == "string" + # FIXME: don't leave commented-out class TestParquetFastParquet(Base): + @pytest.mark.xfail(reason="datetime_with_nat gets incorrect values") def test_basic(self, fp, df_full): df = df_full @@ -1254,6 +1271,25 @@ def test_error_on_using_partition_cols_and_partition_on( partition_cols=partition_cols, ) + def test_empty_dataframe(self, fp): + # GH #27339 + df = pd.DataFrame() + expected = df.copy() + check_round_trip(df, fp, expected=expected) + + @pytest.mark.xfail( + reason="fastparquet passed mismatched values/dtype to DatetimeArray " + "constructor, see https://github.com/dask/fastparquet/issues/891" + ) + def test_timezone_aware_index(self, fp, timezone_aware_date_list): + idx = 5 * [timezone_aware_date_list] + + df = pd.DataFrame(index=idx, data={"index_as_col": idx}) + + expected = df.copy() + expected.index.name = "index" + check_round_trip(df, fp, expected=expected) + def test_close_file_handle_on_read_error(self): with tm.ensure_clean("test.parquet") as path: pathlib.Path(path).write_bytes(b"breakit") diff --git a/pandas/tests/io/test_sql.py b/pandas/tests/io/test_sql.py index 6058f34d25ad3..df821fb740af8 100644 --- a/pandas/tests/io/test_sql.py +++ b/pandas/tests/io/test_sql.py @@ -19,10 +19,7 @@ import pytest from pandas._libs import lib -from pandas.compat import ( - pa_version_under13p0, - pa_version_under14p1, -) +from pandas.compat import pa_version_under14p1 from pandas.compat._optional import import_optional_dependency import pandas.util._test_decorators as td @@ -368,7 +365,7 @@ def create_and_load_postgres_datetz(conn): Timestamp("2000-01-01 08:00:00", tz="UTC"), Timestamp("2000-06-01 07:00:00", tz="UTC"), ] - return Series(expected_data, name="DateColWithTz") + return Series(expected_data, name="DateColWithTz").astype("M8[us, UTC]") def check_iris_frame(frame: DataFrame): @@ -1824,7 +1821,7 @@ def test_api_custom_dateparsing_error( pytest.mark.xfail(reason="failing combination of arguments") ) - expected = types_data_frame.astype({"DateCol": "datetime64[ns]"}) + expected = types_data_frame.astype({"DateCol": "datetime64[s]"}) result = read_sql( text, @@ -1847,10 +1844,12 @@ def test_api_custom_dateparsing_error( } ) - if not pa_version_under13p0: - # TODO: is this astype safe? - expected["DateCol"] = expected["DateCol"].astype("datetime64[us]") - + if conn_name == "postgresql_adbc_types" and pa_version_under14p1: + expected["DateCol"] = expected["DateCol"].astype("datetime64[ns]") + elif "postgres" in conn_name or "mysql" in conn_name: + expected["DateCol"] = expected["DateCol"].astype("datetime64[us]") + else: + expected["DateCol"] = expected["DateCol"].astype("datetime64[s]") tm.assert_frame_equal(result, expected) @@ -2835,7 +2834,9 @@ def test_datetime_with_timezone_table(conn, request): conn = request.getfixturevalue(conn) expected = create_and_load_postgres_datetz(conn) result = sql.read_sql_table("datetz", conn) - tm.assert_frame_equal(result, expected.to_frame()) + + exp_frame = expected.to_frame() + tm.assert_frame_equal(result, exp_frame) @pytest.mark.parametrize("conn", sqlalchemy_connectable) @@ -2847,7 +2848,7 @@ def test_datetime_with_timezone_roundtrip(conn, request): # For dbs that support timestamps with timezones, should get back UTC # otherwise naive data should be returned expected = DataFrame( - {"A": date_range("2013-01-01 09:00:00", periods=3, tz="US/Pacific")} + {"A": date_range("2013-01-01 09:00:00", periods=3, tz="US/Pacific", unit="us")} ) assert expected.to_sql(name="test_datetime_tz", con=conn, index=False) == 3 @@ -2865,7 +2866,7 @@ def test_datetime_with_timezone_roundtrip(conn, request): if "sqlite" in conn_name: # read_sql_query does not return datetime type like read_sql_table assert isinstance(result.loc[0, "A"], str) - result["A"] = to_datetime(result["A"]) + result["A"] = to_datetime(result["A"]).dt.as_unit("us") tm.assert_frame_equal(result, expected) @@ -2876,7 +2877,9 @@ def test_out_of_bounds_datetime(conn, request): data = DataFrame({"date": datetime(9999, 1, 1)}, index=[0]) assert data.to_sql(name="test_datetime_obb", con=conn, index=False) == 1 result = sql.read_sql_table("test_datetime_obb", conn) - expected = DataFrame([pd.NaT], columns=["date"]) + expected = DataFrame( + np.array([datetime(9999, 1, 1)], dtype="M8[us]"), columns=["date"] + ) tm.assert_frame_equal(result, expected) @@ -2885,7 +2888,7 @@ def test_naive_datetimeindex_roundtrip(conn, request): # GH 23510 # Ensure that a naive DatetimeIndex isn't converted to UTC conn = request.getfixturevalue(conn) - dates = date_range("2018-01-01", periods=5, freq="6h")._with_freq(None) + dates = date_range("2018-01-01", periods=5, freq="6h", unit="us")._with_freq(None) expected = DataFrame({"nums": range(5)}, index=dates) assert expected.to_sql(name="foo_table", con=conn, index_label="info_date") == 5 result = sql.read_sql_table("foo_table", conn, index_col="info_date") @@ -2937,7 +2940,10 @@ def test_datetime(conn, request): # with read_table -> type information from schema used result = sql.read_sql_table("test_datetime", conn) result = result.drop("index", axis=1) - tm.assert_frame_equal(result, df) + + expected = df[:] + expected["A"] = expected["A"].astype("M8[us]") + tm.assert_frame_equal(result, expected) # with read_sql -> no type information -> sqlite has no native result = sql.read_sql_query("SELECT * FROM test_datetime", conn) @@ -2945,9 +2951,7 @@ def test_datetime(conn, request): if "sqlite" in conn_name: assert isinstance(result.loc[0, "A"], str) result["A"] = to_datetime(result["A"]) - tm.assert_frame_equal(result, df) - else: - tm.assert_frame_equal(result, df) + tm.assert_frame_equal(result, expected) @pytest.mark.parametrize("conn", sqlalchemy_connectable) @@ -2962,16 +2966,17 @@ def test_datetime_NaT(conn, request): # with read_table -> type information from schema used result = sql.read_sql_table("test_datetime", conn) - tm.assert_frame_equal(result, df) + expected = df[:] + expected["A"] = expected["A"].astype("M8[us]") + tm.assert_frame_equal(result, expected) # with read_sql -> no type information -> sqlite has no native result = sql.read_sql_query("SELECT * FROM test_datetime", conn) if "sqlite" in conn_name: assert isinstance(result.loc[0, "A"], str) result["A"] = to_datetime(result["A"], errors="coerce") - tm.assert_frame_equal(result, df) - else: - tm.assert_frame_equal(result, df) + + tm.assert_frame_equal(result, expected) @pytest.mark.parametrize("conn", sqlalchemy_connectable) @@ -3963,6 +3968,7 @@ def test_self_join_date_columns(postgresql_psycopg2_engine): expected = DataFrame( [[1, Timestamp("2021", tz="UTC")] * 2], columns=["id", "created_dt"] * 2 ) + expected["created_dt"] = expected["created_dt"].astype("M8[us, UTC]") tm.assert_frame_equal(result, expected) # Cleanup diff --git a/pandas/tests/io/test_stata.py b/pandas/tests/io/test_stata.py index 2f981953a6237..d5134a3e3afd0 100644 --- a/pandas/tests/io/test_stata.py +++ b/pandas/tests/io/test_stata.py @@ -181,9 +181,7 @@ def test_read_dta2(self, datapath): expected["monthly_date"] = expected["monthly_date"].astype("M8[s]") expected["quarterly_date"] = expected["quarterly_date"].astype("M8[s]") expected["half_yearly_date"] = expected["half_yearly_date"].astype("M8[s]") - expected["yearly_date"] = ( - expected["yearly_date"].astype("Period[s]").array.view("M8[s]") - ) + expected["yearly_date"] = expected["yearly_date"].astype("M8[s]") path1 = datapath("io", "data", "stata", "stata2_114.dta") path2 = datapath("io", "data", "stata", "stata2_115.dta") @@ -206,9 +204,9 @@ def test_read_dta2(self, datapath): # buggy test because of the NaT comparison on certain platforms # Format 113 test fails since it does not support tc and tC formats # tm.assert_frame_equal(parsed_113, expected) - tm.assert_frame_equal(parsed_114, expected, check_datetimelike_compat=True) - tm.assert_frame_equal(parsed_115, expected, check_datetimelike_compat=True) - tm.assert_frame_equal(parsed_117, expected, check_datetimelike_compat=True) + tm.assert_frame_equal(parsed_114, expected) + tm.assert_frame_equal(parsed_115, expected) + tm.assert_frame_equal(parsed_117, expected) @pytest.mark.parametrize( "file", ["stata3_113", "stata3_114", "stata3_115", "stata3_117"] @@ -952,8 +950,8 @@ def test_big_dates(self, datapath, temp_file): parsed_115 = read_stata(datapath("io", "data", "stata", "stata9_115.dta")) parsed_117 = read_stata(datapath("io", "data", "stata", "stata9_117.dta")) - tm.assert_frame_equal(expected, parsed_115, check_datetimelike_compat=True) - tm.assert_frame_equal(expected, parsed_117, check_datetimelike_compat=True) + tm.assert_frame_equal(expected, parsed_115) + tm.assert_frame_equal(expected, parsed_117) date_conversion = {c: c[-2:] for c in columns} # {c : c[-2:] for c in columns} @@ -965,7 +963,6 @@ def test_big_dates(self, datapath, temp_file): tm.assert_frame_equal( written_and_read_again.set_index("index"), expected.set_index(expected.index.astype(np.int32)), - check_datetimelike_compat=True, ) def test_dtype_conversion(self, datapath): @@ -1252,7 +1249,9 @@ def test_read_chunks_117( from_frame = parsed.iloc[pos : pos + chunksize, :].copy() from_frame = self._convert_categorical(from_frame) tm.assert_frame_equal( - from_frame, chunk, check_dtype=False, check_datetimelike_compat=True + from_frame, + chunk, + check_dtype=False, ) pos += chunksize @@ -1344,7 +1343,9 @@ def test_read_chunks_115( from_frame = parsed.iloc[pos : pos + chunksize, :].copy() from_frame = self._convert_categorical(from_frame) tm.assert_frame_equal( - from_frame, chunk, check_dtype=False, check_datetimelike_compat=True + from_frame, + chunk, + check_dtype=False, ) pos += chunksize diff --git a/pandas/tests/resample/test_base.py b/pandas/tests/resample/test_base.py index 3428abacd509e..f4ea6b1d3f3de 100644 --- a/pandas/tests/resample/test_base.py +++ b/pandas/tests/resample/test_base.py @@ -557,7 +557,8 @@ def test_first_last_skipna(any_real_nullable_dtype, skipna, how): method = getattr(rs, how) result = method(skipna=skipna) - gb = df.groupby(df.shape[0] * [pd.to_datetime("2020-01-31")]) + ts = pd.to_datetime("2020-01-31").as_unit("ns") + gb = df.groupby(df.shape[0] * [ts]) expected = getattr(gb, how)(skipna=skipna) expected.index.freq = "ME" tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/resample/test_time_grouper.py b/pandas/tests/resample/test_time_grouper.py index 5f5a54c4d92a3..2646106b9b97c 100644 --- a/pandas/tests/resample/test_time_grouper.py +++ b/pandas/tests/resample/test_time_grouper.py @@ -421,11 +421,13 @@ def test_groupby_resample_interpolate_with_apply_syntax_off_grid(groupy_test_df) ) volume = [50, 50, 60] - week_starting = [ - Timestamp("2018-01-07"), - Timestamp("2018-01-18 01:00:00"), - Timestamp("2018-01-14"), - ] + week_starting = pd.DatetimeIndex( + [ + Timestamp("2018-01-07"), + Timestamp("2018-01-18 01:00:00"), + Timestamp("2018-01-14"), + ] + ).as_unit("ns") expected_ind = pd.MultiIndex.from_arrays( [volume, week_starting], names=["volume", "week_starting"], diff --git a/pandas/tests/reshape/concat/test_append_common.py b/pandas/tests/reshape/concat/test_append_common.py index c831cb8293943..afafe8f6ab264 100644 --- a/pandas/tests/reshape/concat/test_append_common.py +++ b/pandas/tests/reshape/concat/test_append_common.py @@ -19,12 +19,12 @@ "float64": [1.1, np.nan, 3.3], "category": Categorical(["X", "Y", "Z"]), "object": ["a", "b", "c"], - "datetime64[ns]": [ + "datetime64[s]": [ pd.Timestamp("2011-01-01"), pd.Timestamp("2011-01-02"), pd.Timestamp("2011-01-03"), ], - "datetime64[ns, US/Eastern]": [ + "datetime64[s, US/Eastern]": [ pd.Timestamp("2011-01-01", tz="US/Eastern"), pd.Timestamp("2011-01-02", tz="US/Eastern"), pd.Timestamp("2011-01-03", tz="US/Eastern"), diff --git a/pandas/tests/reshape/concat/test_datetimes.py b/pandas/tests/reshape/concat/test_datetimes.py index 3e046b2df72d8..89a3c3c5ed8bc 100644 --- a/pandas/tests/reshape/concat/test_datetimes.py +++ b/pandas/tests/reshape/concat/test_datetimes.py @@ -213,7 +213,7 @@ def test_concat_NaT_dataframes(self, tz): @pytest.mark.parametrize("tz1", [None, "UTC"]) @pytest.mark.parametrize("tz2", [None, "UTC"]) - @pytest.mark.parametrize("item", [pd.NaT, Timestamp("20150101")]) + @pytest.mark.parametrize("item", [pd.NaT, Timestamp("20150101").as_unit("ns")]) def test_concat_NaT_dataframes_all_NaT_axis_0(self, tz1, tz2, item): # GH 12396 @@ -358,7 +358,7 @@ def test_concat_tz_series_tzlocal(self): result = concat([Series(x), Series(y)], ignore_index=True) tm.assert_series_equal(result, Series(x + y)) - assert result.dtype == "datetime64[ns, tzlocal()]" + assert result.dtype == "datetime64[s, tzlocal()]" def test_concat_tz_series_with_datetimelike(self): # see gh-12620: tz and timedelta diff --git a/pandas/tests/reshape/test_cut.py b/pandas/tests/reshape/test_cut.py index 340c5c449aea7..d8bb4fba1e1fe 100644 --- a/pandas/tests/reshape/test_cut.py +++ b/pandas/tests/reshape/test_cut.py @@ -1,3 +1,5 @@ +from datetime import datetime + import numpy as np import pytest @@ -445,10 +447,16 @@ def test_datetime_bin(conv): Interval(Timestamp(bin_data[1]), Timestamp(bin_data[2])), ] ) - ).astype(CategoricalDtype(ordered=True)) + ) bins = [conv(v) for v in bin_data] result = Series(cut(data, bins=bins)) + + if type(bins[0]) is datetime: + # The bins have microsecond dtype -> so does result + expected = expected.astype("interval[datetime64[us]]") + + expected = expected.astype(CategoricalDtype(ordered=True)) tm.assert_series_equal(result, expected) @@ -461,10 +469,6 @@ def test_datetime_cut(unit, box): data = box(data) result, _ = cut(data, 3, retbins=True) - if box is list: - # We don't (yet) do inference on these, so get nanos - unit = "ns" - if unit == "s": # See https://github.com/pandas-dev/pandas/pull/56101#discussion_r1405325425 # for why we round to 8 seconds instead of 7 @@ -531,24 +535,26 @@ def test_datetime_tz_cut(bins, box): bins = box(bins) result = cut(ser, bins) - expected = Series( - IntervalIndex( - [ - Interval( - Timestamp("2012-12-31 23:57:07.200000", tz=tz), - Timestamp("2013-01-01 16:00:00", tz=tz), - ), - Interval( - Timestamp("2013-01-01 16:00:00", tz=tz), - Timestamp("2013-01-02 08:00:00", tz=tz), - ), - Interval( - Timestamp("2013-01-02 08:00:00", tz=tz), - Timestamp("2013-01-03 00:00:00", tz=tz), - ), - ] - ) - ).astype(CategoricalDtype(ordered=True)) + ii = IntervalIndex( + [ + Interval( + Timestamp("2012-12-31 23:57:07.200000", tz=tz), + Timestamp("2013-01-01 16:00:00", tz=tz), + ), + Interval( + Timestamp("2013-01-01 16:00:00", tz=tz), + Timestamp("2013-01-02 08:00:00", tz=tz), + ), + Interval( + Timestamp("2013-01-02 08:00:00", tz=tz), + Timestamp("2013-01-03 00:00:00", tz=tz), + ), + ] + ) + if isinstance(bins, int): + # the dtype is inferred from ser, which has nanosecond unit + ii = ii.astype("interval[datetime64[ns, US/Eastern]]") + expected = Series(ii).astype(CategoricalDtype(ordered=True)) tm.assert_series_equal(result, expected) diff --git a/pandas/tests/reshape/test_qcut.py b/pandas/tests/reshape/test_qcut.py index 53af673e0f7b0..5f769db7f8acf 100644 --- a/pandas/tests/reshape/test_qcut.py +++ b/pandas/tests/reshape/test_qcut.py @@ -271,8 +271,10 @@ def test_datetime_tz_qcut(bins): ], ], ) -def test_date_like_qcut_bins(arg, expected_bins): +def test_date_like_qcut_bins(arg, expected_bins, unit): # see gh-19891 + arg = arg.as_unit(unit) + expected_bins = expected_bins.as_unit(unit) ser = Series(arg) result, result_bins = qcut(ser, 2, retbins=True) tm.assert_index_equal(result_bins, expected_bins) diff --git a/pandas/tests/scalar/test_nat.py b/pandas/tests/scalar/test_nat.py index e352e2601cef3..131be7a77f2e5 100644 --- a/pandas/tests/scalar/test_nat.py +++ b/pandas/tests/scalar/test_nat.py @@ -439,8 +439,10 @@ def test_nat_rfloordiv_timedelta(val, expected): @pytest.mark.parametrize( "value", [ - DatetimeIndex(["2011-01-01", "2011-01-02"], name="x"), - DatetimeIndex(["2011-01-01", "2011-01-02"], tz="US/Eastern", name="x"), + DatetimeIndex(["2011-01-01", "2011-01-02"], dtype="M8[ns]", name="x"), + DatetimeIndex( + ["2011-01-01", "2011-01-02"], dtype="M8[ns, US/Eastern]", name="x" + ), DatetimeArray._from_sequence(["2011-01-01", "2011-01-02"], dtype="M8[ns]"), DatetimeArray._from_sequence( ["2011-01-01", "2011-01-02"], dtype=DatetimeTZDtype(tz="US/Pacific") diff --git a/pandas/tests/series/methods/test_combine_first.py b/pandas/tests/series/methods/test_combine_first.py index 0f2f533c8feff..293919173c2d5 100644 --- a/pandas/tests/series/methods/test_combine_first.py +++ b/pandas/tests/series/methods/test_combine_first.py @@ -78,7 +78,7 @@ def test_combine_first_dt64(self, unit): s1 = Series([np.nan, "2011"]) rs = s0.combine_first(s1) - xp = Series([datetime(2010, 1, 1), "2011"], dtype="datetime64[ns]") + xp = Series([datetime(2010, 1, 1), "2011"], dtype=f"datetime64[{unit}]") tm.assert_series_equal(rs, xp) diff --git a/pandas/tests/series/methods/test_fillna.py b/pandas/tests/series/methods/test_fillna.py index 592dba253532d..c10bb8278a3d1 100644 --- a/pandas/tests/series/methods/test_fillna.py +++ b/pandas/tests/series/methods/test_fillna.py @@ -411,7 +411,7 @@ def test_datetime64_tz_fillna(self, tz, unit): Timestamp("2011-01-02 10:00", tz=tz), Timestamp("2011-01-03 10:00"), Timestamp("2011-01-02 10:00", tz=tz), - ] + ], ) tm.assert_series_equal(expected, result) tm.assert_series_equal(isna(ser), null_loc) diff --git a/pandas/tests/series/methods/test_to_csv.py b/pandas/tests/series/methods/test_to_csv.py index f7dec02ab0e5b..488d0cb9fe9da 100644 --- a/pandas/tests/series/methods/test_to_csv.py +++ b/pandas/tests/series/methods/test_to_csv.py @@ -31,7 +31,9 @@ def test_from_csv(self, datetime_series, string_series, temp_file): path = temp_file datetime_series.to_csv(path, header=False) ts = self.read_csv(path, parse_dates=True) - tm.assert_series_equal(datetime_series, ts, check_names=False) + expected = datetime_series.copy() + expected.index = expected.index.as_unit("s") + tm.assert_series_equal(expected, ts, check_names=False) assert ts.name is None assert ts.index.name is None @@ -57,6 +59,7 @@ def test_from_csv(self, datetime_series, string_series, temp_file): series = self.read_csv(path, sep="|", parse_dates=True) check_series = Series({datetime(1998, 1, 1): 1.0, datetime(1999, 1, 1): 2.0}) + check_series.index = check_series.index.as_unit("s") tm.assert_series_equal(check_series, series) series = self.read_csv(path, sep="|", parse_dates=False) diff --git a/pandas/tests/series/test_constructors.py b/pandas/tests/series/test_constructors.py index 3f9d5bbe806bb..00c614cf72c20 100644 --- a/pandas/tests/series/test_constructors.py +++ b/pandas/tests/series/test_constructors.py @@ -752,7 +752,7 @@ def test_constructor_pass_nan_nat(self): tm.assert_series_equal(Series(np.array([np.nan, np.nan])), exp) exp = Series([NaT, NaT]) - assert exp.dtype == "datetime64[ns]" + assert exp.dtype == "datetime64[s]" tm.assert_series_equal(Series([NaT, NaT]), exp) tm.assert_series_equal(Series(np.array([NaT, NaT])), exp) @@ -934,7 +934,7 @@ def test_constructor_datetimes_with_nulls(self): np.array([None, None, datetime.now(), None]), ]: result = Series(arr) - assert result.dtype == "M8[ns]" + assert result.dtype == "M8[us]" def test_constructor_dtype_datetime64(self): s = Series(iNaT, dtype="M8[ns]", index=range(5)) @@ -962,15 +962,15 @@ def test_constructor_dtype_datetime64_10(self): dates = [np.datetime64(x) for x in pydates] ser = Series(dates) - assert ser.dtype == "M8[ns]" + assert ser.dtype == "M8[us]" ser.iloc[0] = np.nan - assert ser.dtype == "M8[ns]" + assert ser.dtype == "M8[us]" # GH3414 related expected = Series(pydates, dtype="datetime64[ms]") - result = Series(Series(dates).astype(np.int64) / 1000000, dtype="M8[ms]") + result = Series(Series(dates).astype(np.int64) / 1000, dtype="M8[ms]") tm.assert_series_equal(result, expected) result = Series(dates, dtype="datetime64[ms]") @@ -1084,16 +1084,16 @@ def test_constructor_dtype_datetime64_4(self): def test_constructor_dtype_datetime64_3(self): # if we passed a NaT it remains ser = Series([datetime(2010, 1, 1), datetime(2, 1, 1), NaT]) - assert ser.dtype == "object" + assert ser.dtype == "M8[us]" assert ser[2] is NaT assert "NaT" in str(ser) def test_constructor_dtype_datetime64_2(self): # if we passed a nan it remains ser = Series([datetime(2010, 1, 1), datetime(2, 1, 1), np.nan]) - assert ser.dtype == "object" - assert ser[2] is np.nan - assert "NaN" in str(ser) + assert ser.dtype == "M8[us]" + assert ser[2] is NaT + assert "NaT" in str(ser) def test_constructor_with_datetime_tz(self): # 8260 @@ -1155,7 +1155,7 @@ def test_constructor_with_datetime_tz4(self): Timestamp("2013-01-02 14:00:00-0800", tz="US/Pacific"), ] ) - assert ser.dtype == "datetime64[ns, US/Pacific]" + assert ser.dtype == "datetime64[s, US/Pacific]" assert lib.infer_dtype(ser, skipna=True) == "datetime64" def test_constructor_with_datetime_tz3(self): @@ -1215,7 +1215,7 @@ def test_construction_to_datetimelike_unit(self, arr_dtype, kind, unit): def test_constructor_with_naive_string_and_datetimetz_dtype(self, arg): # GH 17415: With naive string result = Series([arg], dtype="datetime64[ns, CET]") - expected = Series(Timestamp(arg)).dt.tz_localize("CET") + expected = Series([Timestamp(arg)], dtype="M8[ns]").dt.tz_localize("CET") tm.assert_series_equal(result, expected) def test_constructor_datetime64_bigendian(self): @@ -1356,14 +1356,8 @@ def test_constructor_dict_order(self): expected = Series([1, 0, 2], index=list("bac")) tm.assert_series_equal(result, expected) - def test_constructor_dict_extension(self, ea_scalar_and_dtype, request): + def test_constructor_dict_extension(self, ea_scalar_and_dtype): ea_scalar, ea_dtype = ea_scalar_and_dtype - if isinstance(ea_scalar, Timestamp): - mark = pytest.mark.xfail( - reason="Construction from dict goes through " - "maybe_convert_objects which casts to nano" - ) - request.applymarker(mark) d = {"a": ea_scalar} result = Series(d, index=["a"]) expected = Series(ea_scalar, index=["a"], dtype=ea_dtype) @@ -1408,7 +1402,9 @@ def create_data(constructor): result_Timestamp = Series(data_Timestamp) tm.assert_series_equal(result_datetime64, expected) - tm.assert_series_equal(result_datetime, expected) + tm.assert_series_equal( + result_datetime, expected.set_axis(expected.index.as_unit("us")) + ) tm.assert_series_equal(result_Timestamp, expected) def test_constructor_dict_tuple_indexer(self): diff --git a/pandas/tests/test_algos.py b/pandas/tests/test_algos.py index 6da6ad27f853f..134ebededd163 100644 --- a/pandas/tests/test_algos.py +++ b/pandas/tests/test_algos.py @@ -1264,6 +1264,7 @@ def test_value_counts_datetime_outofbounds(self, dtype): ], dtype=dtype, ) + res = ser.value_counts() exp_index = Index( diff --git a/pandas/tests/tools/test_to_datetime.py b/pandas/tests/tools/test_to_datetime.py index b05c30fa50fbe..cbbd018720bad 100644 --- a/pandas/tests/tools/test_to_datetime.py +++ b/pandas/tests/tools/test_to_datetime.py @@ -117,7 +117,9 @@ def test_to_datetime_format_YYYYMMDD_with_nat(self, cache): ser = Series([19801222, 19801222] + [19810105] * 5, dtype="float") # with NaT expected = Series( - [Timestamp("19801222"), Timestamp("19801222")] + [Timestamp("19810105")] * 5 + [Timestamp("19801222"), Timestamp("19801222")] + + [Timestamp("19810105")] * 5, + dtype="M8[s]", ) expected[2] = np.nan ser[2] = np.nan @@ -143,19 +145,32 @@ def test_to_datetime_format_YYYYMM_with_nat(self, cache): # Explicit cast to float to explicit cast when setting np.nan ser = Series([198012, 198012] + [198101] * 5, dtype="float") expected = Series( - [Timestamp("19801201"), Timestamp("19801201")] + [Timestamp("19810101")] * 5 + [Timestamp("19801201"), Timestamp("19801201")] + + [Timestamp("19810101")] * 5, + dtype="M8[s]", ) expected[2] = np.nan ser[2] = np.nan result = to_datetime(ser, format="%Y%m", cache=cache) tm.assert_series_equal(result, expected) + def test_to_datetime_format_YYYYMMDD_oob_for_ns(self, cache): + # coercion + # GH 7930, GH 14487 + ser = Series([20121231, 20141231, 99991231]) + result = to_datetime(ser, format="%Y%m%d", errors="raise", cache=cache) + expected = Series( + np.array(["2012-12-31", "2014-12-31", "9999-12-31"], dtype="M8[s]"), + dtype="M8[s]", + ) + tm.assert_series_equal(result, expected) + def test_to_datetime_format_YYYYMMDD_coercion(self, cache): # coercion # GH 7930 - ser = Series([20121231, 20141231, 99991231]) + ser = Series([20121231, 20141231, 999999999999999999999999999991231]) result = to_datetime(ser, format="%Y%m%d", errors="coerce", cache=cache) - expected = Series(["20121231", "20141231", "NaT"], dtype="M8[ns]") + expected = Series(["20121231", "20141231", "NaT"], dtype="M8[s]") tm.assert_series_equal(result, expected) @pytest.mark.parametrize( @@ -532,7 +547,8 @@ def test_to_datetime_overflow(self): res = to_datetime(arg, errors="coerce") assert res is NaT res = to_datetime([arg], errors="coerce") - tm.assert_index_equal(res, Index([NaT])) + exp = Index([NaT], dtype="M8[s]") + tm.assert_index_equal(res, exp) def test_to_datetime_mixed_datetime_and_string(self): # GH#47018 adapted old doctest with new behavior @@ -563,7 +579,7 @@ def test_to_datetime_mixed_date_and_string(self, format): # https://github.com/pandas-dev/pandas/issues/50108 d1 = date(2020, 1, 2) res = to_datetime(["2020-01-01", d1], format=format) - expected = DatetimeIndex(["2020-01-01", "2020-01-02"], dtype="M8[ns]") + expected = DatetimeIndex(["2020-01-01", "2020-01-02"], dtype="M8[s]") tm.assert_index_equal(res, expected) @pytest.mark.parametrize( @@ -579,7 +595,7 @@ def test_to_datetime_mixed_date_and_string(self, format): ["2000-01-01 01:00:00-08:00", "2000-01-01 02:00:00-08:00"], DatetimeIndex( ["2000-01-01 09:00:00+00:00", "2000-01-01 10:00:00+00:00"], - dtype="datetime64[ns, UTC]", + dtype="datetime64[us, UTC]", ), id="all tz-aware, with utc", ), @@ -588,7 +604,7 @@ def test_to_datetime_mixed_date_and_string(self, format): ["2000-01-01 01:00:00+00:00", "2000-01-01 02:00:00+00:00"], DatetimeIndex( ["2000-01-01 01:00:00+00:00", "2000-01-01 02:00:00+00:00"], - ), + ).as_unit("us"), id="all tz-aware, without utc", ), pytest.param( @@ -596,7 +612,7 @@ def test_to_datetime_mixed_date_and_string(self, format): ["2000-01-01 01:00:00-08:00", "2000-01-01 02:00:00+00:00"], DatetimeIndex( ["2000-01-01 09:00:00+00:00", "2000-01-01 02:00:00+00:00"], - dtype="datetime64[ns, UTC]", + dtype="datetime64[us, UTC]", ), id="all tz-aware, mixed offsets, with utc", ), @@ -605,7 +621,7 @@ def test_to_datetime_mixed_date_and_string(self, format): ["2000-01-01 01:00:00", "2000-01-01 02:00:00+00:00"], DatetimeIndex( ["2000-01-01 01:00:00+00:00", "2000-01-01 02:00:00+00:00"], - dtype="datetime64[ns, UTC]", + dtype="datetime64[us, UTC]", ), id="tz-aware string, naive pydatetime, with utc", ), @@ -625,6 +641,8 @@ def test_to_datetime_mixed_datetime_and_string_with_format( ts1 = constructor(args[0]) ts2 = args[1] result = to_datetime([ts1, ts2], format=fmt, utc=utc) + if constructor is Timestamp: + expected = expected.as_unit("s") tm.assert_index_equal(result, expected) @pytest.mark.parametrize( @@ -696,7 +714,7 @@ def test_to_datetime_mixed_offsets_with_none_tz_utc_false_removed( "%Y-%m-%d %H:%M:%S%z", DatetimeIndex( ["2000-01-01 08:00:00+00:00", "2000-01-02 00:00:00+00:00", "NaT"], - dtype="datetime64[ns, UTC]", + dtype="datetime64[s, UTC]", ), id="ISO8601, UTC", ), @@ -704,7 +722,7 @@ def test_to_datetime_mixed_offsets_with_none_tz_utc_false_removed( "%Y-%d-%m %H:%M:%S%z", DatetimeIndex( ["2000-01-01 08:00:00+00:00", "2000-02-01 00:00:00+00:00", "NaT"], - dtype="datetime64[ns, UTC]", + dtype="datetime64[s, UTC]", ), id="non-ISO8601, UTC", ), @@ -965,7 +983,7 @@ def test_to_datetime_now(self): # See GH#18666 with tm.set_timezone("US/Eastern"): # GH#18705 - now = Timestamp("now").as_unit("ns") + now = Timestamp("now") pdnow = to_datetime("now") pdnow2 = to_datetime(["now"])[0] @@ -988,12 +1006,12 @@ def test_to_datetime_today(self, tz): # this both of these timezones _and_ UTC will all be in the same day, # so this test will not detect the regression introduced in #18666. with tm.set_timezone(tz): - nptoday = np.datetime64("today").astype("datetime64[ns]").astype(np.int64) + nptoday = np.datetime64("today").astype("datetime64[us]").astype(np.int64) pdtoday = to_datetime("today") pdtoday2 = to_datetime(["today"])[0] - tstoday = Timestamp("today").as_unit("ns") - tstoday2 = Timestamp.today().as_unit("ns") + tstoday = Timestamp("today") + tstoday2 = Timestamp.today() # These should all be equal with infinite perf; this gives # a generous margin of 10 seconds @@ -1030,7 +1048,7 @@ def test_to_datetime_now_with_format(self, format, expected_ds, string, attribut # https://github.com/pandas-dev/pandas/issues/50359 result = to_datetime(["2020-01-03 00:00:00Z", string], format=format, utc=True) expected = DatetimeIndex( - [expected_ds, getattr(Timestamp, attribute)()], dtype="datetime64[ns, UTC]" + [expected_ds, getattr(Timestamp, attribute)()], dtype="datetime64[s, UTC]" ) assert (expected - result).max().total_seconds() < 1 @@ -1091,11 +1109,7 @@ def test_to_datetime_array_of_dt64s(self, cache, unit): # Assuming all datetimes are in bounds, to_datetime() returns # an array that is equal to Timestamp() parsing result = to_datetime(dts, cache=cache) - if cache: - # FIXME: behavior should not depend on cache - expected = DatetimeIndex([Timestamp(x).asm8 for x in dts], dtype="M8[s]") - else: - expected = DatetimeIndex([Timestamp(x).asm8 for x in dts], dtype="M8[ns]") + expected = DatetimeIndex([Timestamp(x).asm8 for x in dts], dtype="M8[s]") tm.assert_index_equal(result, expected) @@ -1106,14 +1120,7 @@ def test_to_datetime_array_of_dt64s(self, cache, unit): to_datetime(dts_with_oob, errors="raise") result = to_datetime(dts_with_oob, errors="coerce", cache=cache) - if not cache: - # FIXME: shouldn't depend on cache! - expected = DatetimeIndex( - [Timestamp(dts_with_oob[0]).asm8, Timestamp(dts_with_oob[1]).asm8] * 30 - + [NaT], - ) - else: - expected = DatetimeIndex(np.array(dts_with_oob, dtype="M8[s]")) + expected = DatetimeIndex(np.array(dts_with_oob, dtype="M8[s]")) tm.assert_index_equal(result, expected) def test_to_datetime_tz(self, cache): @@ -1126,7 +1133,7 @@ def test_to_datetime_tz(self, cache): result = to_datetime(arr, cache=cache) expected = DatetimeIndex( ["2013-01-01 13:00:00", "2013-01-02 14:00:00"], tz="US/Pacific" - ) + ).as_unit("s") tm.assert_index_equal(result, expected) def test_to_datetime_tz_mixed(self, cache): @@ -1145,7 +1152,7 @@ def test_to_datetime_tz_mixed(self, cache): result = to_datetime(arr, cache=cache, errors="coerce") expected = DatetimeIndex( - ["2013-01-01 13:00:00-08:00", "NaT"], dtype="datetime64[ns, US/Pacific]" + ["2013-01-01 13:00:00-08:00", "NaT"], dtype="datetime64[s, US/Pacific]" ) tm.assert_index_equal(result, expected) @@ -1177,7 +1184,7 @@ def test_to_datetime_tz_pytz(self, cache): result = to_datetime(arr, utc=True, cache=cache) expected = DatetimeIndex( ["2000-01-01 08:00:00+00:00", "2000-06-01 07:00:00+00:00"], - dtype="datetime64[ns, UTC]", + dtype="datetime64[us, UTC]", freq=None, ) tm.assert_index_equal(result, expected) @@ -1264,7 +1271,7 @@ def test_to_datetime_tz_psycopg2(self, request, cache): result = to_datetime(arr, errors="coerce", utc=True, cache=cache) expected = DatetimeIndex( ["2000-01-01 08:00:00+00:00", "2000-06-01 07:00:00+00:00"], - dtype="datetime64[ns, UTC]", + dtype="datetime64[us, UTC]", freq=None, ) tm.assert_index_equal(result, expected) @@ -1273,15 +1280,15 @@ def test_to_datetime_tz_psycopg2(self, request, cache): i = DatetimeIndex( ["2000-01-01 08:00:00"], tz=psycopg2_tz.FixedOffsetTimezone(offset=-300, name=None), - ) - assert is_datetime64_ns_dtype(i) + ).as_unit("us") + assert not is_datetime64_ns_dtype(i) # tz coercion result = to_datetime(i, errors="coerce", cache=cache) tm.assert_index_equal(result, i) result = to_datetime(i, errors="coerce", utc=True, cache=cache) - expected = DatetimeIndex(["2000-01-01 13:00:00"], dtype="datetime64[ns, UTC]") + expected = DatetimeIndex(["2000-01-01 13:00:00"], dtype="datetime64[us, UTC]") tm.assert_index_equal(result, expected) @pytest.mark.parametrize("arg", [True, False]) @@ -1351,16 +1358,20 @@ def test_datetime_invalid_scalar(self, value, format): def test_datetime_outofbounds_scalar(self, value, format): # GH24763 res = to_datetime(value, errors="coerce", format=format) - assert res is NaT + if format is None: + assert isinstance(res, Timestamp) + assert res == Timestamp(value) + else: + assert res is NaT if format is not None: msg = r'^time data ".*" doesn\'t match format ".*", at position 0.' with pytest.raises(ValueError, match=msg): to_datetime(value, errors="raise", format=format) else: - msg = "^Out of bounds .*, at position 0$" - with pytest.raises(OutOfBoundsDatetime, match=msg): - to_datetime(value, errors="raise", format=format) + res = to_datetime(value, errors="raise", format=format) + assert isinstance(res, Timestamp) + assert res == Timestamp(value) @pytest.mark.parametrize( ("values"), [(["a"]), (["00:01:99"]), (["a", "b", "99:00:00"])] @@ -1433,15 +1444,17 @@ def test_to_datetime_cache_scalar(self): assert result == expected @pytest.mark.parametrize( - "datetimelikes,expected_values", + "datetimelikes,expected_values,exp_unit", ( ( (None, np.nan) + (NaT,) * start_caching_at, (NaT,) * (start_caching_at + 2), + "s", ), ( (None, Timestamp("2012-07-26")) + (NaT,) * start_caching_at, (NaT, Timestamp("2012-07-26")) + (NaT,) * start_caching_at, + "s", ), ( (None,) @@ -1449,11 +1462,12 @@ def test_to_datetime_cache_scalar(self): + ("2012 July 26", Timestamp("2012-07-26")), (NaT,) * (start_caching_at + 1) + (Timestamp("2012-07-26"), Timestamp("2012-07-26")), + "s", ), ), ) def test_convert_object_to_datetime_with_cache( - self, datetimelikes, expected_values + self, datetimelikes, expected_values, exp_unit ): # GH#39882 ser = Series( @@ -1463,7 +1477,7 @@ def test_convert_object_to_datetime_with_cache( result_series = to_datetime(ser, errors="coerce") expected_series = Series( expected_values, - dtype="datetime64[ns]", + dtype=f"datetime64[{exp_unit}]", ) tm.assert_series_equal(result_series, expected_series) @@ -1484,7 +1498,7 @@ def test_convert_object_to_datetime_with_cache( ) def test_to_datetime_converts_null_like_to_nat(self, cache, input): # GH35888 - expected = Series([NaT] * len(input), dtype="M8[ns]") + expected = Series([NaT] * len(input), dtype="M8[s]") result = to_datetime(input, cache=cache) tm.assert_series_equal(result, expected) @@ -1535,7 +1549,17 @@ def test_to_datetime_coerce_oob(self, string_arg, format, outofbounds): # https://github.com/pandas-dev/pandas/issues/50255 ts_strings = [string_arg, outofbounds] result = to_datetime(ts_strings, errors="coerce", format=format) - expected = DatetimeIndex([datetime(2018, 3, 1), NaT]) + if isinstance(outofbounds, str) and ( + format.startswith("%B") ^ outofbounds.startswith("J") + ): + # the strings don't match the given format, so they raise and we coerce + expected = DatetimeIndex([datetime(2018, 3, 1), NaT], dtype="M8[s]") + elif isinstance(outofbounds, datetime): + expected = DatetimeIndex( + [datetime(2018, 3, 1), outofbounds], dtype="M8[us]" + ) + else: + expected = DatetimeIndex([datetime(2018, 3, 1), outofbounds], dtype="M8[s]") tm.assert_index_equal(result, expected) def test_to_datetime_malformed_no_raise(self): @@ -1546,7 +1570,9 @@ def test_to_datetime_malformed_no_raise(self): UserWarning, match="Could not infer format", raise_on_extra_warnings=False ): result = to_datetime(ts_strings, errors="coerce") - tm.assert_index_equal(result, Index([NaT, NaT])) + # TODO: should Index get "s" by default here? + exp = Index([NaT, NaT], dtype="M8[s]") + tm.assert_index_equal(result, exp) def test_to_datetime_malformed_raise(self): # GH 48633 @@ -1594,7 +1620,7 @@ def test_iso_8601_strings_with_different_offsets_utc(self): result = to_datetime(ts_strings, utc=True) expected = DatetimeIndex( [Timestamp(2015, 11, 18, 10), Timestamp(2015, 11, 18, 10), NaT], tz="UTC" - ) + ).as_unit("s") tm.assert_index_equal(result, expected) def test_mixed_offsets_with_native_datetime_utc_false_raises(self): @@ -1620,7 +1646,7 @@ def test_non_iso_strings_with_tz_offset(self): result = to_datetime(["March 1, 2018 12:00:00+0400"] * 2) expected = DatetimeIndex( [datetime(2018, 3, 1, 12, tzinfo=timezone(timedelta(minutes=240)))] * 2 - ) + ).as_unit("s") tm.assert_index_equal(result, expected) @pytest.mark.parametrize( @@ -1641,9 +1667,11 @@ def test_timestamp_utc_true(self, ts, expected): @pytest.mark.parametrize("dt_str", ["00010101", "13000101", "30000101", "99990101"]) def test_to_datetime_with_format_out_of_bounds(self, dt_str): # GH 9107 - msg = "Out of bounds nanosecond timestamp" - with pytest.raises(OutOfBoundsDatetime, match=msg): - to_datetime(dt_str, format="%Y%m%d") + res = to_datetime(dt_str, format="%Y%m%d") + dtobj = datetime.strptime(dt_str, "%Y%m%d") + expected = Timestamp(dtobj).as_unit("s") + assert res == expected + assert res.unit == expected.unit def test_to_datetime_utc(self): arr = np.array([parse("2012-06-13T01:39:00Z")], dtype=object) @@ -1726,7 +1754,7 @@ def test_to_datetime_month_or_year_unit_non_round_float(self, cache, unit): # In 3.0, the string "1.5" is parsed as as it would be without unit, # which fails. With errors="coerce" this becomes NaT. res = to_datetime(["1.5"], unit=unit, errors="coerce") - expected = to_datetime([NaT]) + expected = to_datetime([NaT]).as_unit("ns") tm.assert_index_equal(res, expected) # round floats are OK @@ -2149,7 +2177,7 @@ def test_dataframe_utc_true(self): df = DataFrame({"year": [2015, 2016], "month": [2, 3], "day": [4, 5]}) result = to_datetime(df, utc=True) expected = Series( - np.array(["2015-02-04", "2016-03-05"], dtype="datetime64[ns]") + np.array(["2015-02-04", "2016-03-05"], dtype="datetime64[s]") ).dt.tz_localize("UTC") tm.assert_series_equal(result, expected) @@ -2361,7 +2389,9 @@ def test_to_datetime_with_space_in_series(self, cache): with pytest.raises(ValueError, match=msg): to_datetime(ser, errors="raise", cache=cache) result_coerce = to_datetime(ser, errors="coerce", cache=cache) - expected_coerce = Series([datetime(2006, 10, 18), datetime(2008, 10, 18), NaT]) + expected_coerce = Series( + [datetime(2006, 10, 18), datetime(2008, 10, 18), NaT] + ).dt.as_unit("s") tm.assert_series_equal(result_coerce, expected_coerce) @td.skip_if_not_us_locale @@ -2473,7 +2503,7 @@ def test_string_na_nat_conversion(self, cache): strings = np.array(["1/1/2000", "1/2/2000", np.nan, "1/4/2000"], dtype=object) - expected = np.empty(4, dtype="M8[ns]") + expected = np.empty(4, dtype="M8[s]") for i, val in enumerate(strings): if isna(val): expected[i] = iNaT @@ -2518,7 +2548,7 @@ def test_string_na_nat_conversion_with_name(self, cache): result = to_datetime(series, cache=cache) dresult = to_datetime(dseries, cache=cache) - expected = Series(np.empty(5, dtype="M8[ns]"), index=idx) + expected = Series(np.empty(5, dtype="M8[s]"), index=idx) for i in range(5): x = series.iloc[i] if isna(x): @@ -2558,7 +2588,7 @@ def test_dayfirst(self, cache): arr = ["10/02/2014", "11/02/2014", "12/02/2014"] expected = DatetimeIndex( [datetime(2014, 2, 10), datetime(2014, 2, 11), datetime(2014, 2, 12)] - ) + ).as_unit("s") idx1 = DatetimeIndex(arr, dayfirst=True) idx2 = DatetimeIndex(np.array(arr), dayfirst=True) idx3 = to_datetime(arr, dayfirst=True, cache=cache) @@ -2582,7 +2612,7 @@ def test_dayfirst_warnings_valid_input(self): # CASE 1: valid input arr = ["31/12/2014", "10/03/2011"] expected = DatetimeIndex( - ["2014-12-31", "2011-03-10"], dtype="datetime64[ns]", freq=None + ["2014-12-31", "2011-03-10"], dtype="datetime64[s]", freq=None ) # A. dayfirst arg correct, no warning @@ -2687,7 +2717,7 @@ def test_to_datetime_consistent_format(self, cache): ser = Series(np.array(data)) result = to_datetime(ser, cache=cache) expected = Series( - ["2011-01-01", "2011-02-01", "2011-03-01"], dtype="datetime64[ns]" + ["2011-01-01", "2011-02-01", "2011-03-01"], dtype="datetime64[s]" ) tm.assert_series_equal(result, expected) @@ -2699,9 +2729,7 @@ def test_to_datetime_series_with_nans(self, cache): ) ) result = to_datetime(ser, cache=cache) - expected = Series( - ["2011-01-01", NaT, "2011-01-03", NaT], dtype="datetime64[ns]" - ) + expected = Series(["2011-01-01", NaT, "2011-01-03", NaT], dtype="datetime64[s]") tm.assert_series_equal(result, expected) def test_to_datetime_series_start_with_nans(self, cache): @@ -2720,7 +2748,7 @@ def test_to_datetime_series_start_with_nans(self, cache): result = to_datetime(ser, cache=cache) expected = Series( - [NaT, NaT, "2011-01-01", "2011-01-02", "2011-01-03"], dtype="datetime64[ns]" + [NaT, NaT, "2011-01-01", "2011-01-02", "2011-01-03"], dtype="datetime64[s]" ) tm.assert_series_equal(result, expected) @@ -2734,6 +2762,7 @@ def test_infer_datetime_format_tz_name(self, tz_name, offset): result = to_datetime(ser) tz = timezone(timedelta(minutes=offset)) expected = Series([Timestamp("2019-02-02 08:07:13").tz_localize(tz)]) + expected = expected.dt.as_unit("s") tm.assert_series_equal(result, expected) @pytest.mark.parametrize( @@ -2890,9 +2919,16 @@ def test_parsers(self, date_str, expected, cache): # https://github.com/dateutil/dateutil/issues/217 yearfirst = True - result1, _ = parsing.parse_datetime_string_with_reso( + result1, reso_attrname = parsing.parse_datetime_string_with_reso( date_str, yearfirst=yearfirst ) + + reso = { + "nanosecond": "ns", + "microsecond": "us", + "millisecond": "ms", + "second": "s", + }.get(reso_attrname, "s") result2 = to_datetime(date_str, yearfirst=yearfirst) result3 = to_datetime([date_str], yearfirst=yearfirst) # result5 is used below @@ -2907,7 +2943,7 @@ def test_parsers(self, date_str, expected, cache): for res in [result1, result2]: assert res == expected for res in [result3, result4, result6, result8, result9]: - exp = DatetimeIndex([Timestamp(expected)]) + exp = DatetimeIndex([Timestamp(expected)]).as_unit(reso) tm.assert_index_equal(res, exp) # these really need to have yearfirst, but we don't support @@ -2921,7 +2957,7 @@ def test_na_values_with_cache( self, cache, unique_nulls_fixture, unique_nulls_fixture2 ): # GH22305 - expected = Index([NaT, NaT], dtype="datetime64[ns]") + expected = Index([NaT, NaT], dtype="datetime64[s]") result = to_datetime([unique_nulls_fixture, unique_nulls_fixture2], cache=cache) tm.assert_index_equal(result, expected) @@ -3197,9 +3233,16 @@ def test_incorrect_value_exception(self): ) def test_to_datetime_out_of_bounds_with_format_arg(self, format, warning): # see gh-23830 - msg = r"^Out of bounds nanosecond timestamp: 2417-10-10 00:00:00, at position 0" - with pytest.raises(OutOfBoundsDatetime, match=msg): - to_datetime("2417-10-10 00:00:00", format=format) + if format is None: + res = to_datetime("2417-10-10 00:00:00.00", format=format) + assert isinstance(res, Timestamp) + assert res.year == 2417 + assert res.month == 10 + assert res.day == 10 + else: + msg = "unconverted data remains when parsing with format.*, at position 0" + with pytest.raises(ValueError, match=msg): + to_datetime("2417-10-10 00:00:00.00", format=format) @pytest.mark.parametrize( "arg, origin, expected_str", @@ -3331,7 +3374,7 @@ def test_empty_string_datetime(errors, args, format): # coerce empty string to pd.NaT result = to_datetime(td, format=format, errors=errors) - expected = Series(["2016-03-24", "2016-03-25", NaT], dtype="datetime64[ns]") + expected = Series(["2016-03-24", "2016-03-25", NaT], dtype="datetime64[s]") tm.assert_series_equal(expected, result) @@ -3371,14 +3414,12 @@ def test_to_datetime_cache_coerce_50_lines_outofbounds(series_length): ) result1 = to_datetime(ser, errors="coerce", utc=True) - expected1 = Series( - [NaT] + ([Timestamp("1991-10-20 00:00:00+00:00")] * series_length) - ) - + expected1 = Series([Timestamp(x) for x in ser]) + assert expected1.dtype == "M8[us, UTC]" tm.assert_series_equal(result1, expected1) - with pytest.raises(OutOfBoundsDatetime, match="Out of bounds nanosecond timestamp"): - to_datetime(ser, errors="raise", utc=True) + result3 = to_datetime(ser, errors="raise", utc=True) + tm.assert_series_equal(result3, expected1) def test_to_datetime_format_f_parse_nanos(): @@ -3463,7 +3504,7 @@ def test_to_datetime_with_empty_str_utc_false_format_mixed(): # GH 50887 vals = ["2020-01-01 00:00+00:00", ""] result = to_datetime(vals, format="mixed") - expected = Index([Timestamp("2020-01-01 00:00+00:00"), "NaT"], dtype="M8[ns, UTC]") + expected = Index([Timestamp("2020-01-01 00:00+00:00"), "NaT"], dtype="M8[s, UTC]") tm.assert_index_equal(result, expected) # Check that a couple of other similar paths work the same way diff --git a/pandas/tests/tools/test_to_timedelta.py b/pandas/tests/tools/test_to_timedelta.py index ba000a0439dd1..894f49b2fa140 100644 --- a/pandas/tests/tools/test_to_timedelta.py +++ b/pandas/tests/tools/test_to_timedelta.py @@ -29,7 +29,7 @@ def test_to_timedelta_dt64_raises(self): # supported GH#29794 msg = r"dtype datetime64\[ns\] cannot be converted to timedelta64\[ns\]" - ser = Series([pd.NaT]) + ser = Series([pd.NaT], dtype="M8[ns]") with pytest.raises(TypeError, match=msg): to_timedelta(ser) with pytest.raises(TypeError, match=msg): diff --git a/pandas/tests/tseries/holiday/test_calendar.py b/pandas/tests/tseries/holiday/test_calendar.py index 99829857e6836..90e2e117852a2 100644 --- a/pandas/tests/tseries/holiday/test_calendar.py +++ b/pandas/tests/tseries/holiday/test_calendar.py @@ -57,10 +57,10 @@ def __init__(self, name=None, rules=None) -> None: jan2 = TestCalendar(rules=[Holiday("jan2", year=2015, month=1, day=2)]) # Getting holidays for Jan 1 should not alter results for Jan 2. - expected = DatetimeIndex(["01-Jan-2015"]).as_unit("ns") + expected = DatetimeIndex(["01-Jan-2015"]).as_unit("us") tm.assert_index_equal(jan1.holidays(), expected) - expected2 = DatetimeIndex(["02-Jan-2015"]).as_unit("ns") + expected2 = DatetimeIndex(["02-Jan-2015"]).as_unit("us") tm.assert_index_equal(jan2.holidays(), expected2) diff --git a/pandas/tests/tslibs/test_array_to_datetime.py b/pandas/tests/tslibs/test_array_to_datetime.py index 35b72c9bb2887..3c55ae2c6f904 100644 --- a/pandas/tests/tslibs/test_array_to_datetime.py +++ b/pandas/tests/tslibs/test_array_to_datetime.py @@ -15,7 +15,6 @@ tslib, ) from pandas._libs.tslibs.dtypes import NpyDatetimeUnit -from pandas._libs.tslibs.np_datetime import OutOfBoundsDatetime from pandas import Timestamp import pandas._testing as tm @@ -156,7 +155,7 @@ def test_parsing_valid_dates(data, expected): arr = np.array(data, dtype=object) result, _ = tslib.array_to_datetime(arr) - expected = np.array(expected, dtype="M8[ns]") + expected = np.array(expected, dtype="M8[s]") tm.assert_numpy_array_equal(result, expected) @@ -174,6 +173,8 @@ def test_parsing_timezone_offsets(dt_string, expected_tz): # to the same datetime after the timezone offset is added. arr = np.array(["01-01-2013 00:00:00"], dtype=object) expected, _ = tslib.array_to_datetime(arr) + if "000000000" in dt_string: + expected = expected.astype("M8[ns]") arr = np.array([dt_string], dtype=object) result, result_tz = tslib.array_to_datetime(arr) @@ -206,38 +207,35 @@ def test_parsing_different_timezone_offsets(): @pytest.mark.parametrize( - "invalid_date", + "invalid_date,exp_unit", [ - date(1000, 1, 1), - datetime(1000, 1, 1), - "1000-01-01", - "Jan 1, 1000", - np.datetime64("1000-01-01"), + (date(1000, 1, 1), "s"), + (datetime(1000, 1, 1), "us"), + ("1000-01-01", "s"), + ("Jan 1, 1000", "s"), + (np.datetime64("1000-01-01"), "s"), ], ) @pytest.mark.parametrize("errors", ["coerce", "raise"]) -def test_coerce_outside_ns_bounds(invalid_date, errors): +def test_coerce_outside_ns_bounds(invalid_date, exp_unit, errors): arr = np.array([invalid_date], dtype="object") - kwargs = {"values": arr, "errors": errors} - if errors == "raise": - msg = "^Out of bounds nanosecond timestamp: .*, at position 0$" + result, _ = tslib.array_to_datetime(arr, errors=errors) + out_reso = np.datetime_data(result.dtype)[0] + assert out_reso == exp_unit + ts = Timestamp(invalid_date) + assert ts.unit == exp_unit - with pytest.raises(OutOfBoundsDatetime, match=msg): - tslib.array_to_datetime(**kwargs) - else: # coerce. - result, _ = tslib.array_to_datetime(**kwargs) - expected = np.array([iNaT], dtype="M8[ns]") - - tm.assert_numpy_array_equal(result, expected) + expected = np.array([ts._value], dtype=f"M8[{exp_unit}]") + tm.assert_numpy_array_equal(result, expected) def test_coerce_outside_ns_bounds_one_valid(): arr = np.array(["1/1/1000", "1/1/2000"], dtype=object) result, _ = tslib.array_to_datetime(arr, errors="coerce") - expected = [iNaT, "2000-01-01T00:00:00.000000000"] - expected = np.array(expected, dtype="M8[ns]") + expected = ["1000-01-01T00:00:00.000000000", "2000-01-01T00:00:00.000000000"] + expected = np.array(expected, dtype="M8[s]") tm.assert_numpy_array_equal(result, expected) @@ -247,7 +245,13 @@ def test_coerce_of_invalid_datetimes(): # With coercing, the invalid dates becomes iNaT result, _ = tslib.array_to_datetime(arr, errors="coerce") expected = ["2013-01-01T00:00:00.000000000", iNaT, iNaT] - tm.assert_numpy_array_equal(result, np.array(expected, dtype="M8[ns]")) + tm.assert_numpy_array_equal(result, np.array(expected, dtype="M8[s]")) + + # With coercing, the invalid dates becomes iNaT + result, _ = tslib.array_to_datetime(arr, errors="coerce") + expected = ["2013-01-01T00:00:00.000000000", iNaT, iNaT] + + tm.assert_numpy_array_equal(result, np.array(expected, dtype="M8[s]")) def test_to_datetime_barely_out_of_bounds(): @@ -292,5 +296,5 @@ def test_datetime_subclass(klass): arr = np.array([klass(2000, 1, 1)], dtype=object) result, _ = tslib.array_to_datetime(arr) - expected = np.array(["2000-01-01T00:00:00.000000000"], dtype="M8[ns]") + expected = np.array(["2000-01-01T00:00:00.000000"], dtype="M8[us]") tm.assert_numpy_array_equal(result, expected) diff --git a/pandas/tests/util/test_hashing.py b/pandas/tests/util/test_hashing.py index a54e0071aa006..e654534ccd453 100644 --- a/pandas/tests/util/test_hashing.py +++ b/pandas/tests/util/test_hashing.py @@ -260,14 +260,14 @@ def test_categorical_consistency(s1, categorize): tm.assert_series_equal(h1, h3) -def test_categorical_with_nan_consistency(): - c = pd.Categorical.from_codes( - [-1, 0, 1, 2, 3, 4], categories=pd.date_range("2012-01-01", periods=5, name="B") - ) - expected = hash_array(c, categorize=False) - - c = pd.Categorical.from_codes([-1, 0], categories=[pd.Timestamp("2012-01-01")]) - result = hash_array(c, categorize=False) +def test_categorical_with_nan_consistency(unit): + dti = pd.date_range("2012-01-01", periods=5, name="B", unit=unit) + cat = pd.Categorical.from_codes([-1, 0, 1, 2, 3, 4], categories=dti) + expected = hash_array(cat, categorize=False) + + ts = pd.Timestamp("2012-01-01").as_unit(unit) + cat2 = pd.Categorical.from_codes([-1, 0], categories=[ts]) + result = hash_array(cat2, categorize=False) assert result[0] in expected assert result[1] in expected