Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

BUG/API: preserve non-nano in factorize/unique #51978

Merged
merged 3 commits
Mar 15, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions doc/source/whatsnew/v2.0.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -767,6 +767,7 @@ Other API changes
- Division by zero with :class:`ArrowDtype` dtypes returns ``-inf``, ``nan``, or ``inf`` depending on the numerator, instead of raising (:issue:`51541`)
- Added :func:`pandas.api.types.is_any_real_numeric_dtype` to check for real numeric dtypes (:issue:`51152`)
- :meth:`~arrays.ArrowExtensionArray.value_counts` now returns data with :class:`ArrowDtype` with ``pyarrow.int64`` type instead of ``"Int64"`` type (:issue:`51462`)
- :func:`factorize` and :func:`unique` preserve the original dtype when passed numpy timedelta64 or datetime64 with non-nanosecond resolution (:issue:`48670`)

.. note::

Expand Down
13 changes: 2 additions & 11 deletions pandas/core/algorithms.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,6 @@
from pandas.core.dtypes.cast import (
construct_1d_object_array_from_listlike,
infer_dtype_from_array,
sanitize_to_nanoseconds,
)
from pandas.core.dtypes.common import (
ensure_float64,
Expand All @@ -45,7 +44,6 @@
is_bool_dtype,
is_categorical_dtype,
is_complex_dtype,
is_datetime64_dtype,
is_dict_like,
is_extension_array_dtype,
is_float_dtype,
Expand All @@ -56,7 +54,6 @@
is_object_dtype,
is_scalar,
is_signed_integer_dtype,
is_timedelta64_dtype,
needs_i8_conversion,
)
from pandas.core.dtypes.concat import concat_compat
Expand Down Expand Up @@ -175,8 +172,6 @@ def _ensure_data(values: ArrayLike) -> np.ndarray:

# datetimelike
elif needs_i8_conversion(values.dtype):
if isinstance(values, np.ndarray):
values = sanitize_to_nanoseconds(values)
npvalues = values.view("i8")
npvalues = cast(np.ndarray, npvalues)
return npvalues
Expand Down Expand Up @@ -214,11 +209,6 @@ def _reconstruct_data(
values = cls._from_sequence(values, dtype=dtype)

else:
if is_datetime64_dtype(dtype):
dtype = np.dtype("datetime64[ns]")
elif is_timedelta64_dtype(dtype):
dtype = np.dtype("timedelta64[ns]")

values = values.astype(dtype, copy=False)

return values
Expand Down Expand Up @@ -769,7 +759,8 @@ def factorize(
codes, uniques = values.factorize(sort=sort)
return codes, uniques

elif not isinstance(values.dtype, np.dtype):
elif not isinstance(values, np.ndarray):
# i.e. ExtensionArray
codes, uniques = values.factorize(use_na_sentinel=use_na_sentinel)

else:
Expand Down
20 changes: 0 additions & 20 deletions pandas/core/dtypes/cast.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,6 @@
OutOfBoundsTimedelta,
Timedelta,
Timestamp,
astype_overflowsafe,
get_unit_from_dtype,
is_supported_unit,
)
Expand All @@ -42,8 +41,6 @@
)

from pandas.core.dtypes.common import (
DT64NS_DTYPE,
TD64NS_DTYPE,
ensure_int8,
ensure_int16,
ensure_int32,
Expand Down Expand Up @@ -1232,23 +1229,6 @@ def maybe_cast_to_datetime(
return dta


def sanitize_to_nanoseconds(values: np.ndarray, copy: bool = False) -> np.ndarray:
"""
Safely convert non-nanosecond datetime64 or timedelta64 values to nanosecond.
"""
dtype = values.dtype
if dtype.kind == "M" and dtype != DT64NS_DTYPE:
values = astype_overflowsafe(values, dtype=DT64NS_DTYPE)

elif dtype.kind == "m" and dtype != TD64NS_DTYPE:
values = astype_overflowsafe(values, dtype=TD64NS_DTYPE)

elif copy:
values = values.copy()

return values


def _ensure_nanosecond_dtype(dtype: DtypeObj) -> None:
"""
Convert dtypes with granularity less than nanosecond to nanosecond
Expand Down
2 changes: 1 addition & 1 deletion pandas/core/tools/datetimes.py
Original file line number Diff line number Diff line change
Expand Up @@ -307,7 +307,7 @@ def _convert_and_box_cache(
"""
from pandas import Series

result = Series(arg).map(cache_array)
result = Series(arg, dtype=cache_array.index.dtype).map(cache_array)
return _box_as_indexlike(result._values, utc=False, name=name)


Expand Down
18 changes: 18 additions & 0 deletions pandas/tests/indexes/datetimes/methods/test_factorize.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import numpy as np
import pytest

from pandas import (
DatetimeIndex,
Expand Down Expand Up @@ -105,3 +106,20 @@ def test_factorize_dst(self, index_or_series):
tm.assert_index_equal(res, idx)
if index_or_series is Index:
assert res.freq == idx.freq

@pytest.mark.parametrize("sort", [True, False])
def test_factorize_no_freq_non_nano(self, tz_naive_fixture, sort):
# GH#51978 case that does not go through the fastpath based on
# non-None freq
tz = tz_naive_fixture
idx = date_range("2016-11-06", freq="H", periods=5, tz=tz)[[0, 4, 1, 3, 2]]
exp_codes, exp_uniques = idx.factorize(sort=sort)

res_codes, res_uniques = idx.as_unit("s").factorize(sort=sort)

tm.assert_numpy_array_equal(res_codes, exp_codes)
tm.assert_index_equal(res_uniques, exp_uniques.as_unit("s"))

res_codes, res_uniques = idx.as_unit("s").to_series().factorize(sort=sort)
tm.assert_numpy_array_equal(res_codes, exp_codes)
tm.assert_index_equal(res_uniques, exp_uniques.as_unit("s"))
4 changes: 2 additions & 2 deletions pandas/tests/io/parser/test_parse_dates.py
Original file line number Diff line number Diff line change
Expand Up @@ -1657,8 +1657,8 @@ def date_parser(dt, time):
datetimes = np.array(["2013-11-03T19:00:00"] * 3, dtype="datetime64[s]")
expected = DataFrame(
data={"rxstatus": ["00E80000"] * 3},
index=MultiIndex.from_tuples(
[(datetimes[0], 126), (datetimes[1], 23), (datetimes[2], 13)],
index=MultiIndex.from_arrays(
[datetimes, [126, 23, 13]],
MarcoGorelli marked this conversation as resolved.
Show resolved Hide resolved
names=["datetime", "prn"],
),
)
Expand Down
6 changes: 3 additions & 3 deletions pandas/tests/test_algos.py
Original file line number Diff line number Diff line change
Expand Up @@ -327,7 +327,7 @@ def test_object_factorize(self, writable):

def test_datetime64_factorize(self, writable):
# GH35650 Verify whether read-only datetime64 array can be factorized
data = np.array([np.datetime64("2020-01-01T00:00:00.000")])
data = np.array([np.datetime64("2020-01-01T00:00:00.000")], dtype="M8[ns]")
data.setflags(write=writable)
expected_codes = np.array([0], dtype=np.intp)
expected_uniques = np.array(
Expand Down Expand Up @@ -620,13 +620,13 @@ def test_datetime64_dtype_array_returned(self):
def test_datetime_non_ns(self):
a = np.array(["2000", "2000", "2001"], dtype="datetime64[s]")
result = pd.unique(a)
expected = np.array(["2000", "2001"], dtype="datetime64[ns]")
expected = np.array(["2000", "2001"], dtype="datetime64[s]")
tm.assert_numpy_array_equal(result, expected)

def test_timedelta_non_ns(self):
a = np.array(["2000", "2000", "2001"], dtype="timedelta64[s]")
result = pd.unique(a)
expected = np.array([2000000000000, 2001000000000], dtype="timedelta64[ns]")
expected = np.array([2000, 2001], dtype="timedelta64[s]")
tm.assert_numpy_array_equal(result, expected)

def test_timedelta64_dtype_array_returned(self):
Expand Down
34 changes: 21 additions & 13 deletions pandas/tests/tools/test_to_datetime.py
Original file line number Diff line number Diff line change
Expand Up @@ -1076,31 +1076,39 @@ def test_to_datetime_array_of_dt64s(self, cache, unit):
# Assuming all datetimes are in bounds, to_datetime() returns
# an array that is equal to Timestamp() parsing
result = to_datetime(dts, cache=cache)
expected = DatetimeIndex([Timestamp(x).asm8 for x in dts], dtype="M8[ns]")
if cache:
# FIXME: behavior should not depend on cache
expected = DatetimeIndex([Timestamp(x).asm8 for x in dts], dtype="M8[s]")
else:
expected = DatetimeIndex([Timestamp(x).asm8 for x in dts], dtype="M8[ns]")
mroeschke marked this conversation as resolved.
Show resolved Hide resolved

tm.assert_index_equal(result, expected)

# A list of datetimes where the last one is out of bounds
dts_with_oob = dts + [np.datetime64("9999-01-01")]

msg = "Out of bounds nanosecond timestamp: 9999-01-01 00:00:00"
with pytest.raises(OutOfBoundsDatetime, match=msg):
to_datetime(dts_with_oob, errors="raise")
# As of GH#51978 we do not raise in this case
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

have added this to my next CLN branch

to_datetime(dts_with_oob, errors="raise")

tm.assert_index_equal(
to_datetime(dts_with_oob, errors="coerce", cache=cache),
DatetimeIndex(
result = to_datetime(dts_with_oob, errors="coerce", cache=cache)
if not cache:
# FIXME: shouldn't depend on cache!
expected = DatetimeIndex(
[Timestamp(dts_with_oob[0]).asm8, Timestamp(dts_with_oob[1]).asm8] * 30
+ [NaT],
),
)
)
else:
expected = DatetimeIndex(np.array(dts_with_oob, dtype="M8[s]"))
tm.assert_index_equal(result, expected)

# With errors='ignore', out of bounds datetime64s
# are converted to their .item(), which depending on the version of
# numpy is either a python datetime.datetime or datetime.date
tm.assert_index_equal(
to_datetime(dts_with_oob, errors="ignore", cache=cache),
Index(dts_with_oob),
)
result = to_datetime(dts_with_oob, errors="ignore", cache=cache)
if not cache:
# FIXME: shouldn't depend on cache!
expected = Index(dts_with_oob)
tm.assert_index_equal(result, expected)

def test_out_of_bounds_errors_ignore(self):
# https://github.com/pandas-dev/pandas/issues/50587
Expand Down