diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index d2aa47f65d263..11b4677562f25 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -2639,3 +2639,147 @@ def fast_multiget(dict mapping, ndarray keys, default=np.nan): output[i] = default return maybe_convert_objects(output) + + +cdef class NDArrayBacked: + """ + Implementing these methods in cython improves performance quite a bit. + + import pandas as pd + + from pandas._libs.lib import NDArrayBacked as cls + + dti = pd.date_range("2016-01-01", periods=3) + dta = dti._data + arr = dta._ndarray + + obj = cls._simpler_new(arr, arr.dtype) + + # for foo in [arr, dta, obj]: ... + + %timeit foo.copy() + 299 ns ± 30 ns per loop # <-- arr underlying ndarray (for reference) + 530 ns ± 9.24 ns per loop # <-- dta with cython NDArrayBacked + 1.66 µs ± 46.3 ns per loop # <-- dta without cython NDArrayBacked + 328 ns ± 5.29 ns per loop # <-- obj with NDArrayBacked.__cinit__ + 371 ns ± 6.97 ns per loop # <-- obj with NDArrayBacked._simpler_new + + %timeit foo.T + 125 ns ± 6.27 ns per loop # <-- arr underlying ndarray (for reference) + 226 ns ± 7.66 ns per loop # <-- dta with cython NDArrayBacked + 911 ns ± 16.6 ns per loop # <-- dta without cython NDArrayBacked + 215 ns ± 4.54 ns per loop # <-- obj with NDArrayBacked._simpler_new + + """ + # TODO: implement take in terms of cnp.PyArray_TakeFrom + # TODO: implement concat_same_type in terms of cnp.PyArray_Concatenate + + cdef: + readonly ndarray _ndarray + readonly object _dtype + + @classmethod + def _simpler_new(cls, ndarray values, object dtype): + # Note: not _simple_new; for unclear reasons, calling this _simple_new + # and trying to call it from the subclass method using super()... 
fails + cdef: + NDArrayBacked obj + obj = NDArrayBacked.__new__(cls) + obj._ndarray = values + obj._dtype = dtype + return obj + + cpdef NDArrayBacked _from_backing_data(self, ndarray values): + # TODO: re-reuse simpler_new if/when it can be cpdef + cdef: + NDArrayBacked obj + obj = NDArrayBacked.__new__(type(self)) + obj._ndarray = values + obj._dtype = self._dtype + return obj + + cpdef __setstate__(self, state): + if isinstance(state, dict): + if "_data" in state: + data = state.pop("_data") + elif "_ndarray" in state: + data = state.pop("_ndarray") + else: + raise ValueError + self._ndarray = data + self._dtype = state.pop("_dtype") + + for key, val in state.items(): + setattr(self, key, val) + elif isinstance(state, tuple): + if len(state) != 3: + raise NotImplementedError(state) + + data, dtype = state[:2] + if isinstance(dtype, np.ndarray): + dtype, data = data, dtype + self._ndarray = data + self._dtype = dtype + + if isinstance(state[2], dict): + for key, val in state[2].items(): + setattr(self, key, val) + else: + raise NotImplementedError(state) + else: + raise NotImplementedError(state) + + def __len__(self): + return len(self._ndarray) + + @property + def shape(self): + # object cast bc _ndarray.shape is npy_intp* + return (<object>(self._ndarray)).shape + + @property + def ndim(self) -> int: + return self._ndarray.ndim + + @property + def size(self): + return self._ndarray.size + + @property + def nbytes(self): + return self._ndarray.nbytes + + def copy(self): + # NPY_ANYORDER -> same order as self._ndarray + res_values = cnp.PyArray_NewCopy(self._ndarray, cnp.NPY_ANYORDER) + return self._from_backing_data(res_values) + + def delete(self, loc, axis=0): + res_values = np.delete(self._ndarray, loc, axis=axis) + return self._from_backing_data(res_values) + + def swapaxes(self, axis1, axis2): + res_values = cnp.PyArray_SwapAxes(self._ndarray, axis1, axis2) + return self._from_backing_data(res_values) + + # TODO: pass NPY_MAXDIMS equiv to axis=None? 
+ def repeat(self, repeats, axis: int = 0): + if axis is None: + axis = 0 + res_values = cnp.PyArray_Repeat(self._ndarray, repeats, axis) + return self._from_backing_data(res_values) + + def reshape(self, *args, **kwargs): + res_values = self._ndarray.reshape(*args, **kwargs) + return self._from_backing_data(res_values) + + def ravel(self, order="C"): + # cnp.PyArray_OrderConverter(PyObject* obj, NPY_ORDER* order) + # res_values = cnp.PyArray_Ravel(self._ndarray, order) + res_values = self._ndarray.ravel(order) + return self._from_backing_data(res_values) + + @property + def T(self): + res_values = self._ndarray.T + return self._from_backing_data(res_values) diff --git a/pandas/compat/pickle_compat.py b/pandas/compat/pickle_compat.py index 9d48035213126..852e8de816a4f 100644 --- a/pandas/compat/pickle_compat.py +++ b/pandas/compat/pickle_compat.py @@ -13,9 +13,15 @@ ) import warnings +import numpy as np + from pandas._libs.tslibs import BaseOffset from pandas import Index +from pandas.core.arrays import ( + DatetimeArray, + TimedeltaArray, +) if TYPE_CHECKING: from pandas import ( @@ -207,6 +213,13 @@ def load_newobj(self): # compat if issubclass(cls, Index): obj = object.__new__(cls) + elif issubclass(cls, DatetimeArray) and not args: + arr = np.array([], dtype="M8[ns]") + obj = cls.__new__(cls, arr, arr.dtype) + elif issubclass(cls, TimedeltaArray) and not args: + arr = np.array([], dtype="m8[ns]") + obj = cls.__new__(cls, arr, arr.dtype) + else: obj = cls.__new__(cls, *args) diff --git a/pandas/core/arrays/datetimelike.py b/pandas/core/arrays/datetimelike.py index e476c3566c10f..bf20769a5a902 100644 --- a/pandas/core/arrays/datetimelike.py +++ b/pandas/core/arrays/datetimelike.py @@ -158,9 +158,6 @@ class DatetimeLikeArrayMixin(OpsMixin, NDArrayBackedExtensionArray): _recognized_scalars: Tuple[Type, ...] 
_ndarray: np.ndarray - def __init__(self, data, dtype: Optional[Dtype] = None, freq=None, copy=False): - raise AbstractMethodError(self) - @classmethod def _simple_new( cls: Type[DatetimeLikeArrayT], @@ -170,6 +167,11 @@ def _simple_new( ) -> DatetimeLikeArrayT: raise AbstractMethodError(cls) + def __init__(cls, values, dtype=np.dtype("M8[ns]"), freq=None, copy=False): + # This is just for mypy + # TODO: make default dtype subclass-specific + pass + @property def _scalar_type(self) -> Type[DatetimeLikeScalar]: """ @@ -254,6 +256,8 @@ def _check_compatible_with( # NDArrayBackedExtensionArray compat def __setstate__(self, state): + # TODO: how is NDArrayBacked.__setstate__ getting called? we + # aren't doing super().__setstate__(state) here if isinstance(state, dict): if "_data" in state and "_ndarray" not in state: # backward compat, changed what is property vs attribute @@ -272,12 +276,6 @@ def __setstate__(self, state): def _data(self) -> np.ndarray: return self._ndarray - def _from_backing_data( - self: DatetimeLikeArrayT, arr: np.ndarray - ) -> DatetimeLikeArrayT: - # Note: we do not retain `freq` - return type(self)._simple_new(arr, dtype=self.dtype) - # ------------------------------------------------------------------ def _box_func(self, x): @@ -1680,11 +1678,16 @@ def strftime(self, date_format): """ -class TimelikeOps(DatetimeLikeArrayMixin): +class TimelikeOps(lib.NDArrayBacked, DatetimeLikeArrayMixin): """ Common ops for TimedeltaIndex/DatetimeIndex, but not PeriodIndex. """ + def copy(self): + result = lib.NDArrayBacked.copy(self) + result._freq = self._freq + return result + def _round(self, freq, mode, ambiguous, nonexistent): # round the local times if is_datetime64tz_dtype(self.dtype): @@ -1769,11 +1772,7 @@ def factorize(self, na_sentinel=-1, sort: bool = False): uniques = self.copy() # TODO: copy or view? 
if sort and self.freq.n < 0: codes = codes[::-1] - # TODO: overload __getitem__, a slice indexer returns same type as self - # error: Incompatible types in assignment (expression has type - # "Union[DatetimeLikeArrayMixin, Union[Any, Any]]", variable - # has type "TimelikeOps") - uniques = uniques[::-1] # type: ignore[assignment] + uniques = uniques[::-1] return codes, uniques # FIXME: shouldn't get here; we are ignoring sort return super().factorize(na_sentinel=na_sentinel) diff --git a/pandas/core/arrays/datetimes.py b/pandas/core/arrays/datetimes.py index 28e469547fe62..ddc39e76113b3 100644 --- a/pandas/core/arrays/datetimes.py +++ b/pandas/core/arrays/datetimes.py @@ -238,13 +238,13 @@ class DatetimeArray(dtl.TimelikeOps, dtl.DatelikeOps): _dtype: Union[np.dtype, DatetimeTZDtype] _freq = None - def __init__(self, values, dtype=DT64NS_DTYPE, freq=None, copy=False): + def __new__(cls, values, dtype=DT64NS_DTYPE, freq=None, copy=False): if isinstance(values, (ABCSeries, ABCIndex)): values = values._values inferred_freq = getattr(values, "_freq", None) - if isinstance(values, type(self)): + if isinstance(values, cls): # validation dtz = getattr(dtype, "tz", None) if dtz and values.tz is None: @@ -303,12 +303,11 @@ def __init__(self, values, dtype=DT64NS_DTYPE, freq=None, copy=False): # be incorrect(ish?) 
for the array as a whole dtype = DatetimeTZDtype(tz=timezones.tz_standardize(dtype.tz)) - self._ndarray = values - self._dtype = dtype - self._freq = freq + obj = cls._simple_new(values, freq=freq, dtype=dtype) if inferred_freq is None and freq is not None: - type(self)._validate_frequency(self, freq) + cls._validate_frequency(obj, freq) + return obj @classmethod def _simple_new( @@ -319,10 +318,8 @@ def _simple_new( assert values.dtype == "i8" values = values.view(DT64NS_DTYPE) - result = object.__new__(cls) - result._ndarray = values + result = cls._simpler_new(values, dtype) result._freq = freq - result._dtype = dtype return result @classmethod @@ -2005,7 +2002,9 @@ def sequence_to_dt64ns( if is_datetime64tz_dtype(data_dtype): # DatetimeArray -> ndarray tz = _maybe_infer_tz(tz, data.tz) - result = data._data + if isinstance(data, ABCIndex): + data = data._data + result = data._ndarray elif is_datetime64_dtype(data_dtype): # tz-naive DatetimeArray or ndarray[datetime64] diff --git a/pandas/core/arrays/period.py b/pandas/core/arrays/period.py index 96a159c0804c9..d14da71a26b7b 100644 --- a/pandas/core/arrays/period.py +++ b/pandas/core/arrays/period.py @@ -223,6 +223,9 @@ def _simple_new( assert isinstance(values, np.ndarray) and values.dtype == "i8", assertion_msg return cls(values, freq=freq, dtype=dtype) + def _from_backing_data(self: PeriodArray, arr: np.ndarray) -> PeriodArray: + return type(self)._simple_new(arr, dtype=self.dtype) + @classmethod def _from_sequence( cls: Type[PeriodArray], diff --git a/pandas/core/arrays/timedeltas.py b/pandas/core/arrays/timedeltas.py index f7af1bb3da86b..b8e1cbb1d9b72 100644 --- a/pandas/core/arrays/timedeltas.py +++ b/pandas/core/arrays/timedeltas.py @@ -168,14 +168,14 @@ def dtype(self) -> np.dtype: _freq = None - def __init__(self, values, dtype=TD64NS_DTYPE, freq=lib.no_default, copy=False): + def __new__(cls, values, dtype=TD64NS_DTYPE, freq=lib.no_default, copy=False): values = extract_array(values) inferred_freq = 
getattr(values, "_freq", None) explicit_none = freq is None freq = freq if freq is not lib.no_default else None - if isinstance(values, type(self)): + if isinstance(values, cls): if explicit_none: # dont inherit from values pass @@ -216,12 +216,11 @@ def __init__(self, values, dtype=TD64NS_DTYPE, freq=lib.no_default, copy=False): if freq: freq = to_offset(freq) - self._ndarray = values - self._dtype = dtype - self._freq = freq + obj = cls._simple_new(values, freq=freq, dtype=dtype) if inferred_freq is None and freq is not None: - type(self)._validate_frequency(self, freq) + cls._validate_frequency(obj, freq) + return obj @classmethod def _simple_new( @@ -233,10 +232,8 @@ def _simple_new( assert values.dtype == "i8" values = values.view(TD64NS_DTYPE) - result = object.__new__(cls) - result._ndarray = values + result = cls._simpler_new(values, TD64NS_DTYPE) result._freq = to_offset(freq) - result._dtype = TD64NS_DTYPE return result @classmethod diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py index b30dbe32eec4b..27c338a745b50 100644 --- a/pandas/core/dtypes/cast.py +++ b/pandas/core/dtypes/cast.py @@ -1024,7 +1024,10 @@ def astype_dt64_to_dt64tz( # FIXME: GH#33401 this doesn't match DatetimeArray.astype, which # goes through the `not via_utc` path - return values.tz_localize("UTC").tz_convert(dtype.tz) + # error: "ExtensionArray" has no attribute "tz_localize" + values = values.tz_localize("UTC") # type:ignore[attr-defined] + # error: "ExtensionArray" has no attribute "tz_convert" + return values.tz_convert(dtype.tz) # type:ignore[attr-defined] else: # DatetimeArray/DatetimeIndex.astype behavior diff --git a/pandas/core/indexes/datetimelike.py b/pandas/core/indexes/datetimelike.py index 6d5992540ef49..be85839bf93c4 100644 --- a/pandas/core/indexes/datetimelike.py +++ b/pandas/core/indexes/datetimelike.py @@ -651,6 +651,7 @@ def _get_join_freq(self, other): def _wrap_joined_index(self, joined: np.ndarray, other): assert other.dtype == self.dtype, 
(other.dtype, self.dtype) + joined = joined.view(self._data._ndarray.dtype) result = super()._wrap_joined_index(joined, other) result._data._freq = self._get_join_freq(other) return result diff --git a/pandas/core/indexes/datetimes.py b/pandas/core/indexes/datetimes.py index 9ea43d083f5b3..cb700e5ac05e7 100644 --- a/pandas/core/indexes/datetimes.py +++ b/pandas/core/indexes/datetimes.py @@ -93,7 +93,8 @@ def _new_DatetimeIndex(cls, d): # These are already stored in our DatetimeArray; if they are # also in the pickle and don't match, we have a problem. if key in d: - assert d.pop(key) == getattr(dta, key) + val = d.pop(key) + assert val == getattr(dta, key), (key, val, getattr(dta, key)) result = cls._simple_new(dta, **d) else: with warnings.catch_warnings():