Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

PERF: NDArrayBacked in cython #40054

Closed
144 changes: 144 additions & 0 deletions pandas/_libs/lib.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -2639,3 +2639,147 @@ def fast_multiget(dict mapping, ndarray keys, default=np.nan):
output[i] = default

return maybe_convert_objects(output)


cdef class NDArrayBacked:
    """
    Implementing these methods in cython improves performance quite a bit.

    import pandas as pd

    from pandas._libs.lib import NDArrayBacked as cls

    dti = pd.date_range("2016-01-01", periods=3)
    dta = dti._data
    arr = dta._ndarray

    obj = cls._simpler_new(arr, arr.dtype)

    # for foo in [arr, dta, obj]: ...

    %timeit foo.copy()
    299 ns ± 30 ns per loop # <-- arr underlying ndarray (for reference)
    530 ns ± 9.24 ns per loop # <-- dta with cython NDArrayBacked
    1.66 µs ± 46.3 ns per loop # <-- dta without cython NDArrayBacked
    328 ns ± 5.29 ns per loop # <-- obj with NDArrayBacked.__cinit__
    371 ns ± 6.97 ns per loop # <-- obj with NDArrayBacked._simpler_new

    %timeit foo.T
    125 ns ± 6.27 ns per loop # <-- arr underlying ndarray (for reference)
    226 ns ± 7.66 ns per loop # <-- dta with cython NDArrayBacked
    911 ns ± 16.6 ns per loop # <-- dta without cython NDArrayBacked
    215 ns ± 4.54 ns per loop # <-- obj with NDArrayBacked._simpler_new

    """
    # TODO: implement take in terms of cnp.PyArray_TakeFrom
    # TODO: implement concat_same_type in terms of cnp.PyArray_Concatenate

    cdef:
        # Backing numpy array and the associated (possibly non-numpy, e.g.
        # DatetimeTZDtype) dtype object.  `readonly` makes them visible to
        # Python-level code but not assignable from Python.
        readonly ndarray _ndarray
        readonly object _dtype

    @classmethod
    def _simpler_new(cls, ndarray values, object dtype):
        # Fast constructor: allocate an instance of `cls` without running
        # __init__ and attach the backing array/dtype directly.
        # Note: not _simple_new; for unclear reasons, calling this _simple_new
        # and trying to call it from the subclass method using super()... fails
        cdef:
            NDArrayBacked obj
        obj = NDArrayBacked.__new__(cls)
        obj._ndarray = values
        obj._dtype = dtype
        return obj

    cpdef NDArrayBacked _from_backing_data(self, ndarray values):
        # Build a new instance of our own (sub)class around `values`,
        # reusing self's dtype.  All the array-manipulation methods below
        # funnel their results through here.
        # TODO: reuse _simpler_new if/when it can be cpdef
        cdef:
            NDArrayBacked obj
        obj = NDArrayBacked.__new__(type(self))
        obj._ndarray = values
        obj._dtype = self._dtype
        return obj

    cpdef __setstate__(self, state):
        # Unpickle support.  `state` is either:
        #  - a dict of attributes, where "_data" is the legacy key for what
        #    is now stored under "_ndarray", or
        #  - a 3-tuple (data, dtype, attr_dict); some pickles have data and
        #    dtype in the opposite order, detected by checking which entry
        #    is the ndarray.
        if isinstance(state, dict):
            if "_data" in state:
                data = state.pop("_data")
            elif "_ndarray" in state:
                data = state.pop("_ndarray")
            else:
                raise ValueError
            self._ndarray = data
            self._dtype = state.pop("_dtype")

            # Restore any remaining subclass-specific attributes (e.g. _freq)
            for key, val in state.items():
                setattr(self, key, val)
        elif isinstance(state, tuple):
            if len(state) != 3:
                raise NotImplementedError(state)

            data, dtype = state[:2]
            if isinstance(dtype, np.ndarray):
                # first two entries were stored as (dtype, data); swap back
                dtype, data = data, dtype
            self._ndarray = data
            self._dtype = dtype

            if isinstance(state[2], dict):
                # third entry holds the remaining instance attributes
                for key, val in state[2].items():
                    setattr(self, key, val)
            else:
                raise NotImplementedError(state)
        else:
            raise NotImplementedError(state)

    def __len__(self):
        return len(self._ndarray)

    @property
    def shape(self):
        # object cast bc _ndarray.shape is npy_intp*
        return (<object>(self._ndarray)).shape

    @property
    def ndim(self) -> int:
        return self._ndarray.ndim

    @property
    def size(self):
        return self._ndarray.size

    @property
    def nbytes(self):
        return self._ndarray.nbytes

    def copy(self):
        # NPY_ANYORDER -> same order as self._ndarray
        res_values = cnp.PyArray_NewCopy(self._ndarray, cnp.NPY_ANYORDER)
        return self._from_backing_data(res_values)

    def delete(self, loc, axis=0):
        # Thin wrapper: np.delete on the backing array, re-wrapped.
        res_values = np.delete(self._ndarray, loc, axis=axis)
        return self._from_backing_data(res_values)

    def swapaxes(self, axis1, axis2):
        res_values = cnp.PyArray_SwapAxes(self._ndarray, axis1, axis2)
        return self._from_backing_data(res_values)

    # TODO: pass NPY_MAXDIMS equiv to axis=None?
    def repeat(self, repeats, axis: int = 0):
        # PyArray_Repeat requires an int axis, so map axis=None -> 0
        if axis is None:
            axis = 0
        res_values = cnp.PyArray_Repeat(self._ndarray, repeats, <int>axis)
        return self._from_backing_data(res_values)

    def reshape(self, *args, **kwargs):
        res_values = self._ndarray.reshape(*args, **kwargs)
        return self._from_backing_data(res_values)

    def ravel(self, order="C"):
        # cnp.PyArray_OrderConverter(PyObject* obj, NPY_ORDER* order)
        # res_values = cnp.PyArray_Ravel(self._ndarray, order)
        res_values = self._ndarray.ravel(order)
        return self._from_backing_data(res_values)

    @property
    def T(self):
        res_values = self._ndarray.T
        return self._from_backing_data(res_values)
13 changes: 13 additions & 0 deletions pandas/compat/pickle_compat.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,9 +13,15 @@
)
import warnings

import numpy as np

from pandas._libs.tslibs import BaseOffset

from pandas import Index
from pandas.core.arrays import (
DatetimeArray,
TimedeltaArray,
)

if TYPE_CHECKING:
from pandas import (
Expand Down Expand Up @@ -207,6 +213,13 @@ def load_newobj(self):
# compat
if issubclass(cls, Index):
obj = object.__new__(cls)
elif issubclass(cls, DatetimeArray) and not args:
arr = np.array([], dtype="M8[ns]")
obj = cls.__new__(cls, arr, arr.dtype)
elif issubclass(cls, TimedeltaArray) and not args:
arr = np.array([], dtype="m8[ns]")
obj = cls.__new__(cls, arr, arr.dtype)

else:
obj = cls.__new__(cls, *args)

Expand Down
29 changes: 14 additions & 15 deletions pandas/core/arrays/datetimelike.py
Original file line number Diff line number Diff line change
Expand Up @@ -158,9 +158,6 @@ class DatetimeLikeArrayMixin(OpsMixin, NDArrayBackedExtensionArray):
_recognized_scalars: Tuple[Type, ...]
_ndarray: np.ndarray

def __init__(self, data, dtype: Optional[Dtype] = None, freq=None, copy=False):
raise AbstractMethodError(self)

@classmethod
def _simple_new(
cls: Type[DatetimeLikeArrayT],
Expand All @@ -170,6 +167,11 @@ def _simple_new(
) -> DatetimeLikeArrayT:
raise AbstractMethodError(cls)

def __init__(cls, values, dtype=np.dtype("M8[ns]"), freq=None, copy=False):
    # No-op constructor: exists only so mypy sees a __init__ signature;
    # actual construction happens in subclass __new__/_simple_new.
    # NOTE(review): first parameter is named `cls` although this is an
    # instance method — presumably should be `self`; confirm intent.
    # This is just for mypy
    # TODO: make default dtype subclass-specific
    pass

@property
def _scalar_type(self) -> Type[DatetimeLikeScalar]:
"""
Expand Down Expand Up @@ -254,6 +256,8 @@ def _check_compatible_with(
# NDArrayBackedExtensionArray compat

def __setstate__(self, state):
# TODO: how is NDArrayBacked.__setstate__ getting called? we
# aren't doing super().__setstate__(state) here
if isinstance(state, dict):
if "_data" in state and "_ndarray" not in state:
# backward compat, changed what is property vs attribute
Expand All @@ -272,12 +276,6 @@ def __setstate__(self, state):
def _data(self) -> np.ndarray:
return self._ndarray

def _from_backing_data(
self: DatetimeLikeArrayT, arr: np.ndarray
) -> DatetimeLikeArrayT:
# Note: we do not retain `freq`
return type(self)._simple_new(arr, dtype=self.dtype)

# ------------------------------------------------------------------

def _box_func(self, x):
Expand Down Expand Up @@ -1680,11 +1678,16 @@ def strftime(self, date_format):
"""


class TimelikeOps(DatetimeLikeArrayMixin):
class TimelikeOps(lib.NDArrayBacked, DatetimeLikeArrayMixin):
"""
Common ops for TimedeltaIndex/DatetimeIndex, but not PeriodIndex.
"""

def copy(self):
    # Delegate to the fast cython NDArrayBacked.copy, then re-attach
    # `_freq`, which the base-class copy does not carry over.
    result = lib.NDArrayBacked.copy(self)
    result._freq = self._freq
    return result

def _round(self, freq, mode, ambiguous, nonexistent):
# round the local times
if is_datetime64tz_dtype(self.dtype):
Expand Down Expand Up @@ -1769,11 +1772,7 @@ def factorize(self, na_sentinel=-1, sort: bool = False):
uniques = self.copy() # TODO: copy or view?
if sort and self.freq.n < 0:
codes = codes[::-1]
# TODO: overload __getitem__, a slice indexer returns same type as self
# error: Incompatible types in assignment (expression has type
# "Union[DatetimeLikeArrayMixin, Union[Any, Any]]", variable
# has type "TimelikeOps")
uniques = uniques[::-1] # type: ignore[assignment]
uniques = uniques[::-1]
return codes, uniques
# FIXME: shouldn't get here; we are ignoring sort
return super().factorize(na_sentinel=na_sentinel)
Expand Down
19 changes: 9 additions & 10 deletions pandas/core/arrays/datetimes.py
Original file line number Diff line number Diff line change
Expand Up @@ -238,13 +238,13 @@ class DatetimeArray(dtl.TimelikeOps, dtl.DatelikeOps):
_dtype: Union[np.dtype, DatetimeTZDtype]
_freq = None

def __init__(self, values, dtype=DT64NS_DTYPE, freq=None, copy=False):
def __new__(cls, values, dtype=DT64NS_DTYPE, freq=None, copy=False):
if isinstance(values, (ABCSeries, ABCIndex)):
values = values._values

inferred_freq = getattr(values, "_freq", None)

if isinstance(values, type(self)):
if isinstance(values, cls):
# validation
dtz = getattr(dtype, "tz", None)
if dtz and values.tz is None:
Expand Down Expand Up @@ -303,12 +303,11 @@ def __init__(self, values, dtype=DT64NS_DTYPE, freq=None, copy=False):
# be incorrect(ish?) for the array as a whole
dtype = DatetimeTZDtype(tz=timezones.tz_standardize(dtype.tz))

self._ndarray = values
self._dtype = dtype
self._freq = freq
obj = cls._simple_new(values, freq=freq, dtype=dtype)

if inferred_freq is None and freq is not None:
type(self)._validate_frequency(self, freq)
cls._validate_frequency(obj, freq)
return obj

@classmethod
def _simple_new(
Expand All @@ -319,10 +318,8 @@ def _simple_new(
assert values.dtype == "i8"
values = values.view(DT64NS_DTYPE)

result = object.__new__(cls)
result._ndarray = values
result = cls._simpler_new(values, dtype)
result._freq = freq
result._dtype = dtype
return result

@classmethod
Expand Down Expand Up @@ -2005,7 +2002,9 @@ def sequence_to_dt64ns(
if is_datetime64tz_dtype(data_dtype):
# DatetimeArray -> ndarray
tz = _maybe_infer_tz(tz, data.tz)
result = data._data
if isinstance(data, ABCIndex):
data = data._data
result = data._ndarray

elif is_datetime64_dtype(data_dtype):
# tz-naive DatetimeArray or ndarray[datetime64]
Expand Down
3 changes: 3 additions & 0 deletions pandas/core/arrays/period.py
Original file line number Diff line number Diff line change
Expand Up @@ -223,6 +223,9 @@ def _simple_new(
assert isinstance(values, np.ndarray) and values.dtype == "i8", assertion_msg
return cls(values, freq=freq, dtype=dtype)

def _from_backing_data(self: PeriodArray, arr: np.ndarray) -> PeriodArray:
    # Wrap a new backing i8 ndarray in a PeriodArray with our dtype.
    return type(self)._simple_new(arr, dtype=self.dtype)

@classmethod
def _from_sequence(
cls: Type[PeriodArray],
Expand Down
15 changes: 6 additions & 9 deletions pandas/core/arrays/timedeltas.py
Original file line number Diff line number Diff line change
Expand Up @@ -168,14 +168,14 @@ def dtype(self) -> np.dtype:

_freq = None

def __init__(self, values, dtype=TD64NS_DTYPE, freq=lib.no_default, copy=False):
def __new__(cls, values, dtype=TD64NS_DTYPE, freq=lib.no_default, copy=False):
values = extract_array(values)

inferred_freq = getattr(values, "_freq", None)
explicit_none = freq is None
freq = freq if freq is not lib.no_default else None

if isinstance(values, type(self)):
if isinstance(values, cls):
if explicit_none:
# dont inherit from values
pass
Expand Down Expand Up @@ -216,12 +216,11 @@ def __init__(self, values, dtype=TD64NS_DTYPE, freq=lib.no_default, copy=False):
if freq:
freq = to_offset(freq)

self._ndarray = values
self._dtype = dtype
self._freq = freq
obj = cls._simple_new(values, freq=freq, dtype=dtype)

if inferred_freq is None and freq is not None:
type(self)._validate_frequency(self, freq)
cls._validate_frequency(obj, freq)
return obj

@classmethod
def _simple_new(
Expand All @@ -233,10 +232,8 @@ def _simple_new(
assert values.dtype == "i8"
values = values.view(TD64NS_DTYPE)

result = object.__new__(cls)
result._ndarray = values
result = cls._simpler_new(values, TD64NS_DTYPE)
result._freq = to_offset(freq)
result._dtype = TD64NS_DTYPE
return result

@classmethod
Expand Down
5 changes: 4 additions & 1 deletion pandas/core/dtypes/cast.py
Original file line number Diff line number Diff line change
Expand Up @@ -1024,7 +1024,10 @@ def astype_dt64_to_dt64tz(

# FIXME: GH#33401 this doesn't match DatetimeArray.astype, which
# goes through the `not via_utc` path
return values.tz_localize("UTC").tz_convert(dtype.tz)
# error: "ExtensionArray" has no attribute "tz_localize"
values = values.tz_localize("UTC") # type:ignore[attr-defined]
# error: "ExtensionArray" has no attribute "tz_convert"
return values.tz_convert(dtype.tz) # type:ignore[attr-defined]

else:
# DatetimeArray/DatetimeIndex.astype behavior
Expand Down
1 change: 1 addition & 0 deletions pandas/core/indexes/datetimelike.py
Original file line number Diff line number Diff line change
Expand Up @@ -651,6 +651,7 @@ def _get_join_freq(self, other):
def _wrap_joined_index(self, joined: np.ndarray, other):
assert other.dtype == self.dtype, (other.dtype, self.dtype)

joined = joined.view(self._data._ndarray.dtype)
result = super()._wrap_joined_index(joined, other)
result._data._freq = self._get_join_freq(other)
return result
Expand Down
3 changes: 2 additions & 1 deletion pandas/core/indexes/datetimes.py
Original file line number Diff line number Diff line change
Expand Up @@ -93,7 +93,8 @@ def _new_DatetimeIndex(cls, d):
# These are already stored in our DatetimeArray; if they are
# also in the pickle and don't match, we have a problem.
if key in d:
assert d.pop(key) == getattr(dta, key)
val = d.pop(key)
assert val == getattr(dta, key), (key, val, getattr(dta, key))
result = cls._simple_new(dta, **d)
else:
with warnings.catch_warnings():
Expand Down