Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

PERF: NDArrayBacked in cython #40054

Closed
144 changes: 144 additions & 0 deletions pandas/_libs/lib.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -2639,3 +2639,147 @@ def fast_multiget(dict mapping, ndarray keys, default=np.nan):
output[i] = default

return maybe_convert_objects(output)


cdef class NDArrayBacked:
    """
    Implementing these methods in cython improves performance quite a bit.

    import pandas as pd

    from pandas._libs.lib import NDArrayBacked as cls

    dti = pd.date_range("2016-01-01", periods=3)
    dta = dti._data
    arr = dta._ndarray

    obj = cls._simpler_new(arr, arr.dtype)

    # for foo in [arr, dta, obj]: ...

    %timeit foo.copy()
    299 ns ± 30 ns per loop # <-- arr underlying ndarray (for reference)
    530 ns ± 9.24 ns per loop # <-- dta with cython NDArrayBacked
    1.66 µs ± 46.3 ns per loop # <-- dta without cython NDArrayBacked
    328 ns ± 5.29 ns per loop # <-- obj with NDArrayBacked.__cinit__
    371 ns ± 6.97 ns per loop # <-- obj with NDArrayBacked._simpler_new

    %timeit foo.T
    125 ns ± 6.27 ns per loop # <-- arr underlying ndarray (for reference)
    226 ns ± 7.66 ns per loop # <-- dta with cython NDArrayBacked
    911 ns ± 16.6 ns per loop # <-- dta without cython NDArrayBacked
    215 ns ± 4.54 ns per loop # <-- obj with NDArrayBacked._simpler_new

    """
    # TODO: implement take in terms of cnp.PyArray_TakeFrom
    # TODO: implement concat_same_type in terms of cnp.PyArray_Concatenate

    cdef:
        # Backing numpy array and the associated (possibly non-numpy, e.g.
        # DatetimeTZDtype) dtype object.  `readonly` makes them visible to
        # Python-level code but not assignable from Python.
        readonly ndarray _ndarray
        readonly object _dtype

    @classmethod
    def _simpler_new(cls, ndarray values, object dtype):
        # Fast constructor: allocate an instance of `cls` without running
        # __init__ and attach the backing array/dtype directly.
        # Note: not _simple_new; for unclear reasons, calling this _simple_new
        # and trying to call it from the subclass method using super()... fails
        cdef:
            NDArrayBacked obj
        obj = NDArrayBacked.__new__(cls)
        obj._ndarray = values
        obj._dtype = dtype
        return obj

    cpdef NDArrayBacked _from_backing_data(self, ndarray values):
        # Build a new instance of our own (sub)class around `values`,
        # reusing self's dtype.  All the array-manipulation methods below
        # funnel their results through here.
        # TODO: reuse _simpler_new if/when it can be cpdef
        cdef:
            NDArrayBacked obj
        obj = NDArrayBacked.__new__(type(self))
        obj._ndarray = values
        obj._dtype = self._dtype
        return obj

    cpdef __setstate__(self, state):
        # Unpickle support.  `state` is either:
        #  - a dict of attributes, where "_data" is the legacy key for what
        #    is now stored under "_ndarray", or
        #  - a 3-tuple (data, dtype, attr_dict); some pickles have data and
        #    dtype in the opposite order, detected by checking which entry
        #    is the ndarray.
        if isinstance(state, dict):
            if "_data" in state:
                data = state.pop("_data")
            elif "_ndarray" in state:
                data = state.pop("_ndarray")
            else:
                raise ValueError
            self._ndarray = data
            self._dtype = state.pop("_dtype")

            # Restore any remaining subclass-specific attributes (e.g. _freq)
            for key, val in state.items():
                setattr(self, key, val)
        elif isinstance(state, tuple):
            if len(state) != 3:
                raise NotImplementedError(state)

            data, dtype = state[:2]
            if isinstance(dtype, np.ndarray):
                # first two entries were stored as (dtype, data); swap back
                dtype, data = data, dtype
            self._ndarray = data
            self._dtype = dtype

            if isinstance(state[2], dict):
                # third entry holds the remaining instance attributes
                for key, val in state[2].items():
                    setattr(self, key, val)
            else:
                raise NotImplementedError(state)
        else:
            raise NotImplementedError(state)

    def __len__(self):
        return len(self._ndarray)

    @property
    def shape(self):
        # object cast bc _ndarray.shape is npy_intp*
        return (<object>(self._ndarray)).shape

    @property
    def ndim(self) -> int:
        return self._ndarray.ndim

    @property
    def size(self):
        return self._ndarray.size

    @property
    def nbytes(self):
        return self._ndarray.nbytes

    def copy(self):
        # NPY_ANYORDER -> same order as self._ndarray
        res_values = cnp.PyArray_NewCopy(self._ndarray, cnp.NPY_ANYORDER)
        return self._from_backing_data(res_values)

    def delete(self, loc, axis=0):
        # Thin wrapper: np.delete on the backing array, re-wrapped.
        res_values = np.delete(self._ndarray, loc, axis=axis)
        return self._from_backing_data(res_values)

    def swapaxes(self, axis1, axis2):
        res_values = cnp.PyArray_SwapAxes(self._ndarray, axis1, axis2)
        return self._from_backing_data(res_values)

    # TODO: pass NPY_MAXDIMS equiv to axis=None?
    def repeat(self, repeats, axis: int = 0):
        # PyArray_Repeat requires an int axis, so map axis=None -> 0
        if axis is None:
            axis = 0
        res_values = cnp.PyArray_Repeat(self._ndarray, repeats, <int>axis)
        return self._from_backing_data(res_values)

    def reshape(self, *args, **kwargs):
        res_values = self._ndarray.reshape(*args, **kwargs)
        return self._from_backing_data(res_values)

    def ravel(self, order="C"):
        # cnp.PyArray_OrderConverter(PyObject* obj, NPY_ORDER* order)
        # res_values = cnp.PyArray_Ravel(self._ndarray, order)
        res_values = self._ndarray.ravel(order)
        return self._from_backing_data(res_values)

    @property
    def T(self):
        res_values = self._ndarray.T
        return self._from_backing_data(res_values)
13 changes: 13 additions & 0 deletions pandas/compat/pickle_compat.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,9 +13,15 @@
)
import warnings

import numpy as np

from pandas._libs.tslibs import BaseOffset

from pandas import Index
from pandas.core.arrays import (
DatetimeArray,
TimedeltaArray,
)

if TYPE_CHECKING:
from pandas import (
Expand Down Expand Up @@ -207,6 +213,13 @@ def load_newobj(self):
# compat
if issubclass(cls, Index):
obj = object.__new__(cls)
elif issubclass(cls, DatetimeArray) and not args:
arr = np.array([], dtype="M8[ns]")
obj = cls.__new__(cls, arr, arr.dtype)
elif issubclass(cls, TimedeltaArray) and not args:
arr = np.array([], dtype="m8[ns]")
obj = cls.__new__(cls, arr, arr.dtype)

else:
obj = cls.__new__(cls, *args)

Expand Down
29 changes: 14 additions & 15 deletions pandas/core/arrays/datetimelike.py
Original file line number Diff line number Diff line change
Expand Up @@ -158,9 +158,6 @@ class DatetimeLikeArrayMixin(OpsMixin, NDArrayBackedExtensionArray):
_recognized_scalars: Tuple[Type, ...]
_ndarray: np.ndarray

def __init__(self, data, dtype: Optional[Dtype] = None, freq=None, copy=False):
raise AbstractMethodError(self)

@classmethod
def _simple_new(
cls: Type[DatetimeLikeArrayT],
Expand All @@ -170,6 +167,11 @@ def _simple_new(
) -> DatetimeLikeArrayT:
raise AbstractMethodError(cls)

def __init__(cls, values, dtype=np.dtype("M8[ns]"), freq=None, copy=False):
    # No-op constructor: exists only so mypy sees a __init__ signature;
    # actual construction happens in subclass __new__/_simple_new.
    # NOTE(review): first parameter is named `cls` although this is an
    # instance method — presumably should be `self`; confirm intent.
    # This is just for mypy
    # TODO: make default dtype subclass-specific
    pass

@property
def _scalar_type(self) -> Type[DatetimeLikeScalar]:
"""
Expand Down Expand Up @@ -254,6 +256,8 @@ def _check_compatible_with(
# NDArrayBackedExtensionArray compat

def __setstate__(self, state):
# TODO: how is NDArrayBacked.__setstate__ getting called? we
# aren't doing super().__setstate__(state) here
if isinstance(state, dict):
if "_data" in state and "_ndarray" not in state:
# backward compat, changed what is property vs attribute
Expand All @@ -272,12 +276,6 @@ def __setstate__(self, state):
def _data(self) -> np.ndarray:
return self._ndarray

def _from_backing_data(
self: DatetimeLikeArrayT, arr: np.ndarray
) -> DatetimeLikeArrayT:
# Note: we do not retain `freq`
return type(self)._simple_new(arr, dtype=self.dtype)

# ------------------------------------------------------------------

def _box_func(self, x):
Expand Down Expand Up @@ -1680,11 +1678,16 @@ def strftime(self, date_format):
"""


class TimelikeOps(DatetimeLikeArrayMixin):
class TimelikeOps(lib.NDArrayBacked, DatetimeLikeArrayMixin):
"""
Common ops for TimedeltaIndex/DatetimeIndex, but not PeriodIndex.
"""

def copy(self):
    # Delegate to the fast cython NDArrayBacked.copy, then re-attach
    # `_freq`, which the base-class copy does not carry over.
    result = lib.NDArrayBacked.copy(self)
    result._freq = self._freq
    return result

def _round(self, freq, mode, ambiguous, nonexistent):
# round the local times
if is_datetime64tz_dtype(self.dtype):
Expand Down Expand Up @@ -1769,11 +1772,7 @@ def factorize(self, na_sentinel=-1, sort: bool = False):
uniques = self.copy() # TODO: copy or view?
if sort and self.freq.n < 0:
codes = codes[::-1]
# TODO: overload __getitem__, a slice indexer returns same type as self
# error: Incompatible types in assignment (expression has type
# "Union[DatetimeLikeArrayMixin, Union[Any, Any]]", variable
# has type "TimelikeOps")
uniques = uniques[::-1] # type: ignore[assignment]
uniques = uniques[::-1]
return codes, uniques
# FIXME: shouldn't get here; we are ignoring sort
return super().factorize(na_sentinel=na_sentinel)
Expand Down
19 changes: 9 additions & 10 deletions pandas/core/arrays/datetimes.py
Original file line number Diff line number Diff line change
Expand Up @@ -238,13 +238,13 @@ class DatetimeArray(dtl.TimelikeOps, dtl.DatelikeOps):
_dtype: Union[np.dtype, DatetimeTZDtype]
_freq = None

def __init__(self, values, dtype=DT64NS_DTYPE, freq=None, copy=False):
def __new__(cls, values, dtype=DT64NS_DTYPE, freq=None, copy=False):
if isinstance(values, (ABCSeries, ABCIndex)):
values = values._values

inferred_freq = getattr(values, "_freq", None)

if isinstance(values, type(self)):
if isinstance(values, cls):
# validation
dtz = getattr(dtype, "tz", None)
if dtz and values.tz is None:
Expand Down Expand Up @@ -303,12 +303,11 @@ def __init__(self, values, dtype=DT64NS_DTYPE, freq=None, copy=False):
# be incorrect(ish?) for the array as a whole
dtype = DatetimeTZDtype(tz=timezones.tz_standardize(dtype.tz))

self._ndarray = values
self._dtype = dtype
self._freq = freq
obj = cls._simple_new(values, freq=freq, dtype=dtype)

if inferred_freq is None and freq is not None:
type(self)._validate_frequency(self, freq)
cls._validate_frequency(obj, freq)
return obj

@classmethod
def _simple_new(
Expand All @@ -319,10 +318,8 @@ def _simple_new(
assert values.dtype == "i8"
values = values.view(DT64NS_DTYPE)

result = object.__new__(cls)
result._ndarray = values
result = cls._simpler_new(values, dtype)
result._freq = freq
result._dtype = dtype
return result

@classmethod
Expand Down Expand Up @@ -2005,7 +2002,9 @@ def sequence_to_dt64ns(
if is_datetime64tz_dtype(data_dtype):
# DatetimeArray -> ndarray
tz = _maybe_infer_tz(tz, data.tz)
result = data._data
if isinstance(data, ABCIndex):
data = data._data
result = data._ndarray

elif is_datetime64_dtype(data_dtype):
# tz-naive DatetimeArray or ndarray[datetime64]
Expand Down
3 changes: 3 additions & 0 deletions pandas/core/arrays/period.py
Original file line number Diff line number Diff line change
Expand Up @@ -223,6 +223,9 @@ def _simple_new(
assert isinstance(values, np.ndarray) and values.dtype == "i8", assertion_msg
return cls(values, freq=freq, dtype=dtype)

def _from_backing_data(self: PeriodArray, arr: np.ndarray) -> PeriodArray:
    # Wrap a new backing i8 ndarray in a PeriodArray with our dtype.
    return type(self)._simple_new(arr, dtype=self.dtype)

@classmethod
def _from_sequence(
cls: Type[PeriodArray],
Expand Down
15 changes: 6 additions & 9 deletions pandas/core/arrays/timedeltas.py
Original file line number Diff line number Diff line change
Expand Up @@ -168,14 +168,14 @@ def dtype(self) -> np.dtype:

_freq = None

def __init__(self, values, dtype=TD64NS_DTYPE, freq=lib.no_default, copy=False):
def __new__(cls, values, dtype=TD64NS_DTYPE, freq=lib.no_default, copy=False):
values = extract_array(values)

inferred_freq = getattr(values, "_freq", None)
explicit_none = freq is None
freq = freq if freq is not lib.no_default else None

if isinstance(values, type(self)):
if isinstance(values, cls):
if explicit_none:
# dont inherit from values
pass
Expand Down Expand Up @@ -216,12 +216,11 @@ def __init__(self, values, dtype=TD64NS_DTYPE, freq=lib.no_default, copy=False):
if freq:
freq = to_offset(freq)

self._ndarray = values
self._dtype = dtype
self._freq = freq
obj = cls._simple_new(values, freq=freq, dtype=dtype)

if inferred_freq is None and freq is not None:
type(self)._validate_frequency(self, freq)
cls._validate_frequency(obj, freq)
return obj

@classmethod
def _simple_new(
Expand All @@ -233,10 +232,8 @@ def _simple_new(
assert values.dtype == "i8"
values = values.view(TD64NS_DTYPE)

result = object.__new__(cls)
result._ndarray = values
result = cls._simpler_new(values, TD64NS_DTYPE)
result._freq = to_offset(freq)
result._dtype = TD64NS_DTYPE
return result

@classmethod
Expand Down
5 changes: 4 additions & 1 deletion pandas/core/dtypes/cast.py
Original file line number Diff line number Diff line change
Expand Up @@ -1024,7 +1024,10 @@ def astype_dt64_to_dt64tz(

# FIXME: GH#33401 this doesn't match DatetimeArray.astype, which
# goes through the `not via_utc` path
return values.tz_localize("UTC").tz_convert(dtype.tz)
# error: "ExtensionArray" has no attribute "tz_localize"
values = values.tz_localize("UTC") # type:ignore[attr-defined]
# error: "ExtensionArray" has no attribute "tz_convert"
return values.tz_convert(dtype.tz) # type:ignore[attr-defined]

else:
# DatetimeArray/DatetimeIndex.astype behavior
Expand Down
1 change: 1 addition & 0 deletions pandas/core/indexes/datetimelike.py
Original file line number Diff line number Diff line change
Expand Up @@ -651,6 +651,7 @@ def _get_join_freq(self, other):
def _wrap_joined_index(self, joined: np.ndarray, other):
assert other.dtype == self.dtype, (other.dtype, self.dtype)

joined = joined.view(self._data._ndarray.dtype)
result = super()._wrap_joined_index(joined, other)
result._data._freq = self._get_join_freq(other)
return result
Expand Down
3 changes: 2 additions & 1 deletion pandas/core/indexes/datetimes.py
Original file line number Diff line number Diff line change
Expand Up @@ -93,7 +93,8 @@ def _new_DatetimeIndex(cls, d):
# These are already stored in our DatetimeArray; if they are
# also in the pickle and don't match, we have a problem.
if key in d:
assert d.pop(key) == getattr(dta, key)
val = d.pop(key)
assert val == getattr(dta, key), (key, val, getattr(dta, key))
result = cls._simple_new(dta, **d)
else:
with warnings.catch_warnings():
Expand Down