From 13bd4480d9b7b8320ae308ee06750cc3727c64fe Mon Sep 17 00:00:00 2001 From: Brock Date: Tue, 16 Feb 2021 18:23:35 -0800 Subject: [PATCH 01/61] REF/POC: back DTBlock/TDBlock directly by DTA/TDA --- pandas/core/algorithms.py | 6 + pandas/core/arrays/_mixins.py | 12 +- pandas/core/arrays/base.py | 8 +- pandas/core/arrays/datetimes.py | 4 + pandas/core/dtypes/common.py | 25 ++ pandas/core/dtypes/concat.py | 16 +- pandas/core/frame.py | 30 ++- pandas/core/groupby/generic.py | 8 +- pandas/core/groupby/ops.py | 5 +- pandas/core/indexes/extension.py | 4 + pandas/core/internals/blocks.py | 264 +++++++++---------- pandas/core/internals/concat.py | 43 +-- pandas/core/internals/construction.py | 42 ++- pandas/core/internals/managers.py | 32 ++- pandas/core/internals/ops.py | 6 +- pandas/core/missing.py | 6 +- pandas/core/sorting.py | 13 +- pandas/io/pytables.py | 6 +- pandas/tests/arithmetic/test_datetime64.py | 18 +- pandas/tests/frame/methods/test_astype.py | 12 +- pandas/tests/frame/methods/test_set_index.py | 3 +- pandas/tests/frame/test_block_internals.py | 2 +- pandas/tests/groupby/test_apply.py | 1 - pandas/tests/groupby/test_groupby.py | 2 +- pandas/tests/internals/test_internals.py | 14 +- pandas/tests/io/pytables/test_timezones.py | 3 + pandas/tests/series/methods/test_isin.py | 2 +- pandas/tests/series/test_constructors.py | 4 +- 28 files changed, 343 insertions(+), 248 deletions(-) diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index 40533cdd554b3..792b4a91754bc 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -63,6 +63,7 @@ is_period_dtype, is_scalar, is_signed_integer_dtype, + is_strict_ea, is_timedelta64_dtype, is_unsigned_integer_dtype, needs_i8_conversion, @@ -1779,6 +1780,11 @@ def take_nd( if isinstance(arr, ABCExtensionArray): # Check for EA to catch DatetimeArray, TimedeltaArray + if not is_strict_ea(arr): + # i.e. DatetimeArray, TimedeltaArray + return arr.take( + indexer, axis=axis, fill_value=fill_value, allow_fill=allow_fill + ) return arr.take(indexer, fill_value=fill_value, allow_fill=allow_fill) arr = extract_array(arr) diff --git a/pandas/core/arrays/_mixins.py b/pandas/core/arrays/_mixins.py index 825757ddffee4..6c1f8049f6d20 100644 --- a/pandas/core/arrays/_mixins.py +++ b/pandas/core/arrays/_mixins.py @@ -171,6 +171,10 @@ def T(self: NDArrayBackedExtensionArrayT) -> NDArrayBackedExtensionArrayT: new_data = self._ndarray.T return self._from_backing_data(new_data) + def swapaxes(self, axis1, axis2): + res_values = self._ndarray.swapaxes(axis1, axis2) + return self._from_backing_data(res_values) + # ------------------------------------------------------------------------ def equals(self, other) -> bool: @@ -287,7 +291,7 @@ def fillna( if mask.any(): if method is not None: - func = missing.get_fill_func(method) + func = missing.get_fill_func(method, ndim=self.ndim) new_values = func(self._ndarray.copy(), limit=limit, mask=mask) # TODO: PandasArray didn't used to copy, need tests for this new_values = self._from_backing_data(new_values) @@ -378,8 +382,10 @@ def where( res_values = np.where(mask, self._ndarray, value) return self._from_backing_data(res_values) - def delete(self: NDArrayBackedExtensionArrayT, loc) -> NDArrayBackedExtensionArrayT: - res_values = np.delete(self._ndarray, loc) + def delete( + self: NDArrayBackedExtensionArrayT, loc, axis: int = 0 + ) -> NDArrayBackedExtensionArrayT: + res_values = np.delete(self._ndarray, loc, axis=axis) return self._from_backing_data(res_values) # ------------------------------------------------------------------------ diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py index edc8fa14ca142..36ec6b61de9ac 100644 --- a/pandas/core/arrays/base.py +++ b/pandas/core/arrays/base.py @@ -618,7 +618,7 @@ def argsort( mask=np.asarray(self.isna()), ) - def argmin(self, skipna: bool = True) -> int: + def argmin(self, axis: int = 0, skipna: bool = True): """ Return the index of minimum value. @@ -640,9 +640,9 @@ def argmin(self, skipna: bool = True) -> int: validate_bool_kwarg(skipna, "skipna") if not skipna and self.isna().any(): raise NotImplementedError - return nargminmax(self, "argmin") + return nargminmax(self, "argmin", axis=axis) - def argmax(self, skipna: bool = True) -> int: + def argmax(self, axis: int = 0, skipna: bool = True): """ Return the index of maximum value. @@ -664,7 +664,7 @@ def argmax(self, skipna: bool = True) -> int: validate_bool_kwarg(skipna, "skipna") if not skipna and self.isna().any(): raise NotImplementedError - return nargminmax(self, "argmax") + return nargminmax(self, "argmax", axis=axis) def fillna(self, value=None, method=None, limit=None): """ diff --git a/pandas/core/arrays/datetimes.py b/pandas/core/arrays/datetimes.py index 05184ea02e7a2..99e1f3f50b3d1 100644 --- a/pandas/core/arrays/datetimes.py +++ b/pandas/core/arrays/datetimes.py @@ -616,6 +616,10 @@ def astype(self, dtype, copy=True): elif is_datetime64_ns_dtype(dtype): return astype_dt64_to_dt64tz(self, dtype, copy, via_utc=False) + elif self.tz is None and is_datetime64_dtype(dtype) and dtype != self.dtype: + # unit conversion + return self._data.astype(dtype) + elif is_period_dtype(dtype): return self.to_period(freq=dtype.freq) return dtl.DatetimeLikeArrayMixin.astype(self, dtype, copy) diff --git a/pandas/core/dtypes/common.py b/pandas/core/dtypes/common.py index 0966d0b93cc25..48a3d382e6039 100644 --- a/pandas/core/dtypes/common.py +++ b/pandas/core/dtypes/common.py @@ -1495,6 +1495,31 @@ def is_extension_type(arr) -> bool: return False +def is_strict_ea(obj): + """ + ExtensionArray that does not support 2D, or more specifically that does + not use HybridBlock. + """ + from pandas.core.arrays import ( + DatetimeArray, + ExtensionArray, + TimedeltaArray, + ) + + return isinstance(obj, ExtensionArray) and not isinstance( + obj, (DatetimeArray, TimedeltaArray) + ) + + +def is_ea_dtype(dtype) -> bool: + """ + Analogue to is_extension_array_dtype but excluding DatetimeTZDtype. + """ + # Note: if other EA dtypes are ever held in HybridBlock, exclude those + # here too. + return is_extension_array_dtype(dtype) and not is_datetime64tz_dtype(dtype) + + def is_extension_array_dtype(arr_or_dtype) -> bool: """ Check if an object is a pandas extension array type. diff --git a/pandas/core/dtypes/concat.py b/pandas/core/dtypes/concat.py index 42ac786ff315e..1e8f7f06ed24c 100644 --- a/pandas/core/dtypes/concat.py +++ b/pandas/core/dtypes/concat.py @@ -108,6 +108,7 @@ def is_nonempty(x) -> bool: to_concat = non_empties kinds = {obj.dtype.kind for obj in to_concat} + _contains_datetime = any(kind in ["m", "M"] for kind in kinds) all_empty = not len(non_empties) single_dtype = len({x.dtype for x in to_concat}) == 1 @@ -122,11 +123,13 @@ def is_nonempty(x) -> bool: if isinstance(to_concat[0], ExtensionArray): cls = type(to_concat[0]) + if _contains_datetime: + return _concat_datetime(to_concat, axis=axis) return cls._concat_same_type(to_concat) else: - return np.concatenate(to_concat) + return np.concatenate(to_concat, axis=axis) - elif any(kind in ["m", "M"] for kind in kinds): + elif _contains_datetime: return _concat_datetime(to_concat, axis=axis) elif all_empty: @@ -344,14 +347,5 @@ def _concat_datetime(to_concat, axis=0): # in Timestamp/Timedelta return _concatenate_2d([x.astype(object) for x in to_concat], axis=axis) - if axis == 1: - # TODO(EA2D): kludge not necessary with 2D EAs - to_concat = [x.reshape(1, -1) if x.ndim == 1 else x for x in to_concat] - result = type(to_concat[0])._concat_same_type(to_concat, axis=axis) - - if result.ndim == 2 and is_extension_array_dtype(result.dtype): - # TODO(EA2D): kludge not necessary with 2D EAs - assert result.shape[0] == 1 - result = result[0] return result diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 060652c9f94ae..492e308710793 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -110,6 +110,7 @@ is_datetime64_any_dtype, is_dict_like, is_dtype_equal, + is_ea_dtype, is_extension_array_dtype, is_float, is_float_dtype, @@ -121,6 +122,7 @@ is_object_dtype, is_scalar, is_sequence, + is_strict_ea, pandas_dtype, ) from pandas.core.dtypes.missing import ( @@ -752,10 +754,13 @@ def _can_fast_transpose(self) -> bool: """ if isinstance(self._mgr, ArrayManager): return False - if self._mgr.any_extension_types: - # TODO(EA2D) special case would be unnecessary with 2D EAs + blocks = self._mgr.blocks + if len(blocks) != 1: return False - return len(self._mgr.blocks) == 1 + + dtype = blocks[0].dtype + # TODO(EA2D) special case would be unnecessary with 2D EAs + return not is_ea_dtype(dtype) # ---------------------------------------------------------------------- # Rendering Methods @@ -2981,9 +2986,23 @@ def transpose(self, *args, copy: bool = False) -> DataFrame: # construct the args dtypes = list(self.dtypes) - if self._is_homogeneous_type and dtypes and is_extension_array_dtype(dtypes[0]): + if self._mgr.nblocks == 1 and not is_strict_ea(self._mgr.blocks[0].values): + # Note: tests pass without this, but this improves perf quite a bit. + # TODO: something like frame.values but that _may_ give an EA + blk = self._mgr.blocks[0] + new_values = blk.values + if copy: + new_values = new_values.copy() + result = self._constructor( + new_values, index=self.columns, columns=self.index + ) + + elif ( + self._is_homogeneous_type and dtypes and is_extension_array_dtype(dtypes[0]) + ): # We have EAs with the same dtype. We can preserve that dtype in transpose. dtype = dtypes[0] + arr_type = dtype.construct_array_type() values = self.values @@ -2991,6 +3010,7 @@ def transpose(self, *args, copy: bool = False) -> DataFrame: result = self._constructor( dict(zip(self.index, new_values)), index=self.columns ) + # TODO: what if index is non-unique? (not specific to EA2D) else: new_values = self.values.T @@ -9021,6 +9041,8 @@ def func(values: np.ndarray): def blk_func(values, axis=1): if isinstance(values, ExtensionArray): + if not is_strict_ea(values): + return values._reduce(name, axis=1, skipna=skipna, **kwds) return values._reduce(name, skipna=skipna, **kwds) else: return op(values, axis=axis, skipna=skipna, **kwds) diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index c1a277925de2a..1df3e48cedd77 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -65,6 +65,7 @@ is_interval_dtype, is_numeric_dtype, is_scalar, + is_strict_ea, needs_i8_conversion, ) from pandas.core.dtypes.missing import ( @@ -82,10 +83,7 @@ validate_func_kwargs, ) from pandas.core.apply import GroupByApply -from pandas.core.arrays import ( - Categorical, - ExtensionArray, -) +from pandas.core.arrays import Categorical from pandas.core.base import ( DataError, SpecificationError, @@ -1115,7 +1113,7 @@ def py_fallback(bvalues: ArrayLike) -> ArrayLike: obj: FrameOrSeriesUnion # call our grouper again with only this block - if isinstance(bvalues, ExtensionArray): + if is_strict_ea(bvalues): # TODO(EA2D): special case not needed with 2D EAs obj = Series(bvalues) else: diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py index 00cb65fff3803..06667a3ea9c57 100644 --- a/pandas/core/groupby/ops.py +++ b/pandas/core/groupby/ops.py @@ -71,6 +71,7 @@ ) import pandas.core.algorithms as algorithms +from pandas.core.arrays import ExtensionArray from pandas.core.base import SelectionMixin import pandas.core.common as com from pandas.core.frame import DataFrame @@ -208,7 +209,9 @@ def apply(self, f: F, data: FrameOrSeries, axis: int = 0): result_values = None sdata: FrameOrSeries = splitter._get_sorted_data() - if sdata.ndim == 2 and np.any(sdata.dtypes.apply(is_extension_array_dtype)): + if sdata.ndim == 2 and any( + isinstance(x, ExtensionArray) for x in sdata._iter_column_arrays() + ): # calling splitter.fast_apply will raise TypeError via apply_frame_axis0 # if we pass EA instead of ndarray # TODO: can we have a workaround for EAs backed by ndarray? diff --git a/pandas/core/indexes/extension.py b/pandas/core/indexes/extension.py index 4150ec745bd2e..0097959245686 100644 --- a/pandas/core/indexes/extension.py +++ b/pandas/core/indexes/extension.py @@ -309,6 +309,10 @@ def astype(self, dtype, copy=True): return self return self.copy() + if isinstance(dtype, np.dtype) and dtype.kind == "M" and dtype != "M8[ns]": + # For now Datetime supports this by unwrapping ndarray, but DTI doesn't + raise TypeError(f"Cannot cast {type(self._data).__name__} to dtype") + new_values = self._data.astype(dtype, copy=copy) # pass copy=False because any copying will be done in the diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index ae019c2f853a1..2185180e07abc 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -26,7 +26,6 @@ writers, ) from pandas._libs.internals import BlockPlacement -from pandas._libs.tslibs import conversion from pandas._typing import ( ArrayLike, Dtype, @@ -51,10 +50,12 @@ is_datetime64_dtype, is_datetime64tz_dtype, is_dtype_equal, + is_ea_dtype, is_extension_array_dtype, is_list_like, is_object_dtype, is_sparse, + is_strict_ea, pandas_dtype, ) from pandas.core.dtypes.dtypes import ( @@ -112,7 +113,6 @@ Float64Index, Index, ) - from pandas.core.arrays._mixins import NDArrayBackedExtensionArray class Block(PandasObject): @@ -276,7 +276,7 @@ def get_values(self, dtype: Optional[DtypeObj] = None) -> np.ndarray: """ if is_object_dtype(dtype): return self.values.astype(object) - return self.values + return np.asarray(self.values) def get_block_values_for_json(self) -> np.ndarray: """ @@ -308,7 +308,7 @@ def make_block(self, values, placement=None) -> Block: if placement is None: placement = self.mgr_locs if self.is_extension: - values = _block_shape(values, ndim=self.ndim) + values = block_shape(values, ndim=self.ndim) return make_block(values, placement=placement, ndim=self.ndim) @@ -434,7 +434,10 @@ def _split_op_result(self, result) -> List[Block]: # if we get a 2D ExtensionArray, we need to split it into 1D pieces nbs = [] for i, loc in enumerate(self.mgr_locs): - vals = result[i] + if not is_strict_ea(result): + vals = result[i : i + 1] + else: + vals = result[i] block = self.make_block(values=vals, placement=[loc]) nbs.append(block) return nbs @@ -533,7 +536,7 @@ def make_a_block(nv, ref_loc): else: # Put back the dimension that was taken from it and make # a block out of the result. - nv = _block_shape(nv, ndim=self.ndim) + nv = block_shape(nv, ndim=self.ndim) block = self.make_block(values=nv, placement=ref_loc) return block @@ -674,6 +677,15 @@ def astype(self, dtype, copy: bool = False, errors: str = "raise"): def _astype(self, dtype: DtypeObj, copy: bool) -> ArrayLike: values = self.values + if ( + values.dtype.kind in ["m", "M"] + and dtype.kind in ["i", "u"] + and dtype.itemsize != 8 + ): + # TODO(2.0) remove special case once deprecation on DTA/TDA is enforced + msg = rf"cannot astype a datetimelike from [{values.dtype}] to [{dtype}]" + raise TypeError(msg) + if is_datetime64tz_dtype(dtype) and is_datetime64_dtype(values.dtype): return astype_dt64_to_dt64tz(values, dtype, copy, via_utc=True) @@ -988,11 +1000,6 @@ def setitem(self, indexer, value): # current dtype cannot store value, coerce to common dtype return self.coerce_to_target_dtype(value).setitem(indexer, value) - if self.dtype.kind in ["m", "M"]: - arr = self.array_values().T - arr[indexer] = value - return self - # value must be storable at this moment if is_extension_array_dtype(getattr(value, "dtype", None)): # We need to be careful not to allow through strings that @@ -1524,9 +1531,8 @@ def __init__(self, values, placement, ndim: int): ndim = 2 super().__init__(values, placement, ndim=ndim) - if self.ndim == 2 and len(self.mgr_locs) != 1: - # TODO(EA2D): check unnecessary with 2D EAs - raise AssertionError("block.size != values.size") + if self.ndim == 2 and len(self.mgr_locs) > 1: + raise ValueError("need to split... for now") @property def shape(self) -> Shape: @@ -1640,8 +1646,8 @@ def setitem(self, indexer, value): be a compatible shape. """ if not self._can_hold_element(value): - # This is only relevant for DatetimeTZBlock, which has a - # non-trivial `_can_hold_element`. + # This is only relevant for DatetimeTZBlock, ObjectValuesExtensionBlock, + # which has a non-trivial `_can_hold_element`. # https://github.com/pandas-dev/pandas/issues/24020 # Need a dedicated setitem until GH#24020 (type promotion in setitem # for extension arrays) is designed and implemented. @@ -1873,9 +1879,13 @@ def quantile(self, qs, interpolation="linear", axis: int = 0) -> Block: result = quantile_with_mask(values, mask, fill_value, qs, interpolation, axis) if not is_sparse(self.dtype): - # shape[0] should be 1 as long as EAs are 1D - assert result.shape == (1, len(qs)), result.shape - result = type(self.values)._from_factorized(result[0], self.values) + if self.values.ndim == 1: + # shape[0] should be 1 as long as EAs are 1D + assert result.shape == (1, len(qs)), result.shape + result = type(self.values)._from_factorized(result[0], self.values) + else: + # NDArrayBackedExtensionBlock, e.g. DatetimeTZBlock + result = type(self.values)._from_factorized(result, self.values) return make_block(result, placement=self.mgr_locs, ndim=2) @@ -1965,28 +1975,11 @@ def to_native_types( class NDArrayBackedExtensionBlock(HybridMixin, Block): """ - Block backed by an NDArrayBackedExtensionArray + Block backed by an NDarrayBackedExtensionArray, supporting 2D values. """ - def internal_values(self): - # Override to return DatetimeArray and TimedeltaArray - return self.array_values() - - def get_values(self, dtype: Optional[DtypeObj] = None) -> np.ndarray: - """ - return object dtype as boxed values, such as Timestamps/Timedelta - """ - values = self.array_values() - if is_object_dtype(dtype): - # DTA/TDA constructor and astype can handle 2D - values = values.astype(object) - # TODO(EA2D): reshape not needed with 2D EAs - return np.asarray(values).reshape(self.shape) - - def iget(self, key): - # GH#31649 we need to wrap scalars in Timestamp/Timedelta - # TODO(EA2D): this can be removed if we ever have 2D EA - return self.array_values().reshape(self.shape)[key] + def array_values(self): + return self.values def putmask(self, mask, new) -> List[Block]: mask = extract_bool_array(mask) @@ -1994,28 +1987,47 @@ def putmask(self, mask, new) -> List[Block]: if not self._can_hold_element(new): return self.astype(object).putmask(mask, new) - # TODO(EA2D): reshape unnecessary with 2D EAs - arr = self.array_values().reshape(self.shape) - arr = cast("NDArrayBackedExtensionArray", arr) + arr = self.values arr.T.putmask(mask, new) return [self] def where(self, other, cond, errors="raise", axis: int = 0) -> List[Block]: - # TODO(EA2D): reshape unnecessary with 2D EAs - arr = self.array_values().reshape(self.shape) - - cond = extract_bool_array(cond) + arr = self.values try: res_values = arr.T.where(cond, other).T except (ValueError, TypeError): - return super().where(other, cond, errors=errors, axis=axis) + return Block.where(self, other, cond, errors=errors, axis=axis) - # TODO(EA2D): reshape not needed with 2D EAs - res_values = res_values.reshape(self.values.shape) nb = self.make_block_same_class(res_values) return [nb] + @property + def is_view(self) -> bool: + """ return a boolean if I am possibly a view """ + # check the ndarray values of the DatetimeIndex values + return self.values._data.base is not None + + def setitem(self, indexer, value): + if not self._can_hold_element(value): + # TODO: general case needs casting logic. + return self.astype(object).setitem(indexer, value) + + transpose = self.ndim == 2 + + values = self.values + if transpose: + values = values.T + values[indexer] = value + return self + + def delete(self, loc) -> None: + """ + Delete given loc(-s) from block in-place. + """ + self.values = self.values.delete(loc, 0) + self.mgr_locs = self.mgr_locs.delete(loc) + def diff(self, n: int, axis: int = 0) -> List[Block]: """ 1st discrete difference. @@ -2036,22 +2048,37 @@ def diff(self, n: int, axis: int = 0) -> List[Block]: The arguments here are mimicking shift so they are called correctly by apply. """ - # TODO(EA2D): reshape not necessary with 2D EAs - values = self.array_values().reshape(self.shape) + values = self.values new_values = values - values.shift(n, axis=axis) return [self.make_block(new_values)] def shift(self, periods: int, axis: int = 0, fill_value: Any = None) -> List[Block]: - # TODO(EA2D) this is unnecessary if these blocks are backed by 2D EAs - values = self.array_values().reshape(self.shape) + values = self.values new_values = values.shift(periods, fill_value=fill_value, axis=axis) return [self.make_block_same_class(new_values)] + def fillna( + self, value, limit=None, inplace: bool = False, downcast=None + ) -> List[Block]: + + if not self._can_hold_element(value) and self.dtype.kind != "m": + # We support filling a DatetimeTZ with a `value` whose timezone + # is different by coercing to object. + # TODO: don't special-case td64 + return self.astype(object).fillna(value, limit, inplace, downcast) + + values = self.values + values = values if inplace else values.copy() + new_values = values.fillna(value=value, limit=limit) + return [self.make_block_same_class(values=new_values)] + class DatetimeLikeBlockMixin(NDArrayBackedExtensionBlock): """Mixin class for DatetimeBlock, DatetimeTZBlock, and TimedeltaBlock.""" + is_numeric = False + _can_hold_na = True _dtype: np.dtype _holder: Type[Union[DatetimeArray, TimedeltaArray]] @@ -2072,30 +2099,30 @@ def _maybe_coerce_values(cls, values): Overridden by DatetimeTZBlock. """ - if values.dtype != cls._dtype: + if cls.fill_value is not NaT and values.dtype != cls._dtype: # non-nano we will convert to nano if values.dtype.kind != cls._dtype.kind: # caller is responsible for ensuring td64/dt64 dtype raise TypeError(values.dtype) # pragma: no cover - values = cls._holder._from_sequence(values)._data + values = cls._holder._from_sequence(values) - if isinstance(values, cls._holder): - values = values._data + if not isinstance(values, cls._holder): + values = cls._holder(values) - assert isinstance(values, np.ndarray), type(values) return values - def array_values(self): - return self._holder._simple_new(self.values) - def to_native_types(self, na_rep="NaT", **kwargs): """ convert to our native types format """ - arr = self.array_values() - + arr = self.values result = arr._format_native_types(na_rep=na_rep, **kwargs) return self.make_block(result) + def external_values(self): + # NB: for dt64tz this is different from np.asarray(self.values), + # since that return an object-dtype ndarray of Timestamps. + return self.values._data + class DatetimeBlock(DatetimeLikeBlockMixin): __slots__ = () @@ -2104,104 +2131,42 @@ class DatetimeBlock(DatetimeLikeBlockMixin): _dtype = fill_value.dtype _holder = DatetimeArray - @property - def _can_hold_na(self): - return True - - def set_inplace(self, locs, values): - """ - See Block.set.__doc__ - """ - values = conversion.ensure_datetime64ns(values, copy=False) - - self.values[locs] = values - -class DatetimeTZBlock(ExtensionBlock, DatetimeBlock): +class DatetimeTZBlock(DatetimeBlock, ExtensionBlock): """ implement a datetime64 block with a tz attribute """ values: DatetimeArray __slots__ = () is_extension = True + _validate_ndim = True + _can_consolidate = True _holder = DatetimeArray - - internal_values = Block.internal_values - _can_hold_element = DatetimeBlock._can_hold_element - to_native_types = DatetimeBlock.to_native_types - diff = DatetimeBlock.diff fill_value = NaT - where = DatetimeBlock.where - putmask = DatetimeLikeBlockMixin.putmask - - array_values = ExtensionBlock.array_values - @classmethod - def _maybe_coerce_values(cls, values): - """ - Input validation for values passed to __init__. Ensure that - we have datetime64TZ, coercing if necessary. + get_values = Block.get_values + set_inplace = Block.set_inplace + iget = Block.iget + _slice = Block._slice + shape = Block.shape + __init__ = Block.__init__ + take_nd = Block.take_nd + _unstack = Block._unstack - Parameters - ---------- - values : array-like - Must be convertible to datetime64 - - Returns - ------- - values : DatetimeArray - """ - if not isinstance(values, cls._holder): - values = cls._holder(values) - - if values.tz is None: - raise ValueError("cannot create a DatetimeTZBlock without a tz") - - return values - - @property - def is_view(self) -> bool: - """ return a boolean if I am possibly a view """ - # check the ndarray values of the DatetimeIndex values - return self.values._data.base is not None - - def external_values(self): - # NB: this is different from np.asarray(self.values), since that - # return an object-dtype ndarray of Timestamps. - # Avoid FutureWarning in .astype in casting from dt64tz to dt64 - return self.values._data - - def fillna( - self, value, limit=None, inplace: bool = False, downcast=None - ) -> List[Block]: - # We support filling a DatetimeTZ with a `value` whose timezone - # is different by coercing to object. - if self._can_hold_element(value): - return super().fillna(value, limit, inplace, downcast) - - # different timezones, or a non-tz - return self.astype(object).fillna( - value, limit=limit, inplace=inplace, downcast=downcast - ) + # TODO: we still share these with ExtensionBlock (and not DatetimeBlock) + # ['interpolate', 'quantile'] + # [x for x in dir(DatetimeTZBlock) if hasattr(ExtensionBlock, x) + # and getattr(DatetimeTZBlock, x) is getattr(ExtensionBlock, x) + # and getattr(ExtensionBlock, x) is not getattr(Block, x)] class TimeDeltaBlock(DatetimeLikeBlockMixin): __slots__ = () - _can_hold_na = True - is_numeric = False _holder = TimedeltaArray fill_value = np.timedelta64("NaT", "ns") _dtype = fill_value.dtype - def fillna( - self, value, limit=None, inplace: bool = False, downcast=None - ) -> List[Block]: - values = self.array_values() - values = values if inplace else values.copy() - new_values = values.fillna(value=value, limit=limit) - return [self.make_block_same_class(values=new_values)] - class ObjectBlock(Block): __slots__ = () @@ -2277,7 +2242,8 @@ def f(mask, val, idx): if isinstance(values, np.ndarray): # TODO(EA2D): allow EA once reshape is supported values = values.reshape(shape) - + if isinstance(values, ABCIndex): + values = values._data return values if self.ndim == 2: @@ -2408,6 +2374,10 @@ def make_block( # TODO: This is no longer hit internally; does it need to be retained # for e.g. pyarrow? values = DatetimeArray._simple_new(values, dtype=dtype) + placement = BlockPlacement(placement) + if len(placement) == 1 and values.ndim == 1 and (ndim is None or ndim == 2): + # TODO: unnecessary after https://github.com/apache/arrow/pull/8957 + values = values.reshape(1, -1) return klass(values, ndim=ndim, placement=placement) @@ -2431,11 +2401,11 @@ def extend_blocks(result, blocks=None) -> List[Block]: return blocks -def _block_shape(values: ArrayLike, ndim: int = 1) -> ArrayLike: +def block_shape(values: ArrayLike, ndim: int = 1) -> ArrayLike: """ guarantee the shape of the values to be at least 1 d """ if values.ndim < ndim: shape = values.shape - if not is_extension_array_dtype(values.dtype): + if not is_strict_ea(values): # TODO(EA2D): https://github.com/pandas-dev/pandas/issues/23023 # block.shape is incorrect for "2D" ExtensionArrays # We can't, and don't need to, reshape. @@ -2459,8 +2429,10 @@ def safe_reshape(arr: ArrayLike, new_shape: Shape) -> ArrayLike: ------- np.ndarray or ExtensionArray """ - if not is_extension_array_dtype(arr.dtype): + if not is_ea_dtype(arr.dtype): # Note: this will include TimedeltaArray and tz-naive DatetimeArray # TODO(EA2D): special case will be unnecessary with 2D EAs - arr = np.asarray(arr).reshape(new_shape) + arr = extract_array(arr, extract_numpy=True).reshape(new_shape) + if type(arr).__name__ == "PandasArray": + arr = arr.to_numpy() return arr diff --git a/pandas/core/internals/concat.py b/pandas/core/internals/concat.py index aa3545b83dfe3..a28436a9a82b8 100644 --- a/pandas/core/internals/concat.py +++ b/pandas/core/internals/concat.py @@ -27,8 +27,10 @@ from pandas.core.dtypes.common import ( is_datetime64tz_dtype, is_dtype_equal, + is_ea_dtype, is_extension_array_dtype, is_sparse, + is_strict_ea, ) from pandas.core.dtypes.concat import concat_compat from pandas.core.dtypes.missing import ( @@ -108,9 +110,14 @@ def concatenate_block_managers( values = np.concatenate(vals, axis=blk.ndim - 1) else: # TODO(EA2D): special-casing not needed with 2D EAs - values = concat_compat(vals) - if not isinstance(values, ExtensionArray): - values = values.reshape(1, len(values)) + if all(x.ndim == blk.ndim for x in vals): + # i.e. DTA/TDA + values = concat_compat(vals, axis=blk.ndim - 1) + else: + values = concat_compat(vals) + + if not is_strict_ea(values) and blk.ndim == 2 and values.ndim == 1: + values = values.reshape(1, -1) if blk.values.dtype == values.dtype: # Fast-path @@ -118,11 +125,8 @@ def concatenate_block_managers( else: b = make_block(values, placement=placement, ndim=blk.ndim) else: - b = make_block( - _concatenate_join_units(join_units, concat_axis, copy=copy), - placement=placement, - ndim=len(axes), - ) + new_values = _concatenate_join_units(join_units, concat_axis, copy=copy) + b = make_block(new_values, placement=placement, ndim=len(axes)) blocks.append(b) return BlockManager(blocks, axes) @@ -310,12 +314,11 @@ def get_reindexed_values(self, empty_dtype: DtypeObj, upcasted_na) -> ArrayLike: fill_value = None if is_datetime64tz_dtype(empty_dtype): - # TODO(EA2D): special case unneeded with 2D EAs - i8values = np.full(self.shape[1], fill_value.value) + i8values = np.full(self.shape, fill_value.value) return DatetimeArray(i8values, dtype=empty_dtype) elif is_extension_array_dtype(blk_dtype): pass - elif is_extension_array_dtype(empty_dtype): + elif is_ea_dtype(empty_dtype): cls = empty_dtype.construct_array_type() missing_arr = cls._from_sequence([], dtype=empty_dtype) ncols, nrows = self.shape @@ -378,6 +381,7 @@ def _concatenate_join_units( ju.get_reindexed_values(empty_dtype=empty_dtype, upcasted_na=upcasted_na) for ju in join_units ] + ndim = max(x.ndim for x in to_concat) if len(to_concat) == 1: # Only one block, nothing to concatenate. @@ -390,17 +394,24 @@ def _concatenate_join_units( concat_values = concat_values.copy() else: concat_values = concat_values.copy() + elif any(isinstance(t, ExtensionArray) for t in to_concat): + # TODO(EA2D): special case not needed if all EAs used HybridBlocks + # NB: we are still assuming here that Hybrid blocks have shape (1, N) # concatting with at least one EA means we are concatting a single column # the non-EA values are 2D arrays with shape (1, n) - to_concat = [t if isinstance(t, ExtensionArray) else t[0, :] for t in to_concat] + to_concat = [t if is_strict_ea(t) else t[0, :] for t in to_concat] concat_values = concat_compat(to_concat, axis=0, ea_compat_axis=True) - if not is_extension_array_dtype(concat_values.dtype): + # TODO: what if we have dt64tz blocks with more than 1 column? + + if concat_values.ndim < ndim and not is_strict_ea(concat_values): # if the result of concat is not an EA but an ndarray, reshape to # 2D to put it a non-EA Block - # special case DatetimeArray/TimedeltaArray, which *is* an EA, but - # is put in a consolidated 2D block - concat_values = np.atleast_2d(concat_values) + # special case DatetimeArray, which *is* an EA, but is put in a + # consolidated 2D block + # TODO(EA2D): we could just get this right within concat_compat + concat_values = concat_values.reshape(1, -1) + else: concat_values = concat_compat(to_concat, axis=concat_axis) diff --git a/pandas/core/internals/construction.py b/pandas/core/internals/construction.py index 2cfe613b7072b..7a41fd277293d 100644 --- a/pandas/core/internals/construction.py +++ b/pandas/core/internals/construction.py @@ -40,6 +40,7 @@ from pandas.core.dtypes.common import ( is_datetime64tz_dtype, is_dtype_equal, + is_ea_dtype, is_extension_array_dtype, is_integer_dtype, is_list_like, @@ -58,7 +59,10 @@ algorithms, common as com, ) -from pandas.core.arrays import Categorical +from pandas.core.arrays import ( + Categorical, + ExtensionArray, +) from pandas.core.construction import ( extract_array, sanitize_array, @@ -70,6 +74,7 @@ get_objs_combined_axis, union_indexes, ) +from pandas.core.internals.blocks import block_shape from pandas.core.internals.managers import ( create_block_manager_from_arrays, create_block_manager_from_blocks, @@ -210,7 +215,7 @@ def init_ndarray(values, index, columns, dtype: Optional[DtypeObj], copy: bool): if not len(values) and columns is not None and len(columns): values = np.empty((0, 1), dtype=object) - if is_extension_array_dtype(values) or is_extension_array_dtype(dtype): + if is_ea_dtype(values) or is_extension_array_dtype(dtype): # GH#19157 if isinstance(values, np.ndarray) and values.ndim > 1: @@ -225,9 +230,18 @@ def init_ndarray(values, index, columns, dtype: Optional[DtypeObj], copy: bool): return arrays_to_mgr(values, columns, index, columns, dtype=dtype) - # by definition an array here - # the dtypes will be coerced to a single dtype - values = _prep_ndarray(values, copy=copy) + if is_datetime64tz_dtype(values): + # TODO: combine into _prep_ndarray? + values = extract_array(values, extract_numpy=True) + if copy: + values = values.copy() + if values.ndim == 1: + values = values.reshape(-1, 1) + + else: + # by definition an array here + # the dtypes will be coerced to a single dtype + values = _prep_ndarray(values, copy=copy) if dtype is not None and not is_dtype_equal(values.dtype, dtype): try: @@ -254,10 +268,12 @@ def init_ndarray(values, index, columns, dtype: Optional[DtypeObj], copy: bool): if values.ndim == 2 and values.shape[0] != 1: # transpose and separate blocks + # TODO: do this in one go dvals_list = [maybe_infer_to_datetimelike(row) for row in values] + dvals_list = [extract_array(x, extract_numpy=True) for x in dvals_list] + # TODO: unpack DatetimeIndex directly in maybe_infer_to_datetimelike for n in range(len(dvals_list)): - if isinstance(dvals_list[n], np.ndarray): - dvals_list[n] = dvals_list[n].reshape(1, -1) + dvals_list[n] = dvals_list[n].reshape(1, -1) from pandas.core.internals.blocks import make_block @@ -270,6 +286,10 @@ def init_ndarray(values, index, columns, dtype: Optional[DtypeObj], copy: bool): else: datelike_vals = maybe_infer_to_datetimelike(values) block_values = [datelike_vals] + block_values = [extract_array(x, extract_numpy=True) for x in block_values] + if values.ndim == 2: + block_values = [block_shape(x, 2) for x in block_values] + else: block_values = [values] @@ -288,7 +308,6 @@ def init_dict(data: Dict, index, columns, dtype: Optional[DtypeObj] = None): arrays = Series(data, index=columns, dtype=object) data_names = arrays.index - missing = arrays.isna() if index is None: # GH10856 @@ -356,7 +375,12 @@ def treat_as_nested(data) -> bool: """ Check if we should use nested_data_to_arrays. """ - return len(data) > 0 and is_list_like(data[0]) and getattr(data[0], "ndim", 1) == 1 + return ( + len(data) > 0 + and is_list_like(data[0]) + and getattr(data[0], "ndim", 1) == 1 + and not (isinstance(data, ExtensionArray) and data.ndim == 2) + ) # --------------------------------------------------------------------- diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py index b3f0466f236b6..51e6a7ecb7e0b 100644 --- a/pandas/core/internals/managers.py +++ b/pandas/core/internals/managers.py @@ -38,7 +38,9 @@ ) from pandas.core.dtypes.common import ( DT64NS_DTYPE, + is_datetime64tz_dtype, is_dtype_equal, + is_ea_dtype, is_extension_array_dtype, is_list_like, ) @@ -69,6 +71,7 @@ DatetimeTZBlock, ExtensionBlock, ObjectValuesExtensionBlock, + block_shape, extend_blocks, get_block_type, make_block, @@ -310,6 +313,16 @@ def unpickle_block(values, mgr_locs, ndim: int): state = state[3]["0.14.1"] self.axes = [ensure_index(ax) for ax in state["axes"]] ndim = len(self.axes) + + for blk in state["blocks"]: + vals = blk["values"] + if is_datetime64tz_dtype(vals.dtype): + # older versions will hold in DatetimeIndex instead of DTA + vals = extract_array(vals, extract_numpy=True) + if vals.ndim == 1 and ndim == 2: + vals = vals.reshape(1, -1) + blk["values"] = vals + self.blocks = tuple( unpickle_block(b["values"], b["mgr_locs"], ndim=ndim) for b in state["blocks"] @@ -1029,7 +1042,7 @@ def iset(self, loc: Union[int, slice, np.ndarray], value): if self._blklocs is None and self.ndim > 1: self._rebuild_blknos_and_blklocs() - value_is_extension_type = is_extension_array_dtype(value) + value_is_extension_type = is_ea_dtype(value) # categorical/sparse/datetimetz if value_is_extension_type: @@ -1165,7 +1178,7 @@ def insert(self, loc: int, item: Hashable, value, allow_duplicates: bool = False if value.ndim == 2: value = value.T - elif value.ndim == self.ndim - 1 and not is_extension_array_dtype(value.dtype): + elif value.ndim == self.ndim - 1: # TODO(EA2D): special case not needed with 2D EAs value = safe_reshape(value, (1,) + value.shape) @@ -1715,7 +1728,12 @@ def _form_blocks(arrays, names: Index, axes: List[Index]) -> List[Block]: if len(items_dict["DatetimeTZBlock"]): dttz_blocks = [ - make_block(array, klass=DatetimeTZBlock, placement=i, ndim=2) + make_block( + block_shape(extract_array(array), 2), + klass=DatetimeTZBlock, + placement=i, + ndim=2, + ) for i, array in items_dict["DatetimeTZBlock"] ] blocks.extend(dttz_blocks) @@ -1864,7 +1882,13 @@ def _merge_blocks( # TODO: optimization potential in case all mgrs contain slices and # combination of those slices is a slice, too. new_mgr_locs = np.concatenate([b.mgr_locs.as_array for b in blocks]) - new_values = np.vstack([b.values for b in blocks]) + + if isinstance(blocks[0].dtype, np.dtype): + new_values = np.vstack([b.values for b in blocks]) + else: + new_values = blocks[0].values._concat_same_type( + [b.values for b in blocks], axis=0 + ) argsort = np.argsort(new_mgr_locs) new_values = new_values[argsort] diff --git a/pandas/core/internals/ops.py b/pandas/core/internals/ops.py index 70d4f3b91c245..3f018d5e802f7 100644 --- a/pandas/core/internals/ops.py +++ b/pandas/core/internals/ops.py @@ -8,8 +8,6 @@ Tuple, ) -import numpy as np - from pandas._typing import ArrayLike if TYPE_CHECKING: @@ -32,7 +30,7 @@ def _iter_block_pairs( locs = blk.mgr_locs blk_vals = blk.values - left_ea = not isinstance(blk_vals, np.ndarray) + left_ea = blk_vals.ndim == 1 rblks = right._slice_take_blocks_ax0(locs.indexer, only_slice=True) @@ -43,7 +41,7 @@ def _iter_block_pairs( # assert rblks[0].shape[0] == 1, rblks[0].shape for k, rblk in enumerate(rblks): - right_ea = not isinstance(rblk.values, np.ndarray) + right_ea = rblk.values.ndim == 1 lvals, rvals = _get_same_shape_values(blk, rblk, left_ea, right_ea) info = BlockPairInfo(lvals, rvals, locs, left_ea, right_ea, rblk) diff --git a/pandas/core/missing.py b/pandas/core/missing.py index 9ae5f7d1b7497..98ab01c46ce23 100644 --- a/pandas/core/missing.py +++ b/pandas/core/missing.py @@ -729,9 +729,11 @@ def _backfill_2d(values, limit=None, mask=None): _fill_methods = {"pad": _pad_1d, "backfill": _backfill_1d} -def get_fill_func(method): +def get_fill_func(method, ndim: int = 1): method = clean_fill_method(method) - return _fill_methods[method] + if ndim == 1: + return _fill_methods[method] + return {"pad": _pad_2d, "backfill": _backfill_2d}[method] def clean_reindex_fill_method(method): diff --git a/pandas/core/sorting.py b/pandas/core/sorting.py index 67863036929b3..aa7e9b400f475 100644 --- a/pandas/core/sorting.py +++ b/pandas/core/sorting.py @@ -396,7 +396,7 @@ def nargsort( return indexer -def nargminmax(values, method: str): +def nargminmax(values, method: str, axis: int = 0): """ Implementation of np.argmin/argmax but for ExtensionArray and which handles missing values. @@ -405,6 +405,7 @@ def nargminmax(values, method: str): ---------- values : ExtensionArray method : {"argmax", "argmin"} + axis : int, default 0 Returns ------- @@ -416,11 +417,17 @@ def nargminmax(values, method: str): mask = np.asarray(isna(values)) values = values._values_for_argsort() - idx = np.arange(len(values)) + idx = np.arange(values.shape[axis]) + if values.ndim > 1 and values.size > 0: + # values.size check is a kludge bc JSONArray can come back with size-0 2D + if mask.any(): + assert False # just checking + return func(values, axis=axis) + non_nans = values[~mask] non_nan_idx = idx[~mask] - return non_nan_idx[func(non_nans)] + return non_nan_idx[func(non_nans, axis=axis)] def _ensure_key_mapped_multiindex( diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py index 88b444acfea62..42646f0e7c8a1 100644 --- a/pandas/io/pytables.py +++ b/pandas/io/pytables.py @@ -4554,9 +4554,13 @@ def read( # if we have a DataIndexableCol, its shape will only be 1 dim if values.ndim == 1 and isinstance(values, np.ndarray): + # TODO(EA2D): special case not needed with 2D EAs values = values.reshape((1, values.shape[0])) + if isinstance(values, DatetimeIndex) and len(cols_) != 1: + # FIXME: kludge + values = values._data.reshape(len(cols_), -1, order="F") - if isinstance(values, np.ndarray): + if isinstance(values, (np.ndarray, DatetimeArray)): df = DataFrame(values.T, columns=cols_, index=index_) elif isinstance(values, Index): df = DataFrame(values, columns=cols_, index=index_) diff --git a/pandas/tests/arithmetic/test_datetime64.py b/pandas/tests/arithmetic/test_datetime64.py index 423d20c46bdb1..92ef97654dd8d 100644 --- a/pandas/tests/arithmetic/test_datetime64.py +++ b/pandas/tests/arithmetic/test_datetime64.py @@ -1020,10 +1020,7 @@ def test_dt64arr_sub_dt64object_array(self, box_with_array, tz_naive_fixture): obj = tm.box_expected(dti, box_with_array) expected = tm.box_expected(expected, box_with_array) - warn = None - if box_with_array is not pd.DataFrame or tz_naive_fixture is None: - warn = PerformanceWarning - with tm.assert_produces_warning(warn): + with tm.assert_produces_warning(PerformanceWarning): result = obj - obj.astype(object) tm.assert_equal(result, expected) @@ -1525,10 +1522,7 @@ def test_dt64arr_add_sub_offset_array( if box_other: other = tm.box_expected(other, box_with_array) - warn = PerformanceWarning - if box_with_array is pd.DataFrame and tz is not None: - warn = None - with tm.assert_produces_warning(warn): + with tm.assert_produces_warning(PerformanceWarning): res = op(dtarr, other) tm.assert_equal(res, expected) @@ -2469,18 +2463,14 @@ def test_dti_addsub_object_arraylike( expected = DatetimeIndex(["2017-01-31", "2017-01-06"], tz=tz_naive_fixture) expected = tm.box_expected(expected, xbox) - warn = PerformanceWarning - if box_with_array is pd.DataFrame and tz is not None: - warn = None - - with tm.assert_produces_warning(warn): + with tm.assert_produces_warning(PerformanceWarning): result = dtarr + other tm.assert_equal(result, expected) expected = DatetimeIndex(["2016-12-31", "2016-12-29"], tz=tz_naive_fixture) expected = tm.box_expected(expected, xbox) - with tm.assert_produces_warning(warn): + with tm.assert_produces_warning(PerformanceWarning): result = dtarr - other tm.assert_equal(result, expected) diff --git a/pandas/tests/frame/methods/test_astype.py b/pandas/tests/frame/methods/test_astype.py index 46f5a20f38941..35e958ff3a2b1 100644 --- a/pandas/tests/frame/methods/test_astype.py +++ b/pandas/tests/frame/methods/test_astype.py @@ -432,19 +432,11 @@ def test_astype_to_incorrect_datetimelike(self, unit): other = f"m8[{unit}]" df = DataFrame(np.array([[1, 2, 3]], dtype=dtype)) - msg = ( - fr"cannot astype a datetimelike from \[datetime64\[ns\]\] to " - fr"\[timedelta64\[{unit}\]\]" - fr"|(Cannot cast DatetimeArray to dtype timedelta64\[{unit}\])" - ) + msg = fr"Cannot cast DatetimeArray to dtype timedelta64\[{unit}\]" with pytest.raises(TypeError, match=msg): df.astype(other) - msg = ( - fr"cannot astype a timedelta from \[timedelta64\[ns\]\] to " - fr"\[datetime64\[{unit}\]\]" - fr"|(Cannot cast TimedeltaArray to dtype datetime64\[{unit}\])" - ) + msg = fr"Cannot cast TimedeltaArray to dtype datetime64\[{unit}\]" df = DataFrame(np.array([[1, 2, 3]], dtype=other)) with pytest.raises(TypeError, match=msg): df.astype(dtype) diff --git a/pandas/tests/frame/methods/test_set_index.py b/pandas/tests/frame/methods/test_set_index.py index 430abd9700a23..08ffb9dea36e0 100644 --- a/pandas/tests/frame/methods/test_set_index.py +++ b/pandas/tests/frame/methods/test_set_index.py @@ -103,7 +103,8 @@ def test_set_index_dst(self): # single level res = df.set_index("index") exp = DataFrame( - data={"a": [0, 1, 2], "b": [3, 4, 5]}, index=Index(di, name="index") + data={"a": [0, 1, 2], "b": [3, 4, 5]}, + index=Index(di, name="index")._with_freq(None), ) tm.assert_frame_equal(res, exp) diff --git a/pandas/tests/frame/test_block_internals.py b/pandas/tests/frame/test_block_internals.py index 193f1617fbb55..c28a2cd9374bc 100644 --- a/pandas/tests/frame/test_block_internals.py +++ b/pandas/tests/frame/test_block_internals.py @@ -39,7 +39,7 @@ def test_setitem_invalidates_datetime_index_freq(self): ts = dti[1] df = DataFrame({"B": dti}) - assert df["B"]._values.freq == "D" + assert df["B"]._values.freq is None df.iloc[1, 0] = pd.NaT assert df["B"]._values.freq is None diff --git a/pandas/tests/groupby/test_apply.py b/pandas/tests/groupby/test_apply.py index 4bbdba9fedbff..f9bc03d4745da 100644 --- a/pandas/tests/groupby/test_apply.py +++ b/pandas/tests/groupby/test_apply.py @@ -1003,7 +1003,6 @@ def test_apply_function_with_indexing_return_column(): tm.assert_frame_equal(result, expected) -@pytest.mark.xfail(reason="GH-34998") def test_apply_with_timezones_aware(): # GH: 27212 diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index 4dce7e8553be4..e0fe1b5bef288 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -838,7 +838,7 @@ def test_omit_nuisance(df): # won't work with axis = 1 grouped = df.groupby({"A": 0, "C": 0, "D": 1, "E": 1}, axis=1) - msg = "reduction operation 'sum' not allowed for this dtype" + msg = "'DatetimeArray' does not implement reduction 'sum'" with pytest.raises(TypeError, match=msg): grouped.agg(lambda x: x.sum(0, numeric_only=False)) diff --git a/pandas/tests/internals/test_internals.py b/pandas/tests/internals/test_internals.py index a24c711df7b55..e7b79d920af0e 100644 --- a/pandas/tests/internals/test_internals.py +++ b/pandas/tests/internals/test_internals.py @@ -124,6 +124,8 @@ def create_block(typestr, placement, item_shape=None, num_offset=0): tz = m.groups()[0] assert num_items == 1, "must have only 1 num items for a tz-aware" values = DatetimeIndex(np.arange(N) * 1e9, tz=tz) + if len(shape) == 2: + values = values._data.reshape(1, -1) elif typestr in ("timedelta", "td", "m8[ns]"): values = (mat * 1).astype("m8[ns]") elif typestr in ("category",): @@ -425,7 +427,11 @@ def test_copy(self, mgr): # copy assertion we either have a None for a base or in case of # some blocks it is an array (e.g. datetimetz), but was copied tm.assert_equal(cp_blk.values, blk.values) - if not isinstance(cp_blk.values, np.ndarray): + if isinstance(cp_blk.values, DatetimeArray): + assert ( + cp_blk.values._data.base is None and blk.values._data.base is None + ) or (cp_blk.values._data.base is not blk.values._data.base) + elif not isinstance(cp_blk.values, np.ndarray): assert cp_blk.values._data.base is not blk.values._data.base else: assert cp_blk.values.base is None and blk.values.base is None @@ -490,7 +496,7 @@ def test_astype(self, t): mgr = create_mgr("a,b: object; c: bool; d: datetime; e: f4; f: f2; g: f8") t = np.dtype(t) - with tm.assert_produces_warning(warn): + with tm.assert_produces_warning(warn, check_stacklevel=False): tmgr = mgr.astype(t, errors="ignore") assert tmgr.iget(2).dtype.type == t assert tmgr.iget(4).dtype.type == t @@ -563,10 +569,10 @@ def _compare(old_mgr, new_mgr): assert new_mgr.iget(8).dtype == np.float16 def test_invalid_ea_block(self): - with pytest.raises(AssertionError, match="block.size != values.size"): + with pytest.raises(ValueError, match="need to split"): create_mgr("a: category; b: category") - with pytest.raises(AssertionError, match="block.size != values.size"): + with pytest.raises(ValueError, match="need to split"): create_mgr("a: category2; b: category2") def test_interleave(self): diff --git a/pandas/tests/io/pytables/test_timezones.py b/pandas/tests/io/pytables/test_timezones.py index 489352380b186..66aa1afb65a00 100644 --- a/pandas/tests/io/pytables/test_timezones.py +++ b/pandas/tests/io/pytables/test_timezones.py @@ -97,6 +97,7 @@ def test_append_with_timezones_dateutil(setup_path): r"existing_value \[dateutil/.*US/Eastern\] " r"conflicts with new value \[dateutil/.*EET\]" ) + msg = r"cannot match existing table structure for \[A,B\] on appending data" with pytest.raises(ValueError, match=msg): store.append("df_tz", df) @@ -195,6 +196,7 @@ def test_append_with_timezones_pytz(setup_path): r"invalid info for \[values_block_1\] for \[tz\], " r"existing_value \[US/Eastern\] conflicts with new value \[EET\]" ) + msg = r"cannot match existing table structure for \[A,B\] on appending data" with pytest.raises(ValueError, match=msg): store.append("df_tz", df) @@ -214,6 +216,7 @@ def test_append_with_timezones_pytz(setup_path): index=range(5), ) + # TODO: can we share this with other similar tests? msg = ( r"invalid info for \[B\] for \[tz\], " r"existing_value \[EET\] conflicts with new value \[CET\]" diff --git a/pandas/tests/series/methods/test_isin.py b/pandas/tests/series/methods/test_isin.py index 320179c0a8b0a..37af46cdb1f51 100644 --- a/pandas/tests/series/methods/test_isin.py +++ b/pandas/tests/series/methods/test_isin.py @@ -59,7 +59,7 @@ def test_isin_with_i8(self): tm.assert_series_equal(result, expected) # fails on dtype conversion in the first place - result = s.isin(s[0:2].values.astype("datetime64[D]")) + result = s.isin(np.asarray(s[0:2]).astype("datetime64[D]")) tm.assert_series_equal(result, expected) result = s.isin([s[1]]) diff --git a/pandas/tests/series/test_constructors.py b/pandas/tests/series/test_constructors.py index 1daeee8645f2e..bdf76d7c5b0a4 100644 --- a/pandas/tests/series/test_constructors.py +++ b/pandas/tests/series/test_constructors.py @@ -1313,12 +1313,12 @@ def test_constructor_dtype_timedelta64(self): # td.astype('m8[%s]' % t) # valid astype - with tm.assert_produces_warning(FutureWarning): + with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): # astype(int64) deprecated td.astype("int64") # invalid casting - msg = r"cannot astype a timedelta from \[timedelta64\[ns\]\] to \[int32\]" + msg = r"cannot astype a datetimelike from \[timedelta64\[ns\]\] to \[int32\]" with pytest.raises(TypeError, match=msg): td.astype("int32") From 35cdd16c511a991031a4ad6dd46b0560f08ba5fe Mon Sep 17 00:00:00 2001 From: Brock Date: Wed, 17 Feb 2021 10:04:59 -0800 Subject: [PATCH 02/61] REF: _values_compat --- pandas/core/frame.py | 27 +++++++++++++++++++++++---- 1 file changed, 23 insertions(+), 4 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 492e308710793..b2957137cd6c5 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -762,6 +762,25 @@ def _can_fast_transpose(self) -> bool: # TODO(EA2D) special case would be unnecessary with 2D EAs return not is_ea_dtype(dtype) + @property + def _values_compat(self) -> ArrayLike: + """ + Analogue to ._values that may return a 2D ExtensionArray. + """ + mgr = self._mgr + if isinstance(mgr, ArrayManager): + return self._values + + blocks = mgr.blocks + if len(blocks) != 1: + return self._values + + arr = blocks[0].values + if arr.ndim == 1: + # non-2D ExtensionArray + return self._values + return arr.T + # ---------------------------------------------------------------------- # Rendering Methods @@ -2986,13 +3005,13 @@ def transpose(self, *args, copy: bool = False) -> DataFrame: # construct the args dtypes = list(self.dtypes) - if self._mgr.nblocks == 1 and not is_strict_ea(self._mgr.blocks[0].values): + + if self._can_fast_transpose: # Note: tests pass without this, but this improves perf quite a bit. - # TODO: something like frame.values but that _may_ give an EA - blk = self._mgr.blocks[0] - new_values = blk.values + new_values = self._values_compat.T if copy: new_values = new_values.copy() + result = self._constructor( new_values, index=self.columns, columns=self.index ) From 213afb39f29e55b0fd5a7874fb78ac8fde08571a Mon Sep 17 00:00:00 2001 From: Brock Date: Wed, 17 Feb 2021 16:16:07 -0800 Subject: [PATCH 03/61] test, asv --- asv_bench/benchmarks/reshape.py | 36 ++++++++++++++++++++ pandas/core/reshape/reshape.py | 5 +-- pandas/tests/frame/methods/test_transpose.py | 13 +++++++ 3 files changed, 52 insertions(+), 2 deletions(-) diff --git a/asv_bench/benchmarks/reshape.py b/asv_bench/benchmarks/reshape.py index faee9bc57464b..232aabfb87c58 100644 --- a/asv_bench/benchmarks/reshape.py +++ b/asv_bench/benchmarks/reshape.py @@ -53,6 +53,42 @@ def time_unstack(self): self.df.unstack(1) +class ReshapeExtensionDtype: + + params = ["datetime64[ns, US/Pacific]", "Period[s]"] + param_names = ["dtype"] + + def setup(self, dtype): + lev = pd.Index(list("ABCDEFGHIJ")) + ri = pd.Index(range(1000)) + mi = MultiIndex.from_product([lev, ri], names=["foo", "bar"]) + + index = date_range("2016-01-01", periods=10000, freq="s", tz="US/Pacific") + if dtype == "Period[s]": + index = index.tz_localize(None).to_period("s") + + ser = pd.Series(index, index=mi) + df = ser.unstack("bar") + # roundtrips -> df.stack().equals(ser) + + self.ser = ser + self.df = df + + def time_stack(self, dtype): + self.df.stack() + + def time_unstack_fast(self, dtype): + # last level -> doesnt have to make copies + self.ser.unstack("bar") + + def time_unstack_slow(self, dtype): + # first level -> must make copies + self.ser.unstack("foo") + + def time_transpose(self, dtype): + self.df.T + + class Unstack: params = ["int", "category"] diff --git a/pandas/core/reshape/reshape.py b/pandas/core/reshape/reshape.py index 4ccdbc089a058..5a76710f3b302 100644 --- a/pandas/core/reshape/reshape.py +++ b/pandas/core/reshape/reshape.py @@ -19,6 +19,7 @@ from pandas.core.dtypes.common import ( ensure_platform_int, is_bool_dtype, + is_ea_dtype, is_extension_array_dtype, is_integer, is_integer_dtype, @@ -435,13 +436,13 @@ def unstack(obj, level, fill_value=None): f"index must be a MultiIndex to unstack, {type(obj.index)} was passed" ) else: - if is_extension_array_dtype(obj.dtype): + if is_ea_dtype(obj.dtype): return _unstack_extension_series(obj, level, fill_value) unstacker = _Unstacker( obj.index, level=level, constructor=obj._constructor_expanddim ) return unstacker.get_result( - obj.values, value_columns=None, fill_value=fill_value + obj._values, value_columns=None, fill_value=fill_value ) diff --git a/pandas/tests/frame/methods/test_transpose.py b/pandas/tests/frame/methods/test_transpose.py index d6ab3268c8c37..62537d37a8c11 100644 --- a/pandas/tests/frame/methods/test_transpose.py +++ b/pandas/tests/frame/methods/test_transpose.py @@ -90,3 +90,16 @@ def test_transpose_get_view(self, float_frame): dft.values[:, 5:10] = 5 assert (float_frame.values[5:10] == 5).all() + + @td.skip_array_manager_invalid_test + def test_transpose_get_view_dt64tzget_view(self): + dti = date_range("2016-01-01", periods=6, tz="US/Pacific") + arr = dti._data.reshape(3, 2) + df = DataFrame(arr) + assert df._mgr.nblocks == 1 + + result = df.T + assert result._mgr.nblocks == 1 + + rtrip = result._mgr.blocks[0].values + assert np.shares_memory(arr._data, rtrip._data) From 52fa07aa46633dd283da812fd441f58edee2c568 Mon Sep 17 00:00:00 2001 From: Brock Date: Wed, 17 Feb 2021 18:01:47 -0800 Subject: [PATCH 04/61] TST: port Dim2CompatTests --- pandas/core/indexes/extension.py | 4 +- pandas/core/ops/mask_ops.py | 2 +- pandas/tests/extension/base/__init__.py | 1 + pandas/tests/extension/base/dim2.py | 217 ++++++++++++++++++++++++ pandas/tests/extension/test_datetime.py | 4 + pandas/tests/extension/test_numpy.py | 4 + pandas/tests/extension/test_period.py | 4 + 7 files changed, 233 insertions(+), 3 deletions(-) create mode 100644 pandas/tests/extension/base/dim2.py diff --git a/pandas/core/indexes/extension.py b/pandas/core/indexes/extension.py index 4150ec745bd2e..301fe51d0f37e 100644 --- a/pandas/core/indexes/extension.py +++ b/pandas/core/indexes/extension.py @@ -240,8 +240,8 @@ def __getitem__(self, key): return type(self)(result, name=self.name) # Unpack to ndarray for MPL compat - # error: "ExtensionArray" has no attribute "_data" - result = result._data # type: ignore[attr-defined] + # error: "ExtensionArray" has no attribute "_ndarray" + result = result._ndarray # type: ignore[attr-defined] # Includes cases where we get a 2D ndarray back for MPL compat deprecate_ndim_indexing(result) diff --git a/pandas/core/ops/mask_ops.py b/pandas/core/ops/mask_ops.py index a9edb2d138246..501bc0159e641 100644 --- a/pandas/core/ops/mask_ops.py +++ b/pandas/core/ops/mask_ops.py @@ -179,6 +179,6 @@ def kleene_and( return result, mask -def raise_for_nan(value, method): +def raise_for_nan(value, method: str): if lib.is_float(value) and np.isnan(value): raise ValueError(f"Cannot perform logical '{method}' with floating NaN") diff --git a/pandas/tests/extension/base/__init__.py b/pandas/tests/extension/base/__init__.py index 9cf3bdab40d0b..910b43a2cd148 100644 --- a/pandas/tests/extension/base/__init__.py +++ b/pandas/tests/extension/base/__init__.py @@ -43,6 +43,7 @@ class TestMyDtype(BaseDtypeTests): """ from pandas.tests.extension.base.casting import BaseCastingTests # noqa from pandas.tests.extension.base.constructors import BaseConstructorsTests # noqa +from pandas.tests.extension.base.dim2 import Dim2CompatTests # noqa from pandas.tests.extension.base.dtype import BaseDtypeTests # noqa from pandas.tests.extension.base.getitem import BaseGetitemTests # noqa from pandas.tests.extension.base.groupby import BaseGroupbyTests # noqa diff --git a/pandas/tests/extension/base/dim2.py b/pandas/tests/extension/base/dim2.py new file mode 100644 index 0000000000000..c6455ce15533a --- /dev/null +++ b/pandas/tests/extension/base/dim2.py @@ -0,0 +1,217 @@ +""" +Tests for 2D compatibility. +""" +import numpy as np +import pytest + +from pandas.compat import np_version_under1p17 + +import pandas as pd +from pandas.core.arrays import ( + FloatingArray, + IntegerArray, +) +from pandas.tests.extension.base.base import BaseExtensionTests + + +def maybe_xfail_masked_reductions(arr, request): + if ( + isinstance(arr, (FloatingArray, IntegerArray)) + and np_version_under1p17 + and arr.ndim == 2 + ): + mark = pytest.mark.xfail(reason="masked_reductions does not implement") + request.node.add_marker(mark) + + +class Dim2CompatTests(BaseExtensionTests): + def test_take_2d(self, data): + arr2d = data.reshape(-1, 1) + + result = arr2d.take([0, 0, -1], axis=0) + + expected = data.take([0, 0, -1]).reshape(-1, 1) + self.assert_extension_array_equal(result, expected) + + def test_repr_2d(self, data): + # this could fail in a corner case where an element contained the name + res = repr(data.reshape(1, -1)) + assert res.count(f"<{type(data).__name__}") == 1 + + res = repr(data.reshape(-1, 1)) + assert res.count(f"<{type(data).__name__}") == 1 + + def test_reshape(self, data): + arr2d = data.reshape(-1, 1) + assert arr2d.shape == (data.size, 1) + assert len(arr2d) == len(data) + + arr2d = data.reshape((-1, 1)) + assert arr2d.shape == (data.size, 1) + assert len(arr2d) == len(data) + + with pytest.raises(ValueError): + data.reshape((data.size, 2)) + with pytest.raises(ValueError): + data.reshape(data.size, 2) + + def test_getitem_2d(self, data): + arr2d = data.reshape(1, -1) + + result = arr2d[0] + self.assert_extension_array_equal(result, data) + + with pytest.raises(IndexError): + arr2d[1] + + with pytest.raises(IndexError): + arr2d[-2] + + result = arr2d[:] + self.assert_extension_array_equal(result, arr2d) + + result = arr2d[:, :] + self.assert_extension_array_equal(result, arr2d) + + result = arr2d[:, 0] + expected = data[[0]] + self.assert_extension_array_equal(result, expected) + + # dimension-expanding getitem on 1D + result = data[:, np.newaxis] + self.assert_extension_array_equal(result, arr2d.T) + + def test_iter_2d(self, data): + arr2d = data.reshape(1, -1) + + objs = list(iter(arr2d)) + assert len(objs) == arr2d.shape[0] + + for obj in objs: + assert isinstance(obj, type(data)) + assert obj.dtype == data.dtype + assert obj.ndim == 1 + assert len(obj) == arr2d.shape[1] + + def test_concat_2d(self, data): + left = data.reshape(-1, 1) + right = left.copy() + + # axis=0 + result = left._concat_same_type([left, right], axis=0) + expected = data._concat_same_type([data, data]).reshape(-1, 1) + self.assert_extension_array_equal(result, expected) + + # axis=1 + result = left._concat_same_type([left, right], axis=1) + expected = data.repeat(2).reshape(-1, 2) + self.assert_extension_array_equal(result, expected) + + # axis > 1 -> invalid + with pytest.raises(ValueError): + left._concat_same_type([left, right], axis=2) + + @pytest.mark.parametrize("method", ["mean", "median", "var", "std", "sum", "prod"]) + def test_reductions_2d_axis_none(self, data, method, request): + if not hasattr(data, method): + pytest.skip("test is not applicable for this type/dtype") + + arr2d = data.reshape(1, -1) + maybe_xfail_masked_reductions(arr2d, request) + + err_expected = None + err_result = None + try: + expected = getattr(data, method)() + except Exception as err: + # if the 1D reduction is invalid, the 2D reduction should be as well + err_expected = err + try: + result = getattr(arr2d, method)(axis=None) + except Exception as err2: + err_result = err2 + + else: + result = getattr(arr2d, method)(axis=None) + + if err_result is not None or err_expected is not None: + assert type(err_result) == type(err_expected) + return + + assert result == expected # TODO: or matching NA + + @pytest.mark.parametrize("method", ["mean", "median", "var", "std", "sum", "prod"]) + def test_reductions_2d_axis0(self, data, method, request): + if not hasattr(data, method): + pytest.skip("test is not applicable for this type/dtype") + + arr2d = data.reshape(1, -1) + maybe_xfail_masked_reductions(arr2d, request) + + kwargs = {} + if method == "std": + # pass ddof=0 so we get all-zero std instead of all-NA std + kwargs["ddof"] = 0 + + try: + result = getattr(arr2d, method)(axis=0, **kwargs) + except Exception as err: + try: + getattr(data, method)() + except Exception as err2: + assert type(err) == type(err2) + return + else: + raise AssertionError("Both reductions should raise or neither") + + if method in ["mean", "median", "sum", "prod"]: + # std and var are not dtype-preserving + expected = data + if method in ["sum", "prod"] and data.dtype.kind in ["i", "u"]: + # FIXME: kludge + if data.dtype.kind == "i": + dtype = pd.Int64Dtype + else: + dtype = pd.UInt64Dtype + + expected = data.astype(dtype) + if type(expected) != type(data): + mark = pytest.mark.xfail( + reason="IntegerArray.astype is broken GH#38983" + ) + request.node.add_marker(mark) + assert type(expected) == type(data), type(expected) + assert dtype == expected.dtype + + self.assert_extension_array_equal(result, expected) + elif method == "std": + self.assert_extension_array_equal(result, data - data) + # punt on method == "var" + + @pytest.mark.parametrize("method", ["mean", "median", "var", "std", "sum", "prod"]) + def test_reductions_2d_axis1(self, data, method, request): + if not hasattr(data, method): + pytest.skip("test is not applicable for this type/dtype") + + arr2d = data.reshape(1, -1) + maybe_xfail_masked_reductions(arr2d, request) + + try: + result = getattr(arr2d, method)(axis=1) + except Exception as err: + try: + getattr(data, method)() + except Exception as err2: + assert type(err) == type(err2) + return + else: + raise AssertionError("Both reductions should raise or neither") + + # not necesarrily type/dtype-preserving, so weaker assertions + assert result.shape == (1,) + expected_scalar = getattr(data, method)() + if pd.isna(result[0]): + # TODO: require matching NA + assert pd.isna(expected_scalar), expected_scalar + else: + assert result[0] == expected_scalar diff --git a/pandas/tests/extension/test_datetime.py b/pandas/tests/extension/test_datetime.py index 6c5963402b3d7..33589027c0d0f 100644 --- a/pandas/tests/extension/test_datetime.py +++ b/pandas/tests/extension/test_datetime.py @@ -235,3 +235,7 @@ class TestGroupby(BaseDatetimeTests, base.BaseGroupbyTests): class TestPrinting(BaseDatetimeTests, base.BasePrintingTests): pass + + +class Test2DCompat(BaseDatetimeTests, base.Dim2CompatTests): + pass diff --git a/pandas/tests/extension/test_numpy.py b/pandas/tests/extension/test_numpy.py index 17f29e02a2883..ef6a6e6098a19 100644 --- a/pandas/tests/extension/test_numpy.py +++ b/pandas/tests/extension/test_numpy.py @@ -415,3 +415,7 @@ def test_setitem_loc_iloc_slice(self, data): @skip_nested class TestParsing(BaseNumPyTests, base.BaseParsingTests): pass + + +class Test2DCompat(BaseNumPyTests, base.Dim2CompatTests): + pass diff --git a/pandas/tests/extension/test_period.py b/pandas/tests/extension/test_period.py index bbb991259ac29..4c845055b56c4 100644 --- a/pandas/tests/extension/test_period.py +++ b/pandas/tests/extension/test_period.py @@ -184,3 +184,7 @@ class TestParsing(BasePeriodTests, base.BaseParsingTests): @pytest.mark.parametrize("engine", ["c", "python"]) def test_EA_types(self, engine, data): super().test_EA_types(engine, data) + + +class Test2DCompat(BasePeriodTests, base.Dim2CompatTests): + pass From 44e371f04063e7b6cd6abba7a063cf4b01262f35 Mon Sep 17 00:00:00 2001 From: Brock Date: Thu, 18 Feb 2021 20:14:39 -0800 Subject: [PATCH 05/61] REF: consolidate paths for astype --- pandas/core/arrays/datetimelike.py | 4 +++- pandas/core/arrays/datetimes.py | 4 ++++ pandas/core/indexes/extension.py | 4 ++++ pandas/core/internals/blocks.py | 11 +++++++++++ pandas/tests/dtypes/test_common.py | 2 +- pandas/tests/frame/methods/test_astype.py | 12 ++---------- .../tests/indexes/datetimes/methods/test_astype.py | 6 +++--- pandas/tests/indexes/period/methods/test_astype.py | 6 +++--- .../tests/indexes/timedeltas/methods/test_astype.py | 6 +++--- pandas/tests/series/test_constructors.py | 2 +- 10 files changed, 35 insertions(+), 22 deletions(-) diff --git a/pandas/core/arrays/datetimelike.py b/pandas/core/arrays/datetimelike.py index 5dd55ff0f1fa2..8e1d7e607fb8a 100644 --- a/pandas/core/arrays/datetimelike.py +++ b/pandas/core/arrays/datetimelike.py @@ -60,6 +60,7 @@ Substitution, cache_readonly, ) +from pandas.util._exceptions import find_stack_level from pandas.core.dtypes.common import ( is_categorical_dtype, @@ -397,12 +398,13 @@ def astype(self, dtype, copy=True): elif is_integer_dtype(dtype): # we deliberately ignore int32 vs. int64 here. # See https://github.com/pandas-dev/pandas/issues/24381 for more. + level = find_stack_level() warnings.warn( f"casting {self.dtype} values to int64 with .astype(...) is " "deprecated and will raise in a future version. " "Use .view(...) instead.", FutureWarning, - stacklevel=3, + stacklevel=level, ) values = self.asi8 diff --git a/pandas/core/arrays/datetimes.py b/pandas/core/arrays/datetimes.py index 05184ea02e7a2..3982a7deca2bb 100644 --- a/pandas/core/arrays/datetimes.py +++ b/pandas/core/arrays/datetimes.py @@ -616,6 +616,10 @@ def astype(self, dtype, copy=True): elif is_datetime64_ns_dtype(dtype): return astype_dt64_to_dt64tz(self, dtype, copy, via_utc=False) + elif self.tz is None and is_datetime64_dtype(dtype) and dtype != self.dtype: + # unit conversion e.g. datetime64[s] + return self._data.astype(dtype) + elif is_period_dtype(dtype): return self.to_period(freq=dtype.freq) return dtl.DatetimeLikeArrayMixin.astype(self, dtype, copy) diff --git a/pandas/core/indexes/extension.py b/pandas/core/indexes/extension.py index 4150ec745bd2e..0097959245686 100644 --- a/pandas/core/indexes/extension.py +++ b/pandas/core/indexes/extension.py @@ -309,6 +309,10 @@ def astype(self, dtype, copy=True): return self return self.copy() + if isinstance(dtype, np.dtype) and dtype.kind == "M" and dtype != "M8[ns]": + # For now Datetime supports this by unwrapping ndarray, but DTI doesn't + raise TypeError(f"Cannot cast {type(self._data).__name__} to dtype") + new_values = self._data.astype(dtype, copy=copy) # pass copy=False because any copying will be done in the diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index 2eb5be01c932c..dec0e62d6d17f 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -673,6 +673,17 @@ def astype(self, dtype, copy: bool = False, errors: str = "raise"): def _astype(self, dtype: DtypeObj, copy: bool) -> ArrayLike: values = self.values + if values.dtype.kind in ["m", "M"]: + values = self.array_values() + + if ( + values.dtype.kind in ["m", "M"] + and dtype.kind in ["i", "u"] + and dtype.itemsize != 8 + ): + # TODO(2.0) remove special case once deprecation on DTA/TDA is enforced + msg = rf"cannot astype a datetimelike from [{values.dtype}] to [{dtype}]" + raise TypeError(msg) if is_datetime64tz_dtype(dtype) and is_datetime64_dtype(values.dtype): return astype_dt64_to_dt64tz(values, dtype, copy, via_utc=True) diff --git a/pandas/tests/dtypes/test_common.py b/pandas/tests/dtypes/test_common.py index a5522e503c7f4..2b689364c5002 100644 --- a/pandas/tests/dtypes/test_common.py +++ b/pandas/tests/dtypes/test_common.py @@ -724,7 +724,7 @@ def test_astype_nansafe(val, typ): msg = "Cannot convert NaT values to integer" with pytest.raises(ValueError, match=msg): - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): + with tm.assert_produces_warning(FutureWarning): # datetimelike astype(int64) deprecated astype_nansafe(arr, dtype=typ) diff --git a/pandas/tests/frame/methods/test_astype.py b/pandas/tests/frame/methods/test_astype.py index 46f5a20f38941..35e958ff3a2b1 100644 --- a/pandas/tests/frame/methods/test_astype.py +++ b/pandas/tests/frame/methods/test_astype.py @@ -432,19 +432,11 @@ def test_astype_to_incorrect_datetimelike(self, unit): other = f"m8[{unit}]" df = DataFrame(np.array([[1, 2, 3]], dtype=dtype)) - msg = ( - fr"cannot astype a datetimelike from \[datetime64\[ns\]\] to " - fr"\[timedelta64\[{unit}\]\]" - fr"|(Cannot cast DatetimeArray to dtype timedelta64\[{unit}\])" - ) + msg = fr"Cannot cast DatetimeArray to dtype timedelta64\[{unit}\]" with pytest.raises(TypeError, match=msg): df.astype(other) - msg = ( - fr"cannot astype a timedelta from \[timedelta64\[ns\]\] to " - fr"\[datetime64\[{unit}\]\]" - fr"|(Cannot cast TimedeltaArray to dtype datetime64\[{unit}\])" - ) + msg = fr"Cannot cast TimedeltaArray to dtype datetime64\[{unit}\]" df = DataFrame(np.array([[1, 2, 3]], dtype=other)) with pytest.raises(TypeError, match=msg): df.astype(dtype) diff --git a/pandas/tests/indexes/datetimes/methods/test_astype.py b/pandas/tests/indexes/datetimes/methods/test_astype.py index bed7cb9b54eba..8eb0e086ec3f7 100644 --- a/pandas/tests/indexes/datetimes/methods/test_astype.py +++ b/pandas/tests/indexes/datetimes/methods/test_astype.py @@ -29,7 +29,7 @@ def test_astype(self): ) tm.assert_index_equal(result, expected) - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): + with tm.assert_produces_warning(FutureWarning): result = idx.astype(int) expected = Int64Index( [1463356800000000000] + [-9223372036854775808] * 3, @@ -39,7 +39,7 @@ def test_astype(self): tm.assert_index_equal(result, expected) rng = date_range("1/1/2000", periods=10, name="idx") - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): + with tm.assert_produces_warning(FutureWarning): result = rng.astype("i8") tm.assert_index_equal(result, Index(rng.asi8, name="idx")) tm.assert_numpy_array_equal(result.values, rng.asi8) @@ -50,7 +50,7 @@ def test_astype_uint(self): np.array([946684800000000000, 946771200000000000], dtype="uint64"), name="idx", ) - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): + with tm.assert_produces_warning(FutureWarning): tm.assert_index_equal(arr.astype("uint64"), expected) tm.assert_index_equal(arr.astype("uint32"), expected) diff --git a/pandas/tests/indexes/period/methods/test_astype.py b/pandas/tests/indexes/period/methods/test_astype.py index 943b2605363c7..73439d349bebd 100644 --- a/pandas/tests/indexes/period/methods/test_astype.py +++ b/pandas/tests/indexes/period/methods/test_astype.py @@ -37,7 +37,7 @@ def test_astype_conversion(self): ) tm.assert_index_equal(result, expected) - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): + with tm.assert_produces_warning(FutureWarning): result = idx.astype(np.int64) expected = Int64Index( [16937] + [-9223372036854775808] * 3, dtype=np.int64, name="idx" @@ -49,7 +49,7 @@ def test_astype_conversion(self): tm.assert_index_equal(result, expected) idx = period_range("1990", "2009", freq="A", name="idx") - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): + with tm.assert_produces_warning(FutureWarning): result = idx.astype("i8") tm.assert_index_equal(result, Index(idx.asi8, name="idx")) tm.assert_numpy_array_equal(result.values, idx.asi8) @@ -57,7 +57,7 @@ def test_astype_conversion(self): def test_astype_uint(self): arr = period_range("2000", periods=2, name="idx") expected = UInt64Index(np.array([10957, 10958], dtype="uint64"), name="idx") - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): + with tm.assert_produces_warning(FutureWarning): tm.assert_index_equal(arr.astype("uint64"), expected) tm.assert_index_equal(arr.astype("uint32"), expected) diff --git a/pandas/tests/indexes/timedeltas/methods/test_astype.py b/pandas/tests/indexes/timedeltas/methods/test_astype.py index a849ffa98324c..c2c7a1f32ae6e 100644 --- a/pandas/tests/indexes/timedeltas/methods/test_astype.py +++ b/pandas/tests/indexes/timedeltas/methods/test_astype.py @@ -55,7 +55,7 @@ def test_astype(self): ) tm.assert_index_equal(result, expected) - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): + with tm.assert_produces_warning(FutureWarning): result = idx.astype(int) expected = Int64Index( [100000000000000] + [-9223372036854775808] * 3, dtype=np.int64, name="idx" @@ -67,7 +67,7 @@ def test_astype(self): tm.assert_index_equal(result, expected) rng = timedelta_range("1 days", periods=10) - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): + with tm.assert_produces_warning(FutureWarning): result = rng.astype("i8") tm.assert_index_equal(result, Index(rng.asi8)) tm.assert_numpy_array_equal(rng.asi8, result.values) @@ -77,7 +77,7 @@ def test_astype_uint(self): expected = pd.UInt64Index( np.array([3600000000000, 90000000000000], dtype="uint64") ) - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): + with tm.assert_produces_warning(FutureWarning): tm.assert_index_equal(arr.astype("uint64"), expected) tm.assert_index_equal(arr.astype("uint32"), expected) diff --git a/pandas/tests/series/test_constructors.py b/pandas/tests/series/test_constructors.py index 1daeee8645f2e..6cd2a1dd180c1 100644 --- a/pandas/tests/series/test_constructors.py +++ b/pandas/tests/series/test_constructors.py @@ -1318,7 +1318,7 @@ def test_constructor_dtype_timedelta64(self): td.astype("int64") # invalid casting - msg = r"cannot astype a timedelta from \[timedelta64\[ns\]\] to \[int32\]" + msg = r"cannot astype a datetimelike from \[timedelta64\[ns\]\] to \[int32\]" with pytest.raises(TypeError, match=msg): td.astype("int32") From 29a19090b4743fd23f7a3d5bfddfd658c3cd74b9 Mon Sep 17 00:00:00 2001 From: Brock Date: Fri, 19 Feb 2021 10:41:36 -0800 Subject: [PATCH 06/61] dont hardcode dt64tz --- pandas/core/internals/construction.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pandas/core/internals/construction.py b/pandas/core/internals/construction.py index 6af8423eeaffd..711bb5d716e43 100644 --- a/pandas/core/internals/construction.py +++ b/pandas/core/internals/construction.py @@ -230,7 +230,8 @@ def init_ndarray(values, index, columns, dtype: Optional[DtypeObj], copy: bool): return arrays_to_mgr(values, columns, index, columns, dtype=dtype) - if is_datetime64tz_dtype(values): + if is_extension_array_dtype(values) and not is_ea_dtype(values): + # i.e. Datetime64TZ # TODO: combine into _prep_ndarray? values = extract_array(values, extract_numpy=True) if copy: From c0aa860c32990b513bff89c6108b06f9217822e7 Mon Sep 17 00:00:00 2001 From: Brock Date: Fri, 19 Feb 2021 18:18:52 -0800 Subject: [PATCH 07/61] just one contains_datetime check --- pandas/core/dtypes/concat.py | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/pandas/core/dtypes/concat.py b/pandas/core/dtypes/concat.py index 1e8f7f06ed24c..28cf3c9f9f5ac 100644 --- a/pandas/core/dtypes/concat.py +++ b/pandas/core/dtypes/concat.py @@ -114,6 +114,9 @@ def is_nonempty(x) -> bool: single_dtype = len({x.dtype for x in to_concat}) == 1 any_ea = any(is_extension_array_dtype(x.dtype) for x in to_concat) + if _contains_datetime: + return _concat_datetime(to_concat, axis=axis) + if any_ea: # we ignore axis here, as internally concatting with EAs is always # for axis=0 @@ -123,15 +126,10 @@ def is_nonempty(x) -> bool: if isinstance(to_concat[0], ExtensionArray): cls = type(to_concat[0]) - if _contains_datetime: - return _concat_datetime(to_concat, axis=axis) return cls._concat_same_type(to_concat) else: return np.concatenate(to_concat, axis=axis) - elif _contains_datetime: - return _concat_datetime(to_concat, axis=axis) - elif all_empty: # we have all empties, but may need to coerce the result dtype to # object if we have non-numeric type operands (numpy would otherwise From 3fc07a61fcba71079cdead98676d9d7cdd9da82d Mon Sep 17 00:00:00 2001 From: Brock Date: Fri, 19 Feb 2021 18:42:25 -0800 Subject: [PATCH 08/61] Simplify _maybe_coerce_values --- pandas/core/internals/blocks.py | 23 ++++++++--------------- 1 file changed, 8 insertions(+), 15 deletions(-) diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index d69aa42c6f861..78e6aeac64ab7 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -99,7 +99,10 @@ ) from pandas.core.base import PandasObject import pandas.core.common as com -from pandas.core.construction import extract_array +from pandas.core.construction import ( + ensure_wrapped_if_datetimelike, + extract_array, +) from pandas.core.indexers import ( check_setitem_lengths, is_empty_indexer, @@ -2100,21 +2103,11 @@ def _maybe_coerce_values(cls, values): Returns ------- - values : ndarray[datetime64ns/timedelta64ns] - - Overridden by DatetimeTZBlock. + values : DatetimeArray or TimedeltaArray """ - if cls.fill_value is not NaT and values.dtype != cls._dtype: - # non-nano we will convert to nano - if values.dtype.kind != cls._dtype.kind: - # caller is responsible for ensuring td64/dt64 dtype - raise TypeError(values.dtype) # pragma: no cover - - values = cls._holder._from_sequence(values) - - if not isinstance(values, cls._holder): - values = cls._holder(values) - + values = extract_array(values, extract_numpy=True) + if isinstance(values, np.ndarray): + values = ensure_wrapped_if_datetimelike(values) return values def to_native_types(self, na_rep="NaT", **kwargs): From 093cf8b14186ba64b391a12d969453d87303ab4e Mon Sep 17 00:00:00 2001 From: Brock Date: Fri, 19 Feb 2021 20:33:07 -0800 Subject: [PATCH 09/61] remove _dtype --- pandas/core/internals/blocks.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index 78e6aeac64ab7..5defecdb1d10e 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -2087,7 +2087,6 @@ class DatetimeLikeBlockMixin(NDArrayBackedExtensionBlock): is_numeric = False _can_hold_na = True - _dtype: np.dtype _holder: Type[Union[DatetimeArray, TimedeltaArray]] @classmethod @@ -2126,7 +2125,6 @@ class DatetimeBlock(DatetimeLikeBlockMixin): __slots__ = () is_datetime = True fill_value = np.datetime64("NaT", "ns") - _dtype = fill_value.dtype _holder = DatetimeArray @@ -2163,7 +2161,6 @@ class TimeDeltaBlock(DatetimeLikeBlockMixin): __slots__ = () _holder = TimedeltaArray fill_value = np.timedelta64("NaT", "ns") - _dtype = fill_value.dtype class ObjectBlock(Block): From 1460ff34ade0511c691f4e10b950bd258bd37268 Mon Sep 17 00:00:00 2001 From: Brock Date: Sat, 20 Feb 2021 09:54:45 -0800 Subject: [PATCH 10/61] remove _holder, fill_value --- pandas/core/internals/blocks.py | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index 5defecdb1d10e..c2d6e77c25a0c 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -17,7 +17,6 @@ from pandas._libs import ( Interval, - NaT, Period, Timestamp, algos as libalgos, @@ -72,6 +71,7 @@ from pandas.core.dtypes.missing import ( is_valid_na_for_dtype, isna, + na_value_for_dtype, ) import pandas.core.algorithms as algos @@ -2109,6 +2109,14 @@ def _maybe_coerce_values(cls, values): values = ensure_wrapped_if_datetimelike(values) return values + @property + def _holder(self): + return type(self.values) + + @property + def fill_value(self): + return na_value_for_dtype(self.dtype) + def to_native_types(self, na_rep="NaT", **kwargs): """ convert to our native types format """ arr = self.values @@ -2124,8 +2132,6 @@ def external_values(self): class DatetimeBlock(DatetimeLikeBlockMixin): __slots__ = () is_datetime = True - fill_value = np.datetime64("NaT", "ns") - _holder = DatetimeArray class DatetimeTZBlock(DatetimeBlock, ExtensionBlock): @@ -2138,9 +2144,6 @@ class DatetimeTZBlock(DatetimeBlock, ExtensionBlock): _validate_ndim = True _can_consolidate = True - _holder = DatetimeArray - fill_value = NaT - get_values = Block.get_values set_inplace = Block.set_inplace iget = Block.iget @@ -2159,8 +2162,6 @@ class DatetimeTZBlock(DatetimeBlock, ExtensionBlock): class TimeDeltaBlock(DatetimeLikeBlockMixin): __slots__ = () - _holder = TimedeltaArray - fill_value = np.timedelta64("NaT", "ns") class ObjectBlock(Block): From f92f81f5ac6ce39ae9b21ddb1ec956b2ec55e3a0 Mon Sep 17 00:00:00 2001 From: Brock Date: Mon, 22 Feb 2021 14:29:19 -0800 Subject: [PATCH 11/61] PERF: .array->._values --- pandas/core/series.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/pandas/core/series.py b/pandas/core/series.py index 34e9464006b30..93bbaaee1a541 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -803,7 +803,7 @@ def __array__(self, dtype: Optional[NpDtype] = None) -> np.ndarray: array(['1999-12-31T23:00:00.000000000', ...], dtype='datetime64[ns]') """ - return np.asarray(self.array, dtype) + return np.asarray(self._values, dtype) # ---------------------------------------------------------------------- # Unary Methods @@ -1797,7 +1797,7 @@ def count(self, level=None): 2 """ if level is None: - return notna(self.array).sum() + return notna(self._values).sum() elif not isinstance(self.index, MultiIndex): raise ValueError("Series.count level is only valid with a MultiIndex") @@ -2497,7 +2497,7 @@ def diff(self, periods: int = 1) -> Series: -------- {examples} """ - result = algorithms.diff(self.array, periods) + result = algorithms.diff(self._values, periods) return self._constructor(result, index=self.index).__finalize__( self, method="diff" ) @@ -3806,7 +3806,7 @@ def explode(self, ignore_index: bool = False) -> Series: if not len(self) or not is_object_dtype(self): return self.copy() - values, counts = reshape.explode(np.asarray(self.array)) + values, counts = reshape.explode(np.asarray(self._values)) if ignore_index: index = ibase.default_index(len(values)) From c0208bc927617d8c375a7061b48a4cd3207bbf59 Mon Sep 17 00:00:00 2001 From: Brock Date: Mon, 22 Feb 2021 16:16:19 -0800 Subject: [PATCH 12/61] PERF: get_values --- pandas/core/internals/blocks.py | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index d4a3bcf55f282..6a32ed40dc409 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -53,7 +53,6 @@ is_ea_dtype, is_extension_array_dtype, is_list_like, - is_object_dtype, is_sparse, is_strict_ea, pandas_dtype, @@ -117,6 +116,8 @@ Index, ) +_dtype_obj = np.dtype(object) # comparison is faster than is_object_dtype + class Block(PandasObject): """ @@ -278,9 +279,9 @@ def get_values(self, dtype: Optional[DtypeObj] = None) -> np.ndarray: return an internal format, currently just the ndarray this is often overridden to handle to_dense like operations """ - if is_object_dtype(dtype): - return self.values.astype(object) - return np.asarray(self.values) + if dtype == _dtype_obj: + return self.values.astype(_dtype_obj) + return self.values @final def get_block_values_for_json(self) -> np.ndarray: @@ -2000,6 +2001,11 @@ class NDArrayBackedExtensionBlock(HybridMixin, Block): def array_values(self): return self.values + def get_values(self, dtype: Optional[DtypeObj] = None) -> np.ndarray: + # We override instead of putting the np.asarray in Block.values for + # performance. + return np.asarray(Block.get_values(self, dtype)) + def putmask(self, mask, new) -> List[Block]: mask = extract_bool_array(mask) From 5081d278d56761208cb7fbe3e954e06b3ede1815 Mon Sep 17 00:00:00 2001 From: Brock Date: Tue, 23 Feb 2021 07:05:52 -0800 Subject: [PATCH 13/61] perf --- pandas/core/internals/blocks.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index 6a32ed40dc409..71e7c9e153986 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -32,6 +32,7 @@ Shape, final, ) +from pandas.util._decorators import cache_readonly from pandas.util._validators import validate_bool_kwarg from pandas.core.dtypes.cast import ( @@ -378,7 +379,7 @@ def getitem_block(self, slicer, new_mgr_locs=None) -> Block: return type(self)._simple_new(new_values, new_mgr_locs, self.ndim) - @property + @property # TODO: for reasons unclear, caching shape here breaks 8 tests def shape(self) -> Shape: return self.values.shape @@ -1564,7 +1565,7 @@ def __init__(self, values, placement, ndim: int): if self.ndim == 2 and len(self.mgr_locs) > 1: raise ValueError("need to split... for now") - @property + @cache_readonly def shape(self) -> Shape: # TODO(EA2D): override unnecessary with 2D EAs if self.ndim == 1: @@ -2144,6 +2145,10 @@ def external_values(self): # since that return an object-dtype ndarray of Timestamps. return self.values._data + def get_block_values_for_json(self): + # Not necessary to override, but helps perf + return self.values._data + class DatetimeBlock(DatetimeLikeBlockMixin): __slots__ = () @@ -2168,6 +2173,7 @@ class DatetimeTZBlock(DatetimeBlock, ExtensionBlock): __init__ = Block.__init__ take_nd = Block.take_nd _unstack = Block._unstack + get_block_values_for_json = Block.get_block_values_for_json # TODO: we still share these with ExtensionBlock (and not DatetimeBlock) # ['interpolate', 'quantile'] From 7b5a67ab79ae369fd7d5489ee26aca23de2bd151 Mon Sep 17 00:00:00 2001 From: Brock Date: Tue, 23 Feb 2021 14:32:11 -0800 Subject: [PATCH 14/61] CLN: comments --- pandas/core/internals/blocks.py | 2 +- pandas/core/internals/construction.py | 4 ---- 2 files changed, 1 insertion(+), 5 deletions(-) diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index 71e7c9e153986..e327a5a181409 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -379,7 +379,7 @@ def getitem_block(self, slicer, new_mgr_locs=None) -> Block: return type(self)._simple_new(new_values, new_mgr_locs, self.ndim) - @property # TODO: for reasons unclear, caching shape here breaks 8 tests + @property def shape(self) -> Shape: return self.values.shape diff --git a/pandas/core/internals/construction.py b/pandas/core/internals/construction.py index d6a15cf348a61..4a59593945b6d 100644 --- a/pandas/core/internals/construction.py +++ b/pandas/core/internals/construction.py @@ -233,7 +233,6 @@ def init_ndarray(values, index, columns, dtype: Optional[DtypeObj], copy: bool): if is_extension_array_dtype(values) and not is_ea_dtype(values): # i.e. Datetime64TZ - # TODO: combine into _prep_ndarray? values = extract_array(values, extract_numpy=True) if copy: values = values.copy() @@ -270,10 +269,7 @@ def init_ndarray(values, index, columns, dtype: Optional[DtypeObj], copy: bool): if values.ndim == 2 and values.shape[0] != 1: # transpose and separate blocks - # TODO: do this in one go dvals_list = [maybe_infer_to_datetimelike(row) for row in values] - dvals_list = [extract_array(x, extract_numpy=True) for x in dvals_list] - # TODO: unpack DatetimeIndex directly in maybe_infer_to_datetimelike for n in range(len(dvals_list)): dvals_list[n] = dvals_list[n].reshape(1, -1) From be4a24385d3a10b32a601ccfcb84a061ce58ddc5 Mon Sep 17 00:00:00 2001 From: Brock Date: Tue, 23 Feb 2021 14:32:49 -0800 Subject: [PATCH 15/61] remove unnecessary extract_array --- pandas/core/internals/construction.py | 1 - 1 file changed, 1 deletion(-) diff --git a/pandas/core/internals/construction.py b/pandas/core/internals/construction.py index 4a59593945b6d..77c5076e4866b 100644 --- a/pandas/core/internals/construction.py +++ b/pandas/core/internals/construction.py @@ -284,7 +284,6 @@ def init_ndarray(values, index, columns, dtype: Optional[DtypeObj], copy: bool): else: datelike_vals = maybe_infer_to_datetimelike(values) block_values = [datelike_vals] - block_values = [extract_array(x, extract_numpy=True) for x in block_values] if values.ndim == 2: block_values = [ensure_block_shape(x, 2) for x in block_values] From 882373be472a0b725b9d37c646c61eb45c7d6419 Mon Sep 17 00:00:00 2001 From: Brock Date: Tue, 23 Feb 2021 17:28:47 -0800 Subject: [PATCH 16/61] dont override EA base class --- pandas/core/arrays/_mixins.py | 20 +++++++++++++++++++- pandas/core/arrays/base.py | 8 ++++---- pandas/core/internals/blocks.py | 2 -- 3 files changed, 23 insertions(+), 7 deletions(-) diff --git a/pandas/core/arrays/_mixins.py b/pandas/core/arrays/_mixins.py index 6c1f8049f6d20..bea307b10f81e 100644 --- a/pandas/core/arrays/_mixins.py +++ b/pandas/core/arrays/_mixins.py @@ -20,7 +20,10 @@ cache_readonly, doc, ) -from pandas.util._validators import validate_fillna_kwargs +from pandas.util._validators import ( + validate_bool_kwarg, + validate_fillna_kwargs, +) from pandas.core.dtypes.common import is_dtype_equal from pandas.core.dtypes.inference import is_array_like @@ -36,6 +39,7 @@ from pandas.core.arrays.base import ExtensionArray from pandas.core.construction import extract_array from pandas.core.indexers import check_array_indexer +from pandas.core.sorting import nargminmax NDArrayBackedExtensionArrayT = TypeVar( "NDArrayBackedExtensionArrayT", bound="NDArrayBackedExtensionArray" @@ -187,6 +191,20 @@ def equals(self, other) -> bool: def _values_for_argsort(self): return self._ndarray + def argmin(self, axis: int = 0, skipna: bool = True): + # override base class by adding axis keyword + validate_bool_kwarg(skipna, "skipna") + if not skipna and self.isna().any(): + raise NotImplementedError + return nargminmax(self, "argmin", axis=axis) + + def argmax(self, axis: int = 0, skipna: bool = True): + # override base class by adding axis keyword + validate_bool_kwarg(skipna, "skipna") + if not skipna and self.isna().any(): + raise NotImplementedError + return nargminmax(self, "argmax", axis=axis) + def copy(self: NDArrayBackedExtensionArrayT) -> NDArrayBackedExtensionArrayT: new_data = self._ndarray.copy() return self._from_backing_data(new_data) diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py index 36ec6b61de9ac..edc8fa14ca142 100644 --- a/pandas/core/arrays/base.py +++ b/pandas/core/arrays/base.py @@ -618,7 +618,7 @@ def argsort( mask=np.asarray(self.isna()), ) - def argmin(self, axis: int = 0, skipna: bool = True): + def argmin(self, skipna: bool = True) -> int: """ Return the index of minimum value. @@ -640,9 +640,9 @@ def argmin(self, axis: int = 0, skipna: bool = True): validate_bool_kwarg(skipna, "skipna") if not skipna and self.isna().any(): raise NotImplementedError - return nargminmax(self, "argmin", axis=axis) + return nargminmax(self, "argmin") - def argmax(self, axis: int = 0, skipna: bool = True): + def argmax(self, skipna: bool = True) -> int: """ Return the index of maximum value. @@ -664,7 +664,7 @@ def argmax(self, axis: int = 0, skipna: bool = True): validate_bool_kwarg(skipna, "skipna") if not skipna and self.isna().any(): raise NotImplementedError - return nargminmax(self, "argmax", axis=axis) + return nargminmax(self, "argmax") def fillna(self, value=None, method=None, limit=None): """ diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index e327a5a181409..54e5c6abe09db 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -2260,8 +2260,6 @@ def f(mask, val, idx): if isinstance(values, np.ndarray): # TODO(EA2D): allow EA once reshape is supported values = values.reshape(shape) - if isinstance(values, ABCIndex): - values = values._data return values if self.ndim == 2: From 5cca7fb3f75255ce1f80004291ac0a86e7838568 Mon Sep 17 00:00:00 2001 From: Brock Date: Tue, 23 Feb 2021 18:18:46 -0800 Subject: [PATCH 17/61] revert perf workarounds --- pandas/core/internals/blocks.py | 3 +-- pandas/core/series.py | 8 ++++---- 2 files changed, 5 insertions(+), 6 deletions(-) diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index 8d41cfa9c8a2e..202e5564bdb00 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -32,7 +32,6 @@ Shape, final, ) -from pandas.util._decorators import cache_readonly from pandas.util._validators import validate_bool_kwarg from pandas.core.dtypes.cast import ( @@ -1563,7 +1562,7 @@ def __init__(self, values, placement, ndim: int): if self.ndim == 2 and len(self.mgr_locs) > 1: raise ValueError("need to split... for now") - @cache_readonly + @property def shape(self) -> Shape: # TODO(EA2D): override unnecessary with 2D EAs if self.ndim == 1: diff --git a/pandas/core/series.py b/pandas/core/series.py index 93bbaaee1a541..34e9464006b30 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -803,7 +803,7 @@ def __array__(self, dtype: Optional[NpDtype] = None) -> np.ndarray: array(['1999-12-31T23:00:00.000000000', ...], dtype='datetime64[ns]') """ - return np.asarray(self._values, dtype) + return np.asarray(self.array, dtype) # ---------------------------------------------------------------------- # Unary Methods @@ -1797,7 +1797,7 @@ def count(self, level=None): 2 """ if level is None: - return notna(self._values).sum() + return notna(self.array).sum() elif not isinstance(self.index, MultiIndex): raise ValueError("Series.count level is only valid with a MultiIndex") @@ -2497,7 +2497,7 @@ def diff(self, periods: int = 1) -> Series: -------- {examples} """ - result = algorithms.diff(self._values, periods) + result = algorithms.diff(self.array, periods) return self._constructor(result, index=self.index).__finalize__( self, method="diff" ) @@ -3806,7 +3806,7 @@ def explode(self, ignore_index: bool = False) -> Series: if not len(self) or not is_object_dtype(self): return self.copy() - values, counts = reshape.explode(np.asarray(self._values)) + values, counts = reshape.explode(np.asarray(self.array)) if ignore_index: index = ibase.default_index(len(values)) From b7abcd4c322c97c00df8b72ac38251d05f37b800 Mon Sep 17 00:00:00 2001 From: Brock Date: Thu, 25 Feb 2021 18:40:56 -0800 Subject: [PATCH 18/61] NIE instead of assert false --- pandas/core/sorting.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/pandas/core/sorting.py b/pandas/core/sorting.py index aa7e9b400f475..3edf79c4ad2eb 100644 --- a/pandas/core/sorting.py +++ b/pandas/core/sorting.py @@ -419,9 +419,10 @@ def nargminmax(values, method: str, axis: int = 0): idx = np.arange(values.shape[axis]) if values.ndim > 1 and values.size > 0: - # values.size check is a kludge bc JSONArray can come back with size-0 2D + # FIXME: values.size check is a kludge bc JSONArray can come + # back with size-0 2D if mask.any(): - assert False # just checking + raise NotImplementedError return func(values, axis=axis) non_nans = values[~mask] From 10a78b91c56dd7c61e4763516e821b4c006df1db Mon Sep 17 00:00:00 2001 From: Brock Date: Mon, 1 Mar 2021 09:59:37 -0800 Subject: [PATCH 19/61] fix simple_new usage --- pandas/core/nanops.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/nanops.py b/pandas/core/nanops.py index a0dfb1c83a70b..3b5c0aad6b0fe 100644 --- a/pandas/core/nanops.py +++ b/pandas/core/nanops.py @@ -1739,7 +1739,7 @@ def na_accum_func(values: ArrayLike, accum_func, *, skipna: bool) -> ArrayLike: # restore NaT elements y[mask] = iNaT # TODO: could try/finally for this? - if isinstance(values, np.ndarray): + if isinstance(values.dtype, np.dtype): result = result.view(orig_dtype) else: # DatetimeArray From 5fac38e17d763697c8cfbac04033040387f9a15e Mon Sep 17 00:00:00 2001 From: Brock Date: Mon, 1 Mar 2021 15:12:38 -0800 Subject: [PATCH 20/61] mypy fixup --- pandas/core/array_algos/take.py | 10 +++++++++- pandas/core/arrays/_mixins.py | 6 ++++-- pandas/core/frame.py | 3 ++- pandas/core/internals/blocks.py | 33 +++++++++++++++++++++++-------- pandas/core/internals/managers.py | 4 +++- 5 files changed, 43 insertions(+), 13 deletions(-) diff --git a/pandas/core/array_algos/take.py b/pandas/core/array_algos/take.py index ae5b4d927b091..f72f645e61de2 100644 --- a/pandas/core/array_algos/take.py +++ b/pandas/core/array_algos/take.py @@ -1,6 +1,10 @@ from __future__ import annotations -from typing import Optional +from typing import ( + TYPE_CHECKING, + Optional, + cast, +) import numpy as np @@ -20,6 +24,9 @@ from pandas.core.construction import ensure_wrapped_if_datetimelike +if TYPE_CHECKING: + from pandas.core.arrays._mixins import NDArrayBackedExtensionArray + def take_nd( arr: ArrayLike, @@ -69,6 +76,7 @@ def take_nd( # includes for EA to catch DatetimeArray, TimedeltaArray if not is_strict_ea(arr): # i.e. DatetimeArray, TimedeltaArray + arr = cast("NDArrayBackedExtensionArray", arr) return arr.take( indexer, axis=axis, fill_value=fill_value, allow_fill=allow_fill ) diff --git a/pandas/core/arrays/_mixins.py b/pandas/core/arrays/_mixins.py index d1a1d32262d92..f236b60248271 100644 --- a/pandas/core/arrays/_mixins.py +++ b/pandas/core/arrays/_mixins.py @@ -186,14 +186,16 @@ def equals(self, other) -> bool: def _values_for_argsort(self): return self._ndarray - def argmin(self, axis: int = 0, skipna: bool = True): + # Signature of "argmin" incompatible with supertype "ExtensionArray" + def argmin(self, axis: int = 0, skipna: bool = True): # type:ignore[override] # override base class by adding axis keyword validate_bool_kwarg(skipna, "skipna") if not skipna and self.isna().any(): raise NotImplementedError return nargminmax(self, "argmin", axis=axis) - def argmax(self, axis: int = 0, skipna: bool = True): + # Signature of "argmax" incompatible with supertype "ExtensionArray" + def argmax(self, axis: int = 0, skipna: bool = True): # type:ignore[override] # override base class by adding axis keyword validate_bool_kwarg(skipna, "skipna") if not skipna and self.isna().any(): diff --git a/pandas/core/frame.py b/pandas/core/frame.py index c4247fdb0f5cc..3e1c1e2fff656 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -3190,7 +3190,8 @@ def transpose(self, *args, copy: bool = False) -> DataFrame: if self._can_fast_transpose: # Note: tests pass without this, but this improves perf quite a bit. - new_values = self._values_compat.T + # error: "ArrayLike" has no attribute "T" + new_values = self._values_compat.T # type:ignore[attr-defined] if copy: new_values = new_values.copy() diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index 37140dbde151d..3f08b2edcd5b1 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -95,6 +95,7 @@ DatetimeArray, ExtensionArray, PandasArray, + TimedeltaArray, ) from pandas.core.base import PandasObject import pandas.core.common as com @@ -115,6 +116,8 @@ Float64Index, Index, ) + from pandas.core.arrays._mixins import NDArrayBackedExtensionArray + _dtype_obj = np.dtype(object) # comparison is faster than is_object_dtype @@ -283,7 +286,6 @@ def get_values(self, dtype: Optional[DtypeObj] = None) -> np.ndarray: return self.values.astype(_dtype_obj) return self.values - @final def get_block_values_for_json(self) -> np.ndarray: """ This is used in the JSON C code. @@ -401,7 +403,6 @@ def set_inplace(self, locs, values): """ self.values[locs] = values - @final def delete(self, loc) -> None: """ Delete given loc(-s) from block in-place. @@ -1999,9 +2000,11 @@ def to_native_types( class NDArrayBackedExtensionBlock(HybridMixin, Block): """ - Block backed by an NDarrayBackedExtensionArray, supporting 2D values. + Block backed by an NDArrayBackedExtensionArray, supporting 2D values. """ + values: NDArrayBackedExtensionArray + def array_values(self): return self.values @@ -2035,7 +2038,7 @@ def where(self, other, cond, errors="raise", axis: int = 0) -> List[Block]: def is_view(self) -> bool: """ return a boolean if I am possibly a view """ # check the ndarray values of the DatetimeIndex values - return self.values._data.base is not None + return self.values._ndarray.base is not None def setitem(self, indexer, value): if not self._can_hold_element(value): @@ -2106,6 +2109,8 @@ def fillna( class DatetimeLikeBlockMixin(NDArrayBackedExtensionBlock): """Mixin class for DatetimeBlock, DatetimeTZBlock, and TimedeltaBlock.""" + values: Union[DatetimeArray, TimedeltaArray] + is_numeric = False _can_hold_na = True @@ -2146,11 +2151,11 @@ def to_native_types(self, na_rep="NaT", **kwargs): def external_values(self): # NB: for dt64tz this is different from np.asarray(self.values), # since that return an object-dtype ndarray of Timestamps. - return self.values._data + return self.values._ndarray def get_block_values_for_json(self): # Not necessary to override, but helps perf - return self.values._data + return self.values._ndarray class DatetimeBlock(DatetimeLikeBlockMixin): @@ -2168,13 +2173,24 @@ class DatetimeTZBlock(DatetimeBlock, ExtensionBlock): _validate_ndim = True _can_consolidate = True + to_native_types = DatetimeBlock.to_native_types # needed for mypy + get_values = Block.get_values set_inplace = Block.set_inplace iget = Block.iget _slice = Block._slice - shape = Block.shape + # Incompatible types in assignment (expression has type + # "Callable[[Block], Tuple[int, ...]]", base class "ExtensionBlock" + # defined the type as "Tuple[int, ...]") + shape = Block.shape # type:ignore[assignment] __init__ = Block.__init__ - take_nd = Block.take_nd + # Incompatible types in assignment (expression has type + # "Callable[[Arg(Any, 'indexer'), Arg(int, 'axis'), + # DefaultArg(Any, 'new_mgr_locs'), DefaultArg(Any, 'fill_value')], Block]", + # base class "ExtensionBlock" defined the type as + # "Callable[[Arg(Any, 'indexer'), DefaultArg(int, 'axis'), + # DefaultArg(Any, 'new_mgr_locs'), DefaultArg(Any, 'fill_value')], Block]") + take_nd = Block.take_nd # type:ignore[assignment] _unstack = Block._unstack get_block_values_for_json = Block.get_block_values_for_json @@ -2431,5 +2447,6 @@ def ensure_block_shape(values: ArrayLike, ndim: int = 1) -> ArrayLike: # block.shape is incorrect for "2D" ExtensionArrays # We can't, and don't need to, reshape. values = extract_array(values, extract_numpy=True) + values = cast(Union[np.ndarray, "NDArrayBackedExtensionArray"], values) values = values.reshape(1, -1) return values diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py index 99581cd1fbdc4..fe3b03142fb93 100644 --- a/pandas/core/internals/managers.py +++ b/pandas/core/internals/managers.py @@ -1936,8 +1936,10 @@ def _merge_blocks( if isinstance(blocks[0].dtype, np.dtype): new_values = np.vstack([b.values for b in blocks]) else: + # Unexpected keyword argument "axis" for "_concat_same_type" + # of "ExtensionArray" new_values = blocks[0].values._concat_same_type( - [b.values for b in blocks], axis=0 + [b.values for b in blocks], axis=0 # type:ignore[call-arg] ) argsort = np.argsort(new_mgr_locs) From 7bb85b674dac18e02626889b21a3ce7cec86ef6d Mon Sep 17 00:00:00 2001 From: Brock Date: Mon, 1 Mar 2021 20:12:33 -0800 Subject: [PATCH 21/61] Fix test on older numpys --- pandas/core/internals/construction.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/pandas/core/internals/construction.py b/pandas/core/internals/construction.py index 1da0c2769684c..469488c37421a 100644 --- a/pandas/core/internals/construction.py +++ b/pandas/core/internals/construction.py @@ -61,6 +61,7 @@ from pandas.core.arrays import ( Categorical, ExtensionArray, + TimedeltaArray, ) from pandas.core.construction import ( extract_array, @@ -412,6 +413,11 @@ def treat_as_nested(data) -> bool: def _prep_ndarray(values, copy: bool = True) -> np.ndarray: + if isinstance(values, TimedeltaArray): + # On older numpy, np.asarray below apparently does not call __array__, + # so nanoseconds get dropped. + values = values._ndarray + if not isinstance(values, (np.ndarray, ABCSeries, Index)): if len(values) == 0: return np.empty((0, 0), dtype=object) From 399d7222a65f807efe4e2f621a03279df98f6924 Mon Sep 17 00:00:00 2001 From: Brock Date: Tue, 2 Mar 2021 07:26:39 -0800 Subject: [PATCH 22/61] fastparquet compat --- pandas/core/internals/blocks.py | 22 ++++++++++++++++++---- 1 file changed, 18 insertions(+), 4 deletions(-) diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index 3f08b2edcd5b1..91d06b47abe5e 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -169,6 +169,16 @@ def __init__(self, values, placement, ndim: int): self.values = self._maybe_coerce_values(values) if self._validate_ndim and self.ndim and len(self.mgr_locs) != len(self.values): + if ( + is_datetime64tz_dtype(values.dtype) + and self.ndim == 2 + and values.ndim == 1 + and len(self.mgr_locs) == 1 + ): + # Passed by fastparquet; TODO: warn? + self.values = self.values.reshape(1, -1) + return + raise ValueError( f"Wrong number of items passed {len(self.values)}, " f"placement implies {len(self.mgr_locs)}" @@ -215,10 +225,14 @@ def _check_ndim(self, values, ndim): if self._validate_ndim: if values.ndim != ndim: - raise ValueError( - "Wrong number of dimensions. " - f"values.ndim != ndim [{values.ndim} != {ndim}]" - ) + if is_datetime64tz_dtype(values.dtype): + # passed via fastparquet; TODO: warn? + pass + else: + raise ValueError( + "Wrong number of dimensions. " + f"values.ndim != ndim [{values.ndim} != {ndim}]" + ) elif values.ndim > ndim: # ExtensionBlock raise ValueError( From dff0fec2fef64928cc684e8743a7428a38e5a058 Mon Sep 17 00:00:00 2001 From: Brock Date: Tue, 2 Mar 2021 08:10:54 -0800 Subject: [PATCH 23/61] troubleshoot --- pandas/core/internals/blocks.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index 91d06b47abe5e..32b05b1077c1a 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -225,7 +225,7 @@ def _check_ndim(self, values, ndim): if self._validate_ndim: if values.ndim != ndim: - if is_datetime64tz_dtype(values.dtype): + if is_datetime64tz_dtype(values.dtype) and values.ndim < ndim: # passed via fastparquet; TODO: warn? pass else: From b9d823134c58bde0ea255e271ddb3b1fd856a63f Mon Sep 17 00:00:00 2001 From: Brock Date: Tue, 2 Mar 2021 09:07:20 -0800 Subject: [PATCH 24/61] array manager tests --- pandas/core/internals/array_manager.py | 6 ++++++ pandas/tests/frame/methods/test_append.py | 1 + pandas/tests/frame/methods/test_infer_objects.py | 3 +++ pandas/tests/frame/methods/test_replace.py | 5 +++++ pandas/tests/frame/methods/test_set_index.py | 6 ++++-- 5 files changed, 19 insertions(+), 2 deletions(-) diff --git a/pandas/core/internals/array_manager.py b/pandas/core/internals/array_manager.py index 5001754017dda..fc019de6d16c3 100644 --- a/pandas/core/internals/array_manager.py +++ b/pandas/core/internals/array_manager.py @@ -33,6 +33,7 @@ ) from pandas.core.dtypes.common import ( is_bool_dtype, + is_datetime64tz_dtype, is_dtype_equal, is_extension_array_dtype, is_numeric_dtype, @@ -109,6 +110,11 @@ def __init__( # Note: we are storing the axes in "_axes" in the (row, columns) order # which contrasts the order how it is stored in BlockManager self._axes = axes + + for i, arr in enumerate(arrays): + if is_datetime64tz_dtype(arr.dtype) and arr.ndim == 2: + assert arr.shape[0] == 1 + arrays[i] = arr[0] self.arrays = arrays if verify_integrity: diff --git a/pandas/tests/frame/methods/test_append.py b/pandas/tests/frame/methods/test_append.py index ba58d88fb4863..0293dc63015cd 100644 --- a/pandas/tests/frame/methods/test_append.py +++ b/pandas/tests/frame/methods/test_append.py @@ -200,6 +200,7 @@ def test_append_dtypes(self): @pytest.mark.parametrize( "timestamp", ["2019-07-19 07:04:57+0100", "2019-07-19 07:04:57"] ) + @td.skip_array_manager_not_yet_implemented def test_append_timestamps_aware_or_naive(self, tz_naive_fixture, timestamp): # GH 30238 tz = tz_naive_fixture diff --git a/pandas/tests/frame/methods/test_infer_objects.py b/pandas/tests/frame/methods/test_infer_objects.py index a824a615b5c29..55020081f50f6 100644 --- a/pandas/tests/frame/methods/test_infer_objects.py +++ b/pandas/tests/frame/methods/test_infer_objects.py @@ -1,10 +1,13 @@ from datetime import datetime +import pandas.util._test_decorators as td + from pandas import DataFrame import pandas._testing as tm class TestInferObjects: + @td.skip_array_manager_not_yet_implemented def test_infer_objects(self): # GH#11221 df = DataFrame( diff --git a/pandas/tests/frame/methods/test_replace.py b/pandas/tests/frame/methods/test_replace.py index 6d1e90e2f9646..0edf500568e28 100644 --- a/pandas/tests/frame/methods/test_replace.py +++ b/pandas/tests/frame/methods/test_replace.py @@ -10,6 +10,8 @@ import numpy as np import pytest +import pandas.util._test_decorators as td + import pandas as pd from pandas import ( DataFrame, @@ -661,6 +663,7 @@ def test_replace_regex_metachar(self, metachar): expected = DataFrame({"a": ["paren", "else"]}) tm.assert_frame_equal(result, expected) + @td.skip_array_manager_not_yet_implemented def test_replace(self, datetime_frame): datetime_frame["A"][:5] = np.nan datetime_frame["A"][-5:] = np.nan @@ -1024,6 +1027,7 @@ def test_replace_for_new_dtypes(self, datetime_frame): ), ], ) + @td.skip_array_manager_not_yet_implemented def test_replace_dtypes(self, frame, to_replace, value, expected): result = getattr(frame, "replace")(to_replace, value) tm.assert_frame_equal(result, expected) @@ -1492,6 +1496,7 @@ def test_replace_commutative(self, df, to_replace, exp): np.float64(1), ], ) + @td.skip_array_manager_not_yet_implemented def test_replace_replacer_dtype(self, replacer): # GH26632 df = DataFrame(["a"]) diff --git a/pandas/tests/frame/methods/test_set_index.py b/pandas/tests/frame/methods/test_set_index.py index 08ffb9dea36e0..62dc400f8de9f 100644 --- a/pandas/tests/frame/methods/test_set_index.py +++ b/pandas/tests/frame/methods/test_set_index.py @@ -96,7 +96,7 @@ def test_set_index_cast_datetimeindex(self): idf = df.set_index("A") assert isinstance(idf.index, DatetimeIndex) - def test_set_index_dst(self): + def test_set_index_dst(self, using_array_manager): di = date_range("2006-10-29 00:00:00", periods=3, freq="H", tz="US/Pacific") df = DataFrame(data={"a": [0, 1, 2], "b": [3, 4, 5]}, index=di).reset_index() @@ -104,8 +104,10 @@ def test_set_index_dst(self): res = df.set_index("index") exp = DataFrame( data={"a": [0, 1, 2], "b": [3, 4, 5]}, - index=Index(di, name="index")._with_freq(None), + index=Index(di, name="index"), ) + if not using_array_manager: + exp.index = exp.index._with_freq(None) tm.assert_frame_equal(res, exp) # GH#12920 From 3469bb51a408d0e56b168c46067d8577d4a6c088 Mon Sep 17 00:00:00 2001 From: Brock Date: Tue, 2 Mar 2021 09:58:37 -0800 Subject: [PATCH 25/61] array manager tests --- pandas/core/frame.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 3e1c1e2fff656..b9fabdfe1dfe9 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -9224,7 +9224,7 @@ def func(values: np.ndarray): def blk_func(values, axis=1): if isinstance(values, ExtensionArray): - if not is_strict_ea(values): + if not is_strict_ea(values) and not isinstance(self._mgr, ArrayManager): return values._reduce(name, axis=1, skipna=skipna, **kwds) return values._reduce(name, skipna=skipna, **kwds) else: From 9602a743f62fe57d0f4aef183af62dbb301a68ad Mon Sep 17 00:00:00 2001 From: Brock Date: Tue, 2 Mar 2021 14:13:34 -0800 Subject: [PATCH 26/61] array-manager tests --- pandas/core/internals/array_manager.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/internals/array_manager.py b/pandas/core/internals/array_manager.py index d3eb33e2dd775..4610d99464b07 100644 --- a/pandas/core/internals/array_manager.py +++ b/pandas/core/internals/array_manager.py @@ -433,7 +433,7 @@ def apply_with_block(self: T, f, align_keys=None, **kwargs) -> T: if isinstance(applied, list): applied = applied[0] arr = applied.values - if isinstance(arr, np.ndarray): + if arr.ndim == 2: arr = arr[0, :] result_arrays.append(arr) From 33d8a24da23aa809e946aea64759e3ca4ac769ea Mon Sep 17 00:00:00 2001 From: Brock Date: Tue, 2 Mar 2021 15:00:40 -0800 Subject: [PATCH 27/61] array-manager test --- pandas/tests/groupby/test_groupby.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index c61970929e237..a5f512082e761 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -823,7 +823,7 @@ def test_groupby_multi_corner(df): tm.assert_frame_equal(agged, expected) -def test_omit_nuisance(df): +def test_omit_nuisance(df, using_array_manager): grouped = df.groupby("A") result = grouped.mean() @@ -844,6 +844,8 @@ def test_omit_nuisance(df): # won't work with axis = 1 grouped = df.groupby({"A": 0, "C": 0, "D": 1, "E": 1}, axis=1) msg = "'DatetimeArray' does not implement reduction 'sum'" + if using_array_manager: + msg = "reduction operation 'sum' not allowed for this dtype" with pytest.raises(TypeError, match=msg): grouped.agg(lambda x: x.sum(0, numeric_only=False)) From 2a0f7dea0f07c81ec0f680d8e583005a3cf5ee42 Mon Sep 17 00:00:00 2001 From: Brock Date: Wed, 3 Mar 2021 14:29:10 -0800 Subject: [PATCH 28/61] troubleshoot docbuild --- pandas/core/internals/concat.py | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/pandas/core/internals/concat.py b/pandas/core/internals/concat.py index cdaabf3448bb9..826b127200643 100644 --- a/pandas/core/internals/concat.py +++ b/pandas/core/internals/concat.py @@ -39,10 +39,7 @@ ) import pandas.core.algorithms as algos -from pandas.core.arrays import ( - DatetimeArray, - ExtensionArray, -) +from pandas.core.arrays import DatetimeArray from pandas.core.internals.array_manager import ArrayManager from pandas.core.internals.blocks import make_block from pandas.core.internals.managers import BlockManager @@ -423,7 +420,7 @@ def _concatenate_join_units( else: concat_values = concat_values.copy() - elif any(isinstance(t, ExtensionArray) for t in to_concat): + elif any(is_strict_ea(t) for t in to_concat): # TODO(EA2D): special case not needed if all EAs used HybridBlocks # NB: we are still assuming here that Hybrid blocks have shape (1, N) # concatting with at least one EA means we are concatting a single column From 1019aad0c5de74ff334993091bfab8531cf53bc4 Mon Sep 17 00:00:00 2001 From: Brock Date: Thu, 4 Mar 2021 19:25:44 -0800 Subject: [PATCH 29/61] cleanups --- pandas/core/dtypes/concat.py | 2 +- pandas/core/frame.py | 11 ++++++----- pandas/core/groupby/generic.py | 4 +--- 3 files changed, 8 insertions(+), 9 deletions(-) diff --git a/pandas/core/dtypes/concat.py b/pandas/core/dtypes/concat.py index 9fe96def743de..3a05b4698d6a0 100644 --- a/pandas/core/dtypes/concat.py +++ b/pandas/core/dtypes/concat.py @@ -128,7 +128,7 @@ def is_nonempty(x) -> bool: cls = type(to_concat[0]) return cls._concat_same_type(to_concat) else: - return np.concatenate(to_concat, axis=axis) + return np.concatenate(to_concat) elif all_empty: # we have all empties, but may need to coerce the result dtype to diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 259d99d8011d0..30d5ee2d8f76f 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -215,6 +215,10 @@ TimestampConvertibleTypes, ) + from pandas.core.arrays import ( + DatetimeArray, + TimedeltaArray, + ) from pandas.core.groupby.generic import DataFrameGroupBy from pandas.core.resample import Resampler @@ -784,7 +788,7 @@ def _can_fast_transpose(self) -> bool: return not is_ea_dtype(dtype) @property - def _values_compat(self) -> ArrayLike: + def _values_compat(self) -> Union[np.ndarray, DatetimeArray, TimedeltaArray]: """ Analogue to ._values that may return a 2D ExtensionArray. """ @@ -3211,8 +3215,7 @@ def transpose(self, *args, copy: bool = False) -> DataFrame: if self._can_fast_transpose: # Note: tests pass without this, but this improves perf quite a bit. - # error: "ArrayLike" has no attribute "T" - new_values = self._values_compat.T # type:ignore[attr-defined] + new_values = self._values_compat.T if copy: new_values = new_values.copy() @@ -3225,7 +3228,6 @@ def transpose(self, *args, copy: bool = False) -> DataFrame: ): # We have EAs with the same dtype. We can preserve that dtype in transpose. dtype = dtypes[0] - arr_type = dtype.construct_array_type() values = self.values @@ -3233,7 +3235,6 @@ def transpose(self, *args, copy: bool = False) -> DataFrame: result = self._constructor( dict(zip(self.index, new_values)), index=self.columns ) - # TODO: what if index is non-unique? (not specific to EA2D) else: new_values = self.values.T diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index 8d8fc3ad5b036..64763225d61a9 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -64,7 +64,6 @@ is_interval_dtype, is_numeric_dtype, is_scalar, - is_strict_ea, needs_i8_conversion, ) from pandas.core.dtypes.missing import ( @@ -1123,8 +1122,7 @@ def py_fallback(values: ArrayLike) -> ArrayLike: obj: FrameOrSeriesUnion # call our grouper again with only this block - if is_strict_ea(values) or values.ndim == 1: - # TODO(EA2D): special case not needed with 2D EAs + if values.ndim == 1: obj = Series(values) else: # TODO special case not needed with ArrayManager From 9faf9abab83324708dec88381d883b94ea387e8f Mon Sep 17 00:00:00 2001 From: Brock Date: Fri, 5 Mar 2021 11:53:14 -0800 Subject: [PATCH 30/61] update for quantile --- pandas/core/array_algos/quantile.py | 7 ++++++- pandas/core/internals/api.py | 19 ++++++++++++++++++- 2 files changed, 24 insertions(+), 2 deletions(-) diff --git a/pandas/core/array_algos/quantile.py b/pandas/core/array_algos/quantile.py index 802fc4db0a36d..7d474a72d13d6 100644 --- a/pandas/core/array_algos/quantile.py +++ b/pandas/core/array_algos/quantile.py @@ -37,7 +37,8 @@ def quantile_compat(values: ArrayLike, qs, interpolation: str, axis: int) -> Arr ------- np.ndarray or ExtensionArray """ - if isinstance(values, np.ndarray): + if isinstance(values.dtype, np.dtype): + # i.e. np.ndarray, DatetimeArray, TimedeltaArray fill_value = na_value_for_dtype(values.dtype, compat=False) mask = isna(values) result = quantile_with_mask(values, mask, fill_value, qs, interpolation, axis) @@ -156,6 +157,10 @@ def quantile_ea_compat( assert result.shape == (1,), result.shape result = type(orig)._from_factorized(result, orig) + elif orig.ndim == 2: + # i.e. DatetimeArray + result = type(orig)._from_factorized(result, orig) + else: assert result.shape == (1, len(qs)), result.shape result = type(orig)._from_factorized(result[0], orig) diff --git a/pandas/core/internals/api.py b/pandas/core/internals/api.py index 3fbe324417c60..4af54e73790e2 100644 --- a/pandas/core/internals/api.py +++ b/pandas/core/internals/api.py @@ -10,9 +10,13 @@ import numpy as np +from pandas._libs.internals import BlockPlacement from pandas._typing import Dtype -from pandas.core.dtypes.common import is_datetime64tz_dtype +from pandas.core.dtypes.common import ( + is_datetime64tz_dtype, + is_extension_array_dtype, +) from pandas.core.dtypes.dtypes import PandasDtype from pandas.core.dtypes.generic import ABCPandasArray @@ -20,6 +24,7 @@ from pandas.core.internals.blocks import ( Block, DatetimeTZBlock, + ensure_block_shape, get_block_type, ) @@ -58,4 +63,16 @@ def make_block( # for e.g. pyarrow? values = DatetimeArray._simple_new(values, dtype=dtype) + if not isinstance(placement, BlockPlacement): + placement = BlockPlacement(placement) + + if is_extension_array_dtype(values.dtype): + if ndim is None: + if len(placement) != 1: + ndim = 1 + else: + ndim = 2 + + values = ensure_block_shape(values, ndim) + return klass(values, ndim=ndim, placement=placement) From 3e5ca3b9794d8075ca053c364462de6c8814fcf2 Mon Sep 17 00:00:00 2001 From: Brock Date: Fri, 5 Mar 2021 14:48:41 -0800 Subject: [PATCH 31/61] fix array-manager quantile tests --- pandas/core/internals/array_manager.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/pandas/core/internals/array_manager.py b/pandas/core/internals/array_manager.py index 26920a7b7c154..65a9bda1aa448 100644 --- a/pandas/core/internals/array_manager.py +++ b/pandas/core/internals/array_manager.py @@ -41,6 +41,7 @@ is_extension_array_dtype, is_numeric_dtype, is_object_dtype, + is_strict_ea, is_timedelta64_ns_dtype, ) from pandas.core.dtypes.dtypes import ( @@ -494,10 +495,7 @@ def quantile( interpolation="linear", ) -> ArrayManager: - arrs = [ - x if not isinstance(x, np.ndarray) else np.atleast_2d(x) - for x in self.arrays - ] + arrs = [x if is_strict_ea(x) else x.reshape(1, -1) for x in self.arrays] assert axis == 1 new_arrs = [quantile_compat(x, qs, interpolation, axis=axis) for x in arrs] for i, arr in enumerate(new_arrs): From 7916fffd960e57ba0e34d351601f41720c0fb240 Mon Sep 17 00:00:00 2001 From: Brock Date: Fri, 5 Mar 2021 15:52:31 -0800 Subject: [PATCH 32/61] better names for dtype checks, un-skip array-manager tests --- pandas/core/array_algos/take.py | 4 ++-- pandas/core/dtypes/common.py | 4 ++-- pandas/core/frame.py | 10 ++++++---- pandas/core/internals/array_manager.py | 20 +++++++++---------- pandas/core/internals/blocks.py | 8 ++++---- pandas/core/internals/concat.py | 14 ++++++------- pandas/core/internals/construction.py | 6 +++--- pandas/core/internals/managers.py | 4 ++-- pandas/core/reshape/reshape.py | 4 ++-- pandas/tests/frame/methods/test_append.py | 1 - .../tests/frame/methods/test_infer_objects.py | 3 --- pandas/tests/frame/methods/test_replace.py | 5 ----- 12 files changed, 38 insertions(+), 45 deletions(-) diff --git a/pandas/core/array_algos/take.py b/pandas/core/array_algos/take.py index 50317e42ab8b8..b9e1a41260dce 100644 --- a/pandas/core/array_algos/take.py +++ b/pandas/core/array_algos/take.py @@ -19,7 +19,7 @@ from pandas.core.dtypes.common import ( ensure_int64, ensure_platform_int, - is_strict_ea, + is_1d_only_ea_obj, ) from pandas.core.dtypes.missing import na_value_for_dtype @@ -75,7 +75,7 @@ def take_nd( if not isinstance(arr, np.ndarray): # i.e. ExtensionArray, # includes for EA to catch DatetimeArray, TimedeltaArray - if not is_strict_ea(arr): + if not is_1d_only_ea_obj(arr): # i.e. DatetimeArray, TimedeltaArray arr = cast("NDArrayBackedExtensionArray", arr) return arr.take( diff --git a/pandas/core/dtypes/common.py b/pandas/core/dtypes/common.py index 48a3d382e6039..15e5779eec6c5 100644 --- a/pandas/core/dtypes/common.py +++ b/pandas/core/dtypes/common.py @@ -1495,7 +1495,7 @@ def is_extension_type(arr) -> bool: return False -def is_strict_ea(obj): +def is_1d_only_ea_obj(obj) -> bool: """ ExtensionArray that does not support 2D, or more specifically that does not use HybridBlock. @@ -1511,7 +1511,7 @@ def is_strict_ea(obj): ) -def is_ea_dtype(dtype) -> bool: +def is_1d_only_ea_dtype(dtype) -> bool: """ Analogue to is_extension_array_dtype but excluding DatetimeTZDtype. """ diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 7a7fa2f4395de..3bf112424c252 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -105,12 +105,13 @@ ensure_int64, ensure_platform_int, infer_dtype_from_object, + is_1d_only_ea_dtype, + is_1d_only_ea_obj, is_bool_dtype, is_dataclass, is_datetime64_any_dtype, is_dict_like, is_dtype_equal, - is_ea_dtype, is_extension_array_dtype, is_float, is_float_dtype, @@ -122,7 +123,6 @@ is_object_dtype, is_scalar, is_sequence, - is_strict_ea, pandas_dtype, ) from pandas.core.dtypes.missing import ( @@ -785,7 +785,7 @@ def _can_fast_transpose(self) -> bool: dtype = blocks[0].dtype # TODO(EA2D) special case would be unnecessary with 2D EAs - return not is_ea_dtype(dtype) + return not is_1d_only_ea_dtype(dtype) @property def _values_compat(self) -> Union[np.ndarray, DatetimeArray, TimedeltaArray]: @@ -9266,7 +9266,9 @@ def func(values: np.ndarray): def blk_func(values, axis=1): if isinstance(values, ExtensionArray): - if not is_strict_ea(values) and not isinstance(self._mgr, ArrayManager): + if not is_1d_only_ea_obj(values) and not isinstance( + self._mgr, ArrayManager + ): return values._reduce(name, axis=1, skipna=skipna, **kwds) return values._reduce(name, skipna=skipna, **kwds) else: diff --git a/pandas/core/internals/array_manager.py b/pandas/core/internals/array_manager.py index 65a9bda1aa448..9b244303acd67 100644 --- a/pandas/core/internals/array_manager.py +++ b/pandas/core/internals/array_manager.py @@ -34,14 +34,13 @@ soft_convert_objects, ) from pandas.core.dtypes.common import ( + is_1d_only_ea_obj, is_bool_dtype, is_datetime64_ns_dtype, - is_datetime64tz_dtype, is_dtype_equal, is_extension_array_dtype, is_numeric_dtype, is_object_dtype, - is_strict_ea, is_timedelta64_ns_dtype, ) from pandas.core.dtypes.dtypes import ( @@ -131,10 +130,6 @@ def __init__( # which contrasts the order how it is stored in BlockManager self._axes = axes - for i, arr in enumerate(arrays): - if is_datetime64tz_dtype(arr.dtype) and arr.ndim == 2: - assert arr.shape[0] == 1 - arrays[i] = arr[0] self.arrays = arrays if verify_integrity: @@ -479,9 +474,9 @@ def apply_with_block(self: T, f, align_keys=None, swap_axis=True, **kwargs) -> T if isinstance(applied, list): applied = applied[0] arr = applied.values - if self.ndim == 2: - if isinstance(arr, np.ndarray): - arr = arr[0, :] + if self.ndim == 2 and arr.ndim == 2: + # 2D for np.ndarray or DatetimeArray/TimedeltaArray + arr = arr[0, :] result_arrays.append(arr) return type(self)(result_arrays, self._axes) @@ -495,7 +490,12 @@ def quantile( interpolation="linear", ) -> ArrayManager: - arrs = [x if is_strict_ea(x) else x.reshape(1, -1) for x in self.arrays] + # error: Item "ExtensionArray" of "Union[Any, ExtensionArray]" + # has no attribute "reshape" + arrs = [ + x if is_1d_only_ea_obj(x) else x.reshape(1, -1) # type:ignore[union-attr] + for x in self.arrays + ] assert axis == 1 new_arrs = [quantile_compat(x, qs, interpolation, axis=axis) for x in arrs] for i, arr in enumerate(new_arrs): diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index 7c4d6d9d678d5..e9785baed4566 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -44,14 +44,14 @@ soft_convert_objects, ) from pandas.core.dtypes.common import ( + is_1d_only_ea_dtype, + is_1d_only_ea_obj, is_categorical_dtype, is_datetime64tz_dtype, is_dtype_equal, - is_ea_dtype, is_extension_array_dtype, is_list_like, is_sparse, - is_strict_ea, pandas_dtype, ) from pandas.core.dtypes.dtypes import ( @@ -463,7 +463,7 @@ def _split_op_result(self, result) -> List[Block]: # if we get a 2D ExtensionArray, we need to split it into 1D pieces nbs = [] for i, loc in enumerate(self.mgr_locs): - if not is_strict_ea(result): + if not is_1d_only_ea_obj(result): vals = result[i : i + 1] else: vals = result[i] @@ -2372,7 +2372,7 @@ def ensure_block_shape(values: ArrayLike, ndim: int = 1) -> ArrayLike: """ if values.ndim < ndim: - if not is_ea_dtype(values.dtype): + if not is_1d_only_ea_dtype(values.dtype): # TODO(EA2D): https://github.com/pandas-dev/pandas/issues/23023 # block.shape is incorrect for "2D" ExtensionArrays # We can't, and don't need to, reshape. diff --git a/pandas/core/internals/concat.py b/pandas/core/internals/concat.py index 299ae08133099..6a7a7a4ffce13 100644 --- a/pandas/core/internals/concat.py +++ b/pandas/core/internals/concat.py @@ -25,12 +25,12 @@ find_common_type, ) from pandas.core.dtypes.common import ( + is_1d_only_ea_dtype, + is_1d_only_ea_obj, is_datetime64tz_dtype, is_dtype_equal, - is_ea_dtype, is_extension_array_dtype, is_sparse, - is_strict_ea, ) from pandas.core.dtypes.concat import concat_compat from pandas.core.dtypes.missing import ( @@ -141,7 +141,7 @@ def concatenate_managers( else: values = concat_compat(vals) - if not is_strict_ea(values) and blk.ndim == 2 and values.ndim == 1: + if not is_1d_only_ea_obj(values) and blk.ndim == 2 and values.ndim == 1: values = values.reshape(1, -1) if blk.values.dtype == values.dtype: @@ -343,7 +343,7 @@ def get_reindexed_values(self, empty_dtype: DtypeObj, upcasted_na) -> ArrayLike: return DatetimeArray(i8values, dtype=empty_dtype) elif is_extension_array_dtype(blk_dtype): pass - elif is_ea_dtype(empty_dtype): + elif is_1d_only_ea_dtype(empty_dtype): cls = empty_dtype.construct_array_type() missing_arr = cls._from_sequence([], dtype=empty_dtype) ncols, nrows = self.shape @@ -420,16 +420,16 @@ def _concatenate_join_units( else: concat_values = concat_values.copy() - elif any(is_strict_ea(t) for t in to_concat): + elif any(is_1d_only_ea_obj(t) for t in to_concat): # TODO(EA2D): special case not needed if all EAs used HybridBlocks # NB: we are still assuming here that Hybrid blocks have shape (1, N) # concatting with at least one EA means we are concatting a single column # the non-EA values are 2D arrays with shape (1, n) - to_concat = [t if is_strict_ea(t) else t[0, :] for t in to_concat] + to_concat = [t if is_1d_only_ea_obj(t) else t[0, :] for t in to_concat] concat_values = concat_compat(to_concat, axis=0, ea_compat_axis=True) # TODO: what if we have dt64tz blocks with more than 1 column? - if concat_values.ndim < ndim and not is_strict_ea(concat_values): + if concat_values.ndim < ndim and not is_1d_only_ea_obj(concat_values): # if the result of concat is not an EA but an ndarray, reshape to # 2D to put it a non-EA Block # special case DatetimeArray, which *is* an EA, but is put in a diff --git a/pandas/core/internals/construction.py b/pandas/core/internals/construction.py index e2ab88e991080..a1529eb4f39b6 100644 --- a/pandas/core/internals/construction.py +++ b/pandas/core/internals/construction.py @@ -37,9 +37,9 @@ maybe_upcast, ) from pandas.core.dtypes.common import ( + is_1d_only_ea_dtype, is_datetime64tz_dtype, is_dtype_equal, - is_ea_dtype, is_extension_array_dtype, is_integer_dtype, is_list_like, @@ -241,7 +241,7 @@ def ndarray_to_mgr( if not len(values) and columns is not None and len(columns): values = np.empty((0, 1), dtype=object) - if is_ea_dtype(values) or is_extension_array_dtype(dtype): + if is_1d_only_ea_dtype(values) or is_extension_array_dtype(dtype): # GH#19157 if isinstance(values, np.ndarray) and values.ndim > 1: @@ -256,7 +256,7 @@ def ndarray_to_mgr( return arrays_to_mgr(values, columns, index, columns, dtype=dtype, typ=typ) - if is_extension_array_dtype(values) and not is_ea_dtype(values): + if is_extension_array_dtype(values) and not is_1d_only_ea_dtype(values): # i.e. Datetime64TZ values = extract_array(values, extract_numpy=True) if copy: diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py index b97f9edec68b3..7edf5ab2086fa 100644 --- a/pandas/core/internals/managers.py +++ b/pandas/core/internals/managers.py @@ -38,9 +38,9 @@ ) from pandas.core.dtypes.common import ( DT64NS_DTYPE, + is_1d_only_ea_dtype, is_datetime64tz_dtype, is_dtype_equal, - is_ea_dtype, is_extension_array_dtype, is_list_like, ) @@ -1089,7 +1089,7 @@ def iset(self, loc: Union[int, slice, np.ndarray], value): if self._blklocs is None and self.ndim > 1: self._rebuild_blknos_and_blklocs() - value_is_extension_type = is_ea_dtype(value) + value_is_extension_type = is_1d_only_ea_dtype(value) # categorical/sparse/datetimetz if value_is_extension_type: diff --git a/pandas/core/reshape/reshape.py b/pandas/core/reshape/reshape.py index 6dd99d802faf8..3da66b7a0f9bb 100644 --- a/pandas/core/reshape/reshape.py +++ b/pandas/core/reshape/reshape.py @@ -17,8 +17,8 @@ from pandas.core.dtypes.cast import maybe_promote from pandas.core.dtypes.common import ( ensure_platform_int, + is_1d_only_ea_dtype, is_bool_dtype, - is_ea_dtype, is_extension_array_dtype, is_integer, is_integer_dtype, @@ -435,7 +435,7 @@ def unstack(obj, level, fill_value=None): f"index must be a MultiIndex to unstack, {type(obj.index)} was passed" ) else: - if is_ea_dtype(obj.dtype): + if is_1d_only_ea_dtype(obj.dtype): return _unstack_extension_series(obj, level, fill_value) unstacker = _Unstacker( obj.index, level=level, constructor=obj._constructor_expanddim diff --git a/pandas/tests/frame/methods/test_append.py b/pandas/tests/frame/methods/test_append.py index 0293dc63015cd..ba58d88fb4863 100644 --- a/pandas/tests/frame/methods/test_append.py +++ b/pandas/tests/frame/methods/test_append.py @@ -200,7 +200,6 @@ def test_append_dtypes(self): @pytest.mark.parametrize( "timestamp", ["2019-07-19 07:04:57+0100", "2019-07-19 07:04:57"] ) - @td.skip_array_manager_not_yet_implemented def test_append_timestamps_aware_or_naive(self, tz_naive_fixture, timestamp): # GH 30238 tz = tz_naive_fixture diff --git a/pandas/tests/frame/methods/test_infer_objects.py b/pandas/tests/frame/methods/test_infer_objects.py index 55020081f50f6..a824a615b5c29 100644 --- a/pandas/tests/frame/methods/test_infer_objects.py +++ b/pandas/tests/frame/methods/test_infer_objects.py @@ -1,13 +1,10 @@ from datetime import datetime -import pandas.util._test_decorators as td - from pandas import DataFrame import pandas._testing as tm class TestInferObjects: - @td.skip_array_manager_not_yet_implemented def test_infer_objects(self): # GH#11221 df = DataFrame( diff --git a/pandas/tests/frame/methods/test_replace.py b/pandas/tests/frame/methods/test_replace.py index 0edf500568e28..6d1e90e2f9646 100644 --- a/pandas/tests/frame/methods/test_replace.py +++ b/pandas/tests/frame/methods/test_replace.py @@ -10,8 +10,6 @@ import numpy as np import pytest -import pandas.util._test_decorators as td - import pandas as pd from pandas import ( DataFrame, @@ -663,7 +661,6 @@ def test_replace_regex_metachar(self, metachar): expected = DataFrame({"a": ["paren", "else"]}) tm.assert_frame_equal(result, expected) - @td.skip_array_manager_not_yet_implemented def test_replace(self, datetime_frame): datetime_frame["A"][:5] = np.nan datetime_frame["A"][-5:] = np.nan @@ -1027,7 +1024,6 @@ def test_replace_for_new_dtypes(self, datetime_frame): ), ], ) - @td.skip_array_manager_not_yet_implemented def test_replace_dtypes(self, frame, to_replace, value, expected): result = getattr(frame, "replace")(to_replace, value) tm.assert_frame_equal(result, expected) @@ -1496,7 +1492,6 @@ def test_replace_commutative(self, df, to_replace, exp): np.float64(1), ], ) - @td.skip_array_manager_not_yet_implemented def test_replace_replacer_dtype(self, replacer): # GH26632 df = DataFrame(["a"]) From d8b49cd49e4aad87044dfb14ab88a03b411001b2 Mon Sep 17 00:00:00 2001 From: Brock Date: Fri, 5 Mar 2021 17:45:25 -0800 Subject: [PATCH 33/61] CLN: use ensure_block_shape --- pandas/core/internals/array_manager.py | 13 +++++-------- pandas/core/internals/concat.py | 17 ++++++----------- 2 files changed, 11 insertions(+), 19 deletions(-) diff --git a/pandas/core/internals/array_manager.py b/pandas/core/internals/array_manager.py index 9b244303acd67..f69cd2e80f99a 100644 --- a/pandas/core/internals/array_manager.py +++ b/pandas/core/internals/array_manager.py @@ -34,7 +34,6 @@ soft_convert_objects, ) from pandas.core.dtypes.common import ( - is_1d_only_ea_obj, is_bool_dtype, is_datetime64_ns_dtype, is_dtype_equal, @@ -86,7 +85,10 @@ DataManager, SingleDataManager, ) -from pandas.core.internals.blocks import new_block +from pandas.core.internals.blocks import ( + ensure_block_shape, + new_block, +) if TYPE_CHECKING: from pandas import Float64Index @@ -490,12 +492,7 @@ def quantile( interpolation="linear", ) -> ArrayManager: - # error: Item "ExtensionArray" of "Union[Any, ExtensionArray]" - # has no attribute "reshape" - arrs = [ - x if is_1d_only_ea_obj(x) else x.reshape(1, -1) # type:ignore[union-attr] - for x in self.arrays - ] + arrs = [ensure_block_shape(x, 2) for x in self.arrays] assert axis == 1 new_arrs = [quantile_compat(x, qs, interpolation, axis=axis) for x in arrs] for i, arr in enumerate(new_arrs): diff --git a/pandas/core/internals/concat.py b/pandas/core/internals/concat.py index 6a7a7a4ffce13..26812c2884ae0 100644 --- a/pandas/core/internals/concat.py +++ b/pandas/core/internals/concat.py @@ -41,7 +41,10 @@ import pandas.core.algorithms as algos from pandas.core.arrays import DatetimeArray from pandas.core.internals.array_manager import ArrayManager -from pandas.core.internals.blocks import new_block +from pandas.core.internals.blocks import ( + ensure_block_shape, + new_block, +) from pandas.core.internals.managers import BlockManager if TYPE_CHECKING: @@ -141,8 +144,7 @@ def concatenate_managers( else: values = concat_compat(vals) - if not is_1d_only_ea_obj(values) and blk.ndim == 2 and values.ndim == 1: - values = values.reshape(1, -1) + values = ensure_block_shape(values, blk.ndim) if blk.values.dtype == values.dtype: # Fast-path @@ -406,7 +408,6 @@ def _concatenate_join_units( ju.get_reindexed_values(empty_dtype=empty_dtype, upcasted_na=upcasted_na) for ju in join_units ] - ndim = max(x.ndim for x in to_concat) if len(to_concat) == 1: # Only one block, nothing to concatenate. @@ -429,13 +430,7 @@ def _concatenate_join_units( concat_values = concat_compat(to_concat, axis=0, ea_compat_axis=True) # TODO: what if we have dt64tz blocks with more than 1 column? - if concat_values.ndim < ndim and not is_1d_only_ea_obj(concat_values): - # if the result of concat is not an EA but an ndarray, reshape to - # 2D to put it a non-EA Block - # special case DatetimeArray, which *is* an EA, but is put in a - # consolidated 2D block - # TODO(EA2D): we could just get this right within concat_compat - concat_values = concat_values.reshape(1, -1) + concat_values = ensure_block_shape(concat_values, 2) else: concat_values = concat_compat(to_concat, axis=concat_axis) From 84613c7826a5cebfa8da62cc150277e05b6822f5 Mon Sep 17 00:00:00 2001 From: Brock Date: Sun, 7 Mar 2021 19:55:20 -0800 Subject: [PATCH 34/61] ENH: NDArrayBackedExtensionArray.fillna(method) with 2d --- pandas/core/arrays/_mixins.py | 8 +++++-- pandas/core/arrays/period.py | 8 +++++++ pandas/core/missing.py | 12 ++++------- pandas/tests/arrays/test_datetimes.py | 31 +++++++++++++++++++++++++++ pandas/tests/extension/base/dim2.py | 11 ++++++++++ 5 files changed, 60 insertions(+), 10 deletions(-) diff --git a/pandas/core/arrays/_mixins.py b/pandas/core/arrays/_mixins.py index 4615cb4ec7abd..d54d1855ac2f8 100644 --- a/pandas/core/arrays/_mixins.py +++ b/pandas/core/arrays/_mixins.py @@ -278,8 +278,12 @@ def fillna( if mask.any(): if method is not None: - func = missing.get_fill_func(method) - new_values, _ = func(self._ndarray.copy(), limit=limit, mask=mask) + # TODO: check value is None + # (for now) when self.ndim == 2, we assume axis=0 + func = missing.get_fill_func(method, ndim=self.ndim) + new_values, _ = func(self._ndarray.T.copy(), limit=limit, mask=mask.T) + new_values = new_values.T + # TODO: PandasArray didn't used to copy, need tests for this new_values = self._from_backing_data(new_values) else: diff --git a/pandas/core/arrays/period.py b/pandas/core/arrays/period.py index 96a159c0804c9..7e9e13400e11f 100644 --- a/pandas/core/arrays/period.py +++ b/pandas/core/arrays/period.py @@ -639,6 +639,14 @@ def searchsorted(self, value, side="left", sorter=None) -> np.ndarray: m8arr = self._ndarray.view("M8[ns]") return m8arr.searchsorted(value, side=side, sorter=sorter) + def fillna(self, value=None, method=None, limit=None) -> PeriodArray: + if method is not None: + # view as dt64 so we get treated as timelike in core.missing + dta = self.view("M8[ns]") + result = dta.fillna(value=value, method=method, limit=limit) + return result.view(self.dtype) + return super().fillna(value=value, method=method, limit=limit) + # ------------------------------------------------------------------ # Arithmetic Methods diff --git a/pandas/core/missing.py b/pandas/core/missing.py index 1b5a7237b5287..dc42a175409c2 100644 --- a/pandas/core/missing.py +++ b/pandas/core/missing.py @@ -646,8 +646,6 @@ def interpolate_2d( values, ) - orig_values = values - transf = (lambda x: x) if axis == 0 else (lambda x: x.T) # reshape a 1 dim if needed @@ -669,10 +667,6 @@ def interpolate_2d( if ndim == 1: result = result[0] - if orig_values.dtype.kind in ["m", "M"]: - # convert float back to datetime64/timedelta64 - result = result.view(orig_values.dtype) - return result @@ -755,9 +749,11 @@ def _backfill_2d(values, limit=None, mask=None): _fill_methods = {"pad": _pad_1d, "backfill": _backfill_1d} -def get_fill_func(method): +def get_fill_func(method, ndim: int = 1): method = clean_fill_method(method) - return _fill_methods[method] + if ndim == 1: + return _fill_methods[method] + return {"pad": _pad_2d, "backfill": _backfill_2d}[method] def clean_reindex_fill_method(method): diff --git a/pandas/tests/arrays/test_datetimes.py b/pandas/tests/arrays/test_datetimes.py index d159d76030250..8e6c330475e68 100644 --- a/pandas/tests/arrays/test_datetimes.py +++ b/pandas/tests/arrays/test_datetimes.py @@ -195,6 +195,37 @@ def test_fillna_preserves_tz(self, method): assert arr[2] is pd.NaT assert dti[2] == pd.Timestamp("2000-01-03", tz="US/Central") + def test_fillna_2d(self): + dti = pd.date_range("2016-01-01", periods=6, tz="US/Pacific") + dta = dti._data.reshape(3, 2).copy() + dta[0, 1] = pd.NaT + dta[1, 0] = pd.NaT + + res1 = dta.fillna(method="pad") + expected1 = dta.copy() + expected1[1, 0] = dta[0, 0] + tm.assert_extension_array_equal(res1, expected1) + + res2 = dta.fillna(method="backfill") + expected2 = dta.copy() + expected2 = dta.copy() + expected2[1, 0] = dta[2, 0] + expected2[0, 1] = dta[1, 1] + tm.assert_extension_array_equal(res2, expected2) + + # with different ordering for underlying ndarray; behavior should + # be unchanged + dta2 = dta._from_backing_data(dta._ndarray.copy(order="F")) + assert dta2._ndarray.flags["F_CONTIGUOUS"] + assert not dta2._ndarray.flags["C_CONTIGUOUS"] + tm.assert_extension_array_equal(dta, dta2) + + res3 = dta2.fillna(method="pad") + tm.assert_extension_array_equal(res3, expected1) + + res4 = dta2.fillna(method="backfill") + tm.assert_extension_array_equal(res4, expected2) + def test_array_interface_tz(self): tz = "US/Central" data = DatetimeArray(pd.date_range("2017", periods=2, tz=tz)) diff --git a/pandas/tests/extension/base/dim2.py b/pandas/tests/extension/base/dim2.py index fbe2537e8a7bf..073880d79d872 100644 --- a/pandas/tests/extension/base/dim2.py +++ b/pandas/tests/extension/base/dim2.py @@ -131,6 +131,17 @@ def test_concat_2d(self, data): with pytest.raises(ValueError): left._concat_same_type([left, right], axis=2) + @pytest.mark.parametrize("method", ["backfill", "pad"]) + def test_fillna_2d_method(self, data_missing, method): + arr = data_missing.repeat(2).reshape(2, 2) + assert arr[0].isna().all() + assert not arr[1].isna().any() + + result = arr.fillna(method=method) + + expected = data_missing.fillna(method=method).repeat(2).reshape(2, 2) + self.assert_extension_array_equal(result, expected) + @pytest.mark.parametrize("method", ["mean", "median", "var", "std", "sum", "prod"]) def test_reductions_2d_axis_none(self, data, method, request): if not hasattr(data, method): From 68fa34add2a705f9de92288f89896fd8e5460afb Mon Sep 17 00:00:00 2001 From: Brock Date: Mon, 8 Mar 2021 10:36:44 -0800 Subject: [PATCH 35/61] dont consolidate DTZ blocks --- pandas/core/internals/blocks.py | 4 ++-- pandas/tests/io/pytables/test_timezones.py | 10 +++++----- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index 1e41339e93007..749cbeec361c6 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -2092,7 +2092,7 @@ class DatetimeTZBlock(DatetimeBlock, ExtensionBlock): is_extension = True _validate_ndim = True - _can_consolidate = True + _can_consolidate = False to_native_types = DatetimeBlock.to_native_types # needed for mypy @@ -2116,7 +2116,7 @@ class DatetimeTZBlock(DatetimeBlock, ExtensionBlock): get_block_values_for_json = Block.get_block_values_for_json # TODO: we still share these with ExtensionBlock (and not DatetimeBlock) - # ['interpolate', 'quantile'] + # ['interpolate'] # [x for x in dir(DatetimeTZBlock) if hasattr(ExtensionBlock, x) # and getattr(DatetimeTZBlock, x) is getattr(ExtensionBlock, x) # and getattr(ExtensionBlock, x) is not getattr(Block, x)] diff --git a/pandas/tests/io/pytables/test_timezones.py b/pandas/tests/io/pytables/test_timezones.py index c2228b322929a..0532ddd17cd19 100644 --- a/pandas/tests/io/pytables/test_timezones.py +++ b/pandas/tests/io/pytables/test_timezones.py @@ -107,11 +107,11 @@ def test_append_with_timezones(setup_path, gettz): _compare_with_tz(result, df_crosses_dst) tm.assert_frame_equal(result, df_crosses_dst) - # df_mixed_tz has two blocks, whereas df_crosses_dst only has one, - # so we cannot set into the same pytables structure - assert df_crosses_dst._mgr.nblocks == 1 - assert df_mixed_tz._mgr.nblocks == 2 - msg = r"cannot match existing table structure for \[A,B\] on appending data" + msg = ( + r"invalid info for \[values_block_1\] for \[tz\], " + r"existing_value \[(dateutil/.*)?US/Eastern\] " + r"conflicts with new value \[(dateutil/.*)?EET\]" + ) with pytest.raises(ValueError, match=msg): store.append("df_tz", df_mixed_tz) From 3c824dca74ec9f37f07d0b77f1bb9abb96e7a17a Mon Sep 17 00:00:00 2001 From: Brock Date: Mon, 8 Mar 2021 10:37:55 -0800 Subject: [PATCH 36/61] revert pytables edits not needed without cosnolidation --- pandas/io/pytables.py | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py index 602786db73496..ceb4900b887f1 100644 --- a/pandas/io/pytables.py +++ b/pandas/io/pytables.py @@ -4554,13 +4554,9 @@ def read( # if we have a DataIndexableCol, its shape will only be 1 dim if values.ndim == 1 and isinstance(values, np.ndarray): - # TODO(EA2D): special case not needed with 2D EAs values = values.reshape((1, values.shape[0])) - if isinstance(values, DatetimeIndex) and len(cols_) != 1: - # FIXME: kludge - values = values._data.reshape(len(cols_), -1, order="F") - if isinstance(values, (np.ndarray, DatetimeArray)): + if isinstance(values, np.ndarray): df = DataFrame(values.T, columns=cols_, index=index_) elif isinstance(values, Index): df = DataFrame(values, columns=cols_, index=index_) From d3ae4482388785f882dc8f1f5914f12544b649b6 Mon Sep 17 00:00:00 2001 From: Brock Date: Tue, 9 Mar 2021 17:45:55 -0800 Subject: [PATCH 37/61] mypy fixup --- pandas/core/internals/blocks.py | 1 - 1 file changed, 1 deletion(-) diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index f36f9f82160f1..a53c62bb80eff 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -2085,7 +2085,6 @@ class DatetimeTZBlock(DatetimeBlock, ExtensionBlock): # "Callable[[Block], Tuple[int, ...]]", base class "ExtensionBlock" # defined the type as "Tuple[int, ...]") shape = Block.shape # type:ignore[assignment] - __init__ = Block.__init__ # Incompatible types in assignment (expression has type # "Callable[[Arg(Any, 'indexer'), Arg(int, 'axis'), # DefaultArg(Any, 'new_mgr_locs'), DefaultArg(Any, 'fill_value')], Block]", From 11e6182b082c9ad85b796e5a29c629f1f3af2faa Mon Sep 17 00:00:00 2001 From: Brock Date: Wed, 10 Mar 2021 08:30:20 -0800 Subject: [PATCH 38/61] Fix json kludge --- pandas/core/sorting.py | 4 +--- pandas/tests/extension/json/array.py | 9 ++++----- 2 files changed, 5 insertions(+), 8 deletions(-) diff --git a/pandas/core/sorting.py b/pandas/core/sorting.py index 511f9a6767d1d..1ddb4223555ab 100644 --- a/pandas/core/sorting.py +++ b/pandas/core/sorting.py @@ -419,9 +419,7 @@ def nargminmax(values, method: str, axis: int = 0): values = values._values_for_argsort() idx = np.arange(values.shape[axis]) - if values.ndim > 1 and values.size > 0: - # FIXME: values.size check is a kludge bc JSONArray can come - # back with size-0 2D + if values.ndim > 1: if mask.any(): raise NotImplementedError return func(values, axis=axis) diff --git a/pandas/tests/extension/json/array.py b/pandas/tests/extension/json/array.py index ca593da6d97bc..525a337b63f74 100644 --- a/pandas/tests/extension/json/array.py +++ b/pandas/tests/extension/json/array.py @@ -30,6 +30,7 @@ import numpy as np +from pandas.core.dtypes.cast import construct_1d_object_array_from_listlike from pandas.core.dtypes.common import pandas_dtype import pandas as pd @@ -207,11 +208,9 @@ def _values_for_factorize(self): return frozen, () def _values_for_argsort(self): - # Disable NumPy's shape inference by including an empty tuple... - # If all the elements of self are the same size P, NumPy will - # cast them to an (N, P) array, instead of an (N,) array of tuples. - frozen = [()] + [tuple(x.items()) for x in self] - return np.array(frozen, dtype=object)[1:] + # Bypass NumPy's shape inference to get a (N,) array of tuples. + frozen = [tuple(x.items()) for x in self] + return construct_1d_object_array_from_listlike(frozen) def make_data(): From 1216f507912a2e8e4e865e701a6e49266722ee3f Mon Sep 17 00:00:00 2001 From: Brock Date: Fri, 12 Mar 2021 15:28:52 -0800 Subject: [PATCH 39/61] troubleshoot array-manager testt --- pandas/core/internals/array_manager.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/pandas/core/internals/array_manager.py b/pandas/core/internals/array_manager.py index ea0a5eac6f8c8..0e38b5af20c81 100644 --- a/pandas/core/internals/array_manager.py +++ b/pandas/core/internals/array_manager.py @@ -478,8 +478,7 @@ def apply_with_block(self: T, f, align_keys=None, swap_axis=True, **kwargs) -> T arr = arr._data # type: ignore[attr-defined] if self.ndim == 2: - if isinstance(arr, np.ndarray): - arr = np.atleast_2d(arr) + arr = ensure_block_shape(arr, 2) block = new_block(arr, placement=slice(0, 1, 1), ndim=2) else: block = new_block(arr, placement=slice(0, len(self), 1), ndim=1) From 5a02f3e02c8dd4521b02f4573663617f51a73c1b Mon Sep 17 00:00:00 2001 From: Brock Date: Fri, 12 Mar 2021 18:33:11 -0800 Subject: [PATCH 40/61] troubleshoot mypy --- pandas/core/frame.py | 11 ++++++----- pandas/core/internals/construction.py | 4 ++++ pandas/core/internals/managers.py | 10 +++++----- 3 files changed, 15 insertions(+), 10 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 7612f97b1859a..53dffdda42f45 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -911,6 +911,9 @@ def _values_compat(self) -> Union[np.ndarray, DatetimeArray, TimedeltaArray]: if arr.ndim == 1: # non-2D ExtensionArray return self._values + + # more generally, whatever we allow in NDArrayBackedExtensionBlock + arr = cast(Union[DatetimeArray, TimedeltaArray], arr) return arr.T # ---------------------------------------------------------------------- @@ -3344,13 +3347,11 @@ def transpose(self, *args, copy: bool = False) -> DataFrame: if self._can_fast_transpose: # Note: tests pass without this, but this improves perf quite a bit. - new_values = self._values_compat.T + new_vals = self._values_compat.T if copy: - new_values = new_values.copy() + new_vals = new_vals.copy() - result = self._constructor( - new_values, index=self.columns, columns=self.index - ) + result = self._constructor(new_vals, index=self.columns, columns=self.index) elif ( self._is_homogeneous_type and dtypes and is_extension_array_dtype(dtypes[0]) diff --git a/pandas/core/internals/construction.py b/pandas/core/internals/construction.py index ac8fde71bb405..27c882b0f3649 100644 --- a/pandas/core/internals/construction.py +++ b/pandas/core/internals/construction.py @@ -76,6 +76,7 @@ ) from pandas.core.internals.array_manager import ArrayManager from pandas.core.internals.blocks import ( + Block, ensure_block_shape, new_block, ) @@ -308,6 +309,9 @@ def ndarray_to_mgr( ) values = values.T + # TODO: GH#40403 standardize what types we have here + block_values: Union[List[Block], List[ArrayLike]] + # if we don't have a dtype specified, then try to convert objects # on the entire block; this is to convert if we have datetimelike's # embedded in an object type diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py index d70052a54035e..df95c30145ed4 100644 --- a/pandas/core/internals/managers.py +++ b/pandas/core/internals/managers.py @@ -14,6 +14,7 @@ Tuple, TypeVar, Union, + cast, ) import warnings @@ -52,6 +53,7 @@ ) import pandas.core.algorithms as algos +from pandas.core.arrays._mixins import NDArrayBackedExtensionArray from pandas.core.arrays.sparse import SparseDtype from pandas.core.construction import extract_array from pandas.core.indexers import maybe_convert_indices @@ -2008,11 +2010,9 @@ def _merge_blocks( # Sequence[Sequence[Any]], SupportsArray]] new_values = np.vstack([b.values for b in blocks]) # type: ignore[misc] else: - # Unexpected keyword argument "axis" for "_concat_same_type" - # of "ExtensionArray" - new_values = blocks[0].values._concat_same_type( - [b.values for b in blocks], axis=0 # type:ignore[call-arg] - ) + bvals = [blk.values for blk in blocks] + bvals = cast(List[NDArrayBackedExtensionArray], bvals) + new_values = bvals[0]._concat_same_type(bvals, axis=0) argsort = np.argsort(new_mgr_locs) new_values = new_values[argsort] From f8f962f344048e8c837caa9f2bb6a95a9d7fcbc6 Mon Sep 17 00:00:00 2001 From: Brock Date: Fri, 12 Mar 2021 20:59:12 -0800 Subject: [PATCH 41/61] missing import --- pandas/core/frame.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 53dffdda42f45..a930a46a7f3e8 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -144,7 +144,11 @@ ) from pandas.core.array_algos.take import take_2d_multi from pandas.core.arraylike import OpsMixin -from pandas.core.arrays import ExtensionArray +from pandas.core.arrays import ( + DatetimeArray, + ExtensionArray, + TimedeltaArray, +) from pandas.core.arrays.sparse import SparseFrameAccessor from pandas.core.construction import ( extract_array, @@ -215,10 +219,6 @@ TimestampConvertibleTypes, ) - from pandas.core.arrays import ( - DatetimeArray, - TimedeltaArray, - ) from pandas.core.groupby.generic import DataFrameGroupBy from pandas.core.resample import Resampler From dab6fd20c352da028ac6c9374a90a0906e869929 Mon Sep 17 00:00:00 2001 From: Brock Date: Sun, 14 Mar 2021 09:20:30 -0700 Subject: [PATCH 42/61] fastparquet compat --- pandas/_typing.py | 2 +- pandas/compat/pickle_compat.py | 4 +++ pandas/core/arraylike.py | 2 +- pandas/core/frame.py | 6 ++-- pandas/core/generic.py | 2 +- pandas/core/internals/__init__.py | 6 ++-- pandas/core/internals/api.py | 41 +++++++++++++++++++++++- pandas/core/internals/concat.py | 8 +---- pandas/io/pytables.py | 2 +- pandas/tests/internals/test_internals.py | 2 +- pandas/tests/internals/test_managers.py | 6 ++-- 11 files changed, 58 insertions(+), 23 deletions(-) diff --git a/pandas/_typing.py b/pandas/_typing.py index 3e584774e539a..760908a06644a 100644 --- a/pandas/_typing.py +++ b/pandas/_typing.py @@ -57,10 +57,10 @@ from pandas.core.indexes.base import Index from pandas.core.internals import ( ArrayManager, - BlockManager, SingleArrayManager, SingleBlockManager, ) + from pandas.core.internals.managers import BlockManager from pandas.core.resample import Resampler from pandas.core.series import Series from pandas.core.window.rolling import BaseWindow diff --git a/pandas/compat/pickle_compat.py b/pandas/compat/pickle_compat.py index 9d48035213126..4ce1642a3f2a7 100644 --- a/pandas/compat/pickle_compat.py +++ b/pandas/compat/pickle_compat.py @@ -181,6 +181,10 @@ def __new__(cls) -> DataFrame: # type: ignore[misc] "pandas.compat.pickle_compat", "_LoadSparseFrame", ), + ("pandas.core.internals", "BlockManager"): ( + "pandas.core.internals.managers", + "BlockManager", + ), } diff --git a/pandas/core/arraylike.py b/pandas/core/arraylike.py index 588fe8adc7241..dbe0c4b31d9bd 100644 --- a/pandas/core/arraylike.py +++ b/pandas/core/arraylike.py @@ -243,7 +243,7 @@ def array_ufunc(self, ufunc: np.ufunc, method: str, *inputs: Any, **kwargs: Any) numpy.org/doc/stable/reference/arrays.classes.html#numpy.class.__array_ufunc__ """ from pandas.core.generic import NDFrame - from pandas.core.internals import BlockManager + from pandas.core.internals.managers import BlockManager cls = type(self) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index a930a46a7f3e8..baa82c71a76a4 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -176,10 +176,7 @@ check_bool_indexer, convert_to_index_sliceable, ) -from pandas.core.internals import ( - ArrayManager, - BlockManager, -) +from pandas.core.internals import ArrayManager from pandas.core.internals.construction import ( arrays_to_mgr, dataclasses_to_dicts, @@ -192,6 +189,7 @@ to_arrays, treat_as_nested, ) +from pandas.core.internals.managers import BlockManager from pandas.core.reshape.melt import melt from pandas.core.series import Series from pandas.core.sorting import ( diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 67533259ae0c2..9570a8982889d 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -137,10 +137,10 @@ ) from pandas.core.internals import ( ArrayManager, - BlockManager, SingleArrayManager, ) from pandas.core.internals.construction import mgr_to_mgr +from pandas.core.internals.managers import BlockManager from pandas.core.missing import find_valid_index from pandas.core.ops import align_method_FRAME from pandas.core.reshape.concat import concat diff --git a/pandas/core/internals/__init__.py b/pandas/core/internals/__init__.py index ea9f9abc4a4c7..df5e71d096387 100644 --- a/pandas/core/internals/__init__.py +++ b/pandas/core/internals/__init__.py @@ -1,4 +1,7 @@ -from pandas.core.internals.api import make_block # pseudo-public version +from pandas.core.internals.api import ( # pseudo-public version + BlockManager, + make_block, +) from pandas.core.internals.array_manager import ( ArrayManager, SingleArrayManager, @@ -20,7 +23,6 @@ ) from pandas.core.internals.concat import concatenate_managers from pandas.core.internals.managers import ( - BlockManager, SingleBlockManager, create_block_manager_from_arrays, create_block_manager_from_blocks, diff --git a/pandas/core/internals/api.py b/pandas/core/internals/api.py index c52753f95449a..22b7e26ea3ce2 100644 --- a/pandas/core/internals/api.py +++ b/pandas/core/internals/api.py @@ -6,7 +6,14 @@ 2) Use only functions exposed here (or in core.internals) """ -from typing import Optional +from __future__ import annotations + +from typing import ( + TYPE_CHECKING, + Optional, + Sequence, + cast, +) import numpy as np @@ -24,6 +31,10 @@ extract_pandas_array, get_block_type, ) +from pandas.core.internals.managers import BlockManager as BM + +if TYPE_CHECKING: + from pandas import Index def make_block( @@ -81,3 +92,31 @@ def _maybe_infer_ndim(values, placement: BlockPlacement, ndim: Optional[int]) -> else: ndim = values.ndim return ndim + + +# TODO: deprecate following https://github.com/dask/fastparquet/pull/571 +class BlockManager(BM): + """ + For backwards-compatibility for fastparquet, we patch incorrectly-shaped + blocks on construction. + """ + + def __new__( + cls, + blocks: Sequence[Block], + axes: Sequence[Index], + verify_integrity: bool = True, + ): + blocks = list(blocks) + for blk in blocks: + if isinstance(blk, DatetimeTZBlock): + fixed = ensure_block_shape(blk.values, 2) + fixed = cast(DatetimeArray, fixed) + blk.values = fixed + + return BM(blocks, axes, verify_integrity=verify_integrity) + + def __init__(self, *args, **kwargs): + raise NotImplementedError( + "No instance of this wrapper class should ever by fully instantiated." + ) diff --git a/pandas/core/internals/concat.py b/pandas/core/internals/concat.py index 5c0425e0c2515..11213f3739ad4 100644 --- a/pandas/core/internals/concat.py +++ b/pandas/core/internals/concat.py @@ -139,13 +139,7 @@ def concatenate_managers( # than concat_compat values = np.concatenate(vals, axis=blk.ndim - 1) else: - # TODO(EA2D): special-casing not needed with 2D EAs - if all(x.ndim == blk.ndim for x in vals): - # i.e. DTA/TDA - values = concat_compat(vals, axis=blk.ndim - 1) - else: - values = concat_compat(vals) - + values = concat_compat(vals, axis=1) values = ensure_block_shape(values, blk.ndim) if blk.values.dtype == values.dtype: diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py index 02a723902271e..434aaf1e09af6 100644 --- a/pandas/io/pytables.py +++ b/pandas/io/pytables.py @@ -92,7 +92,7 @@ ) from pandas.core.construction import extract_array from pandas.core.indexes.api import ensure_index -from pandas.core.internals import BlockManager +from pandas.core.internals.managers import BlockManager from pandas.io.common import stringify_path from pandas.io.formats.printing import ( diff --git a/pandas/tests/internals/test_internals.py b/pandas/tests/internals/test_internals.py index 76359187c42c1..5c0db4190dc32 100644 --- a/pandas/tests/internals/test_internals.py +++ b/pandas/tests/internals/test_internals.py @@ -32,11 +32,11 @@ TimedeltaArray, ) from pandas.core.internals import ( - BlockManager, SingleBlockManager, make_block, ) from pandas.core.internals.blocks import new_block +from pandas.core.internals.managers import BlockManager @pytest.fixture(params=[new_block, make_block]) diff --git a/pandas/tests/internals/test_managers.py b/pandas/tests/internals/test_managers.py index 4ca6e8b6598aa..e41b24ea54ab0 100644 --- a/pandas/tests/internals/test_managers.py +++ b/pandas/tests/internals/test_managers.py @@ -5,10 +5,8 @@ import pandas as pd import pandas._testing as tm -from pandas.core.internals import ( - ArrayManager, - BlockManager, -) +from pandas.core.internals import ArrayManager +from pandas.core.internals.managers import BlockManager def test_dataframe_creation(): From 42ca357a11553ec538893cb61d703d0bca33296d Mon Sep 17 00:00:00 2001 From: Brock Date: Sun, 14 Mar 2021 20:00:12 -0700 Subject: [PATCH 43/61] maybe_coerce_values where appropriate --- pandas/core/internals/blocks.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index 60a417beb56c2..6622db42799bc 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -1165,6 +1165,7 @@ def _interpolate_with_fill( limit_area=limit_area, ) + values = maybe_coerce_values(values) blocks = [self.make_block_same_class(values)] return self._maybe_downcast(blocks, downcast) @@ -1221,6 +1222,7 @@ def func(yvalues: np.ndarray) -> np.ndarray: # interp each column independently interp_values = np.apply_along_axis(func, axis, data) + interp_values = maybe_coerce_values(interp_values) blocks = [self.make_block_same_class(interp_values)] return self._maybe_downcast(blocks, downcast) @@ -1927,13 +1929,11 @@ def diff(self, n: int, axis: int = 0) -> List[Block]: values = self.values new_values = values - values.shift(n, axis=axis) - new_values = maybe_coerce_values(new_values) return [self.make_block(new_values)] def shift(self, periods: int, axis: int = 0, fill_value: Any = None) -> List[Block]: values = self.values new_values = values.shift(periods, fill_value=fill_value, axis=axis) - new_values = maybe_coerce_values(new_values) return [self.make_block_same_class(new_values)] def fillna( From 9e25bd5692dc54651e0365469cc2682093ff78f7 Mon Sep 17 00:00:00 2001 From: Brock Date: Wed, 17 Mar 2021 20:39:30 -0700 Subject: [PATCH 44/61] update exception message --- pandas/tests/groupby/test_groupby.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index 50572665db6b5..de508b8cd78ec 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -820,7 +820,7 @@ def test_groupby_multi_corner(df): tm.assert_frame_equal(agged, expected) -def test_omit_nuisance(df, using_array_manager): +def test_omit_nuisance(df): grouped = df.groupby("A") result = grouped.mean() @@ -841,8 +841,6 @@ def test_omit_nuisance(df, using_array_manager): # won't work with axis = 1 grouped = df.groupby({"A": 0, "C": 0, "D": 1, "E": 1}, axis=1) msg = "'DatetimeArray' does not implement reduction 'sum'" - if using_array_manager: - msg = "reduction operation 'sum' not allowed for this dtype" with pytest.raises(TypeError, match=msg): grouped.agg(lambda x: x.sum(0, numeric_only=False)) From b775a29a0764b2a204eab88c9aa91819404f2e17 Mon Sep 17 00:00:00 2001 From: Brock Date: Mon, 22 Mar 2021 07:47:49 -0700 Subject: [PATCH 45/61] comment --- pandas/core/frame.py | 1 + 1 file changed, 1 insertion(+) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index e24bd524f3e6e..5ec69eab66c4e 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -9543,6 +9543,7 @@ def _reduce( stacklevel=5, ) cols = self.columns[~dtype_is_dt] + # TODO: avoid making a copy here self = self[cols] # TODO: Make other agg func handle axis=None properly GH#21597 From cbf037008ab970fdd3cddd361ce41c78f820f396 Mon Sep 17 00:00:00 2001 From: Brock Date: Tue, 30 Mar 2021 10:45:37 -0700 Subject: [PATCH 46/61] remove commented-out --- pandas/core/internals/concat.py | 2 +- pandas/core/internals/construction.py | 1 - 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/pandas/core/internals/concat.py b/pandas/core/internals/concat.py index 9945410207c70..29f9ae1798642 100644 --- a/pandas/core/internals/concat.py +++ b/pandas/core/internals/concat.py @@ -435,7 +435,7 @@ def _concatenate_join_units( # the non-EA values are 2D arrays with shape (1, n) # error: Invalid index type "Tuple[int, slice]" for - # "Union[ExtensionArray, ndarray]"; expected type "Union[int, slice, ndarray]" + # "Union[ExtensionArray, ndarray]"; expected type "Union[int, slice, ndarray]" to_concat = [ t if is_1d_only_ea_obj(t) else t[0, :] # type: ignore[index] for t in to_concat diff --git a/pandas/core/internals/construction.py b/pandas/core/internals/construction.py index 2b2a70fffe4c3..61c4f9746cf2a 100644 --- a/pandas/core/internals/construction.py +++ b/pandas/core/internals/construction.py @@ -340,7 +340,6 @@ def ndarray_to_mgr( else: datelike_vals = maybe_infer_to_datetimelike(values) - # datelike_vals = ensure_block_shape(datelike_vals, 2) nb = new_block(datelike_vals, placement=slice(len(columns)), ndim=2) block_values = [nb] else: From 319c22d9c62269915db1164372bf23b6a83f5c15 Mon Sep 17 00:00:00 2001 From: Brock Date: Tue, 30 Mar 2021 11:21:21 -0700 Subject: [PATCH 47/61] mypy fixup --- pandas/core/internals/blocks.py | 11 +++-------- pandas/core/internals/managers.py | 4 +--- 2 files changed, 4 insertions(+), 11 deletions(-) diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index 701ba6903672b..e4835b658b694 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -5,7 +5,6 @@ from typing import ( TYPE_CHECKING, Any, - Callable, List, Optional, Tuple, @@ -599,8 +598,6 @@ def astype(self, dtype, copy: bool = False, errors: str = "raise"): Block """ values = self.values - if values.dtype.kind in ["m", "M"]: - values = self.array_values new_values = astype_array_safe(values, dtype, copy=copy, errors=errors) @@ -1749,14 +1746,12 @@ class HybridMixin: Mixin for Blocks backed (maybe indirectly) by ExtensionArrays. """ - array_values: Callable - def _can_hold_element(self, element: Any) -> bool: - values = self.array_values + # error: "HybridMixin" has no attribute "values" + values = self.values # type: ignore[attr-defined] try: - # error: "Callable[..., Any]" has no attribute "_validate_setitem_value" - values._validate_setitem_value(element) # type: ignore[attr-defined] + values._validate_setitem_value(element) return True except (ValueError, TypeError): return False diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py index 5d9f107e8a79c..c8eed87c74b14 100644 --- a/pandas/core/internals/managers.py +++ b/pandas/core/internals/managers.py @@ -360,9 +360,7 @@ def unpickle_block(values, mgr_locs, ndim: int): if is_datetime64tz_dtype(vals.dtype): # older versions will hold in DatetimeIndex instead of DTA vals = extract_array(vals, extract_numpy=True) - if vals.ndim == 1 and ndim == 2: - vals = vals.reshape(1, -1) - blk["values"] = vals + blk["values"] = ensure_block_shape(vals, ndim=ndim) self.blocks = tuple( unpickle_block(b["values"], b["mgr_locs"], ndim=ndim) From 1c493170edbe99354f530936dcb1154b7a9b3c55 Mon Sep 17 00:00:00 2001 From: Brock Date: Wed, 31 Mar 2021 09:44:52 -0700 Subject: [PATCH 48/61] de-privatize --- pandas/core/dtypes/concat.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/core/dtypes/concat.py b/pandas/core/dtypes/concat.py index 0e46f61d042ab..475aa3761d575 100644 --- a/pandas/core/dtypes/concat.py +++ b/pandas/core/dtypes/concat.py @@ -111,13 +111,13 @@ def is_nonempty(x) -> bool: to_concat = non_empties kinds = {obj.dtype.kind for obj in to_concat} - _contains_datetime = any(kind in ["m", "M"] for kind in kinds) + contains_datetime = any(kind in ["m", "M"] for kind in kinds) all_empty = not len(non_empties) single_dtype = len({x.dtype for x in to_concat}) == 1 any_ea = any(isinstance(x.dtype, ExtensionDtype) for x in to_concat) - if _contains_datetime: + if contains_datetime: return _concat_datetime(to_concat, axis=axis) if any_ea: From 780fa1cd93f239addb0753b1444f009bb2fb2272 Mon Sep 17 00:00:00 2001 From: Brock Date: Fri, 2 Apr 2021 09:55:39 -0700 Subject: [PATCH 49/61] one more kludge revert --- pandas/core/frame.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 1107bb7db6873..02ba678f843de 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -178,7 +178,10 @@ check_bool_indexer, convert_to_index_sliceable, ) -from pandas.core.internals import ArrayManager +from pandas.core.internals import ( + ArrayManager, + BlockManager, +) from pandas.core.internals.construction import ( arrays_to_mgr, dataclasses_to_dicts, @@ -191,7 +194,6 @@ to_arrays, treat_as_nested, ) -from pandas.core.internals.managers import BlockManager from pandas.core.reshape.melt import melt from pandas.core.series import Series from pandas.core.sorting import ( From da9f50cf94c99e3363e51ca495175ffc007f7fbe Mon Sep 17 00:00:00 2001 From: Brock Date: Fri, 2 Apr 2021 09:56:02 -0700 Subject: [PATCH 50/61] one more kludge revert --- pandas/io/pytables.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py index 964f2e93bf301..8a3e3ea556bea 100644 --- a/pandas/io/pytables.py +++ b/pandas/io/pytables.py @@ -92,7 +92,7 @@ ) from pandas.core.construction import extract_array from pandas.core.indexes.api import ensure_index -from pandas.core.internals.managers import BlockManager +from pandas.core.internals import BlockManager from pandas.io.common import stringify_path from pandas.io.formats.printing import ( From acc707da357740ba739d717fb0a4a1e680b17c7b Mon Sep 17 00:00:00 2001 From: Brock Date: Fri, 2 Apr 2021 15:09:33 -0700 Subject: [PATCH 51/61] trim diff --- pandas/core/array_algos/take.py | 2 +- pandas/core/sorting.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/core/array_algos/take.py b/pandas/core/array_algos/take.py index 714bc6b4011a0..f34bfd75fd961 100644 --- a/pandas/core/array_algos/take.py +++ b/pandas/core/array_algos/take.py @@ -99,7 +99,7 @@ def take_nd( # i.e. DatetimeArray, TimedeltaArray arr = cast("NDArrayBackedExtensionArray", arr) return arr.take( - indexer, axis=axis, fill_value=fill_value, allow_fill=allow_fill + indexer, fill_value=fill_value, allow_fill=allow_fill, axis=axis ) return arr.take(indexer, fill_value=fill_value, allow_fill=allow_fill) diff --git a/pandas/core/sorting.py b/pandas/core/sorting.py index 9d2fba823cfd6..816c1d9195778 100644 --- a/pandas/core/sorting.py +++ b/pandas/core/sorting.py @@ -407,7 +407,7 @@ def nargminmax(values, method: str, axis: int = 0): ---------- values : ExtensionArray method : {"argmax", "argmin"} - axis : int, default 0 + axis: int, default 0 Returns ------- From a0c4d0a2912eb2f25dc70ca88501a8fd626b98b3 Mon Sep 17 00:00:00 2001 From: Brock Date: Sat, 3 Apr 2021 16:03:02 -0700 Subject: [PATCH 52/61] trim diff --- pandas/core/array_algos/quantile.py | 2 +- pandas/core/internals/blocks.py | 14 +++++++------- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/pandas/core/array_algos/quantile.py b/pandas/core/array_algos/quantile.py index 640f61b729d31..88616aa66d117 100644 --- a/pandas/core/array_algos/quantile.py +++ b/pandas/core/array_algos/quantile.py @@ -32,7 +32,7 @@ def quantile_compat(values: ArrayLike, qs: np.ndarray, interpolation: str) -> Ar ------- np.ndarray or ExtensionArray """ - if isinstance(values.dtype, np.dtype): + if isinstance(values, np.ndarray): # i.e. np.ndarray, DatetimeArray, TimedeltaArray fill_value = na_value_for_dtype(values.dtype, compat=False) mask = isna(values) diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index d4f84bf14e1d8..cedb6f617a2f3 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -1650,7 +1650,7 @@ class NumericBlock(Block): class NDArrayBackedExtensionBlock(Block): """ - Block backed by an NDArrayBackedExtensionArray, supporting 2D values. + Block backed by an NDArrayBackedExtensionArray """ values: NDArrayBackedExtensionArray @@ -1659,6 +1659,12 @@ class NDArrayBackedExtensionBlock(Block): def array_values(self) -> NDArrayBackedExtensionArray: return self.values + @property + def is_view(self) -> bool: + """ return a boolean if I am possibly a view """ + # check the ndarray values of the DatetimeIndex values + return self.values._ndarray.base is not None + def get_values(self, dtype: Optional[DtypeObj] = None) -> np.ndarray: # We override instead of putting the np.asarray in Block.values for # performance. @@ -1688,12 +1694,6 @@ def where(self, other, cond, errors="raise") -> List[Block]: nb = self.make_block_same_class(res_values) return [nb] - @property - def is_view(self) -> bool: - """ return a boolean if I am possibly a view """ - # check the ndarray values of the DatetimeIndex values - return self.values._ndarray.base is not None - def setitem(self, indexer, value): if not self._can_hold_element(value): # TODO: general case needs casting logic. From 2f6236b36cb3d9679d87a6e39b4c91e0ca1c0c86 Mon Sep 17 00:00:00 2001 From: Brock Date: Sun, 4 Apr 2021 14:16:36 -0700 Subject: [PATCH 53/61] trim diff --- pandas/core/array_algos/quantile.py | 1 - pandas/core/frame.py | 1 - pandas/core/internals/array_manager.py | 2 -- pandas/core/internals/construction.py | 4 +--- 4 files changed, 1 insertion(+), 7 deletions(-) diff --git a/pandas/core/array_algos/quantile.py b/pandas/core/array_algos/quantile.py index 88616aa66d117..efa36a5bd3ae9 100644 --- a/pandas/core/array_algos/quantile.py +++ b/pandas/core/array_algos/quantile.py @@ -33,7 +33,6 @@ def quantile_compat(values: ArrayLike, qs: np.ndarray, interpolation: str) -> Ar np.ndarray or ExtensionArray """ if isinstance(values, np.ndarray): - # i.e. np.ndarray, DatetimeArray, TimedeltaArray fill_value = na_value_for_dtype(values.dtype, compat=False) mask = isna(values) return _quantile_with_mask(values, mask, fill_value, qs, interpolation) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 02ba678f843de..0d26060d5ae6a 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -9593,7 +9593,6 @@ def _reduce( stacklevel=5, ) cols = self.columns[~dtype_is_dt] - # TODO: avoid making a copy here self = self[cols] # TODO: Make other agg func handle axis=None properly GH#21597 diff --git a/pandas/core/internals/array_manager.py b/pandas/core/internals/array_manager.py index e6c6683148fad..5a1fea539bdce 100644 --- a/pandas/core/internals/array_manager.py +++ b/pandas/core/internals/array_manager.py @@ -131,7 +131,6 @@ def __init__( # Note: we are storing the axes in "_axes" in the (row, columns) order # which contrasts the order how it is stored in BlockManager self._axes = axes - self.arrays = arrays if verify_integrity: @@ -496,7 +495,6 @@ def apply_with_block(self: T, f, align_keys=None, swap_axis=True, **kwargs) -> T if self.ndim == 2 and arr.ndim == 2: # 2D for np.ndarray or DatetimeArray/TimedeltaArray assert len(arr) == 1 - # error: Invalid index type "Tuple[int, slice]" for # "Union[ndarray, ExtensionArray]"; expected type # "Union[int, slice, ndarray]" diff --git a/pandas/core/internals/construction.py b/pandas/core/internals/construction.py index 85de26ea62493..ada23b4784e26 100644 --- a/pandas/core/internals/construction.py +++ b/pandas/core/internals/construction.py @@ -347,12 +347,10 @@ def ndarray_to_mgr( nb = new_block(datelike_vals, placement=slice(len(columns)), ndim=2) block_values = [nb] else: - new_values = values - nb = new_block(new_values, placement=slice(len(columns)), ndim=2) + nb = new_block(values, placement=slice(len(columns)), ndim=2) block_values = [nb] if len(columns) == 0: - # TODO: require len(values) == 0? block_values = [] return create_block_manager_from_blocks(block_values, [columns, index]) From 28f241b3f3f66f9af8f6e512d0dfdfcbf4c8eb7f Mon Sep 17 00:00:00 2001 From: Brock Date: Mon, 5 Apr 2021 09:43:41 -0700 Subject: [PATCH 54/61] REF: implement EABackedBlock --- pandas/core/internals/blocks.py | 57 +++++++++++++++------------------ 1 file changed, 25 insertions(+), 32 deletions(-) diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index 9a2b3be4b66e2..1813af903fc75 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -1358,6 +1358,31 @@ def delete(self, loc) -> None: # _cache not yet initialized pass + @cache_readonly + def array_values(self) -> ExtensionArray: + return self.values + + def get_values(self, dtype: Optional[DtypeObj] = None) -> np.ndarray: + """ + return object dtype as boxed values, such as Timestamps/Timedelta + """ + values = self.values + if dtype == _dtype_obj: + values = values.astype(object) + # TODO(EA2D): reshape not needed with 2D EAs + return np.asarray(values).reshape(self.shape) + + def interpolate( + self, method="pad", axis=0, inplace=False, limit=None, fill_value=None, **kwargs + ): + values = self.values + if values.ndim == 2 and axis == 0: + # NDArrayBackedExtensionArray.fillna assumes axis=1 + new_values = values.T.fillna(value=fill_value, method=method, limit=limit).T + else: + new_values = values.fillna(value=fill_value, method=method, limit=limit) + return self.make_block_same_class(new_values) + class ExtensionBlock(EABackedBlock): """ @@ -1482,15 +1507,6 @@ def setitem(self, indexer, value): self.values[indexer] = value return self - def get_values(self, dtype: Optional[DtypeObj] = None) -> np.ndarray: - # ExtensionArrays must be iterable, so this works. - # TODO(EA2D): reshape not needed with 2D EAs - return np.asarray(self.values).reshape(self.shape) - - @cache_readonly - def array_values(self) -> ExtensionArray: - return self.values - def take_nd( self, indexer, @@ -1562,12 +1578,6 @@ def fillna( values = self.values.fillna(value=value, limit=limit) return [self.make_block_same_class(values=values)] - def interpolate( - self, method="pad", axis=0, inplace=False, limit=None, fill_value=None, **kwargs - ): - new_values = self.values.fillna(value=fill_value, method=method, limit=limit) - return self.make_block_same_class(new_values) - def diff(self, n: int, axis: int = 1) -> List[Block]: if axis == 0 and n != 0: # n==0 case will be a no-op so let is fall through @@ -1675,27 +1685,12 @@ class NDArrayBackedExtensionBlock(EABackedBlock): values: NDArrayBackedExtensionArray - @property - def array_values(self) -> NDArrayBackedExtensionArray: - return self.values - @property def is_view(self) -> bool: """ return a boolean if I am possibly a view """ # check the ndarray values of the DatetimeIndex values return self.values._ndarray.base is not None - def get_values(self, dtype: Optional[DtypeObj] = None) -> np.ndarray: - """ - return object dtype as boxed values, such as Timestamps/Timedelta - """ - values = self.values - if dtype == _dtype_obj: - # DTA/TDA constructor and astype can handle 2D - values = values.astype(object) - # TODO(EA2D): reshape not needed with 2D EAs - return np.asarray(values).reshape(self.shape) - def iget(self, key): # GH#31649 we need to wrap scalars in Timestamp/Timedelta # TODO(EA2D): this can be removed if we ever have 2D EA @@ -1799,8 +1794,6 @@ class DatetimeTZBlock(ExtensionBlock, DatetimeLikeBlock): putmask = NDArrayBackedExtensionBlock.putmask fillna = NDArrayBackedExtensionBlock.fillna - get_values = NDArrayBackedExtensionBlock.get_values - # error: Incompatible types in assignment (expression has type # "Callable[[NDArrayBackedExtensionBlock], bool]", base class "ExtensionBlock" # defined the type as "bool") [assignment] From 845781825f4134cf0e8f15cf5fbc6284dcb57dc4 Mon Sep 17 00:00:00 2001 From: Brock Date: Mon, 5 Apr 2021 09:54:34 -0700 Subject: [PATCH 55/61] TST: test_delitem_series --- pandas/tests/extension/base/setitem.py | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/pandas/tests/extension/base/setitem.py b/pandas/tests/extension/base/setitem.py index 34c9c097fbfd5..0392ea794237c 100644 --- a/pandas/tests/extension/base/setitem.py +++ b/pandas/tests/extension/base/setitem.py @@ -356,3 +356,14 @@ def test_setitem_series(self, data, full_indexer): data.astype(object), index=ser.index, name="data", dtype=object ) self.assert_series_equal(result, expected) + + def test_delitem_series(self, data): + # GH#40763 + ser = pd.Series(data, name="data") + + taker = np.arange(len(ser)) + taker = np.delete(taker, 1) + + expected = ser[taker] + del ser[1] + self.assert_series_equal(ser, expected) From 207f41a49272c8b1dffb579501c4d840671d63e7 Mon Sep 17 00:00:00 2001 From: Brock Date: Wed, 7 Apr 2021 10:01:13 -0700 Subject: [PATCH 56/61] pre-commit fixup --- pandas/core/internals/api.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/pandas/core/internals/api.py b/pandas/core/internals/api.py index 8a06d665d4a86..2f8686fd38929 100644 --- a/pandas/core/internals/api.py +++ b/pandas/core/internals/api.py @@ -8,8 +8,6 @@ """ from __future__ import annotations -from typing import Optional - import numpy as np from pandas._libs.internals import BlockPlacement @@ -33,7 +31,7 @@ def make_block( - values, placement, klass=None, ndim=None, dtype: Optional[Dtype] = None + values, placement, klass=None, ndim=None, dtype: Dtype | None = None ) -> Block: """ This is a pseudo-public analogue to blocks.new_block. @@ -73,7 +71,7 @@ def make_block( return klass(values, ndim=ndim, placement=placement) -def maybe_infer_ndim(values, placement: BlockPlacement, ndim: Optional[int]) -> int: +def maybe_infer_ndim(values, placement: BlockPlacement, ndim: int | None) -> int: """ If `ndim` is not provided, infer it from placment and values. """ From 2637cf435d4489f3f7f58a2f3acb62f100f21dae Mon Sep 17 00:00:00 2001 From: Brock Date: Wed, 7 Apr 2021 14:43:05 -0700 Subject: [PATCH 57/61] revert no-longer-needed --- pandas/tests/extension/test_numpy.py | 22 ---------------------- 1 file changed, 22 deletions(-) diff --git a/pandas/tests/extension/test_numpy.py b/pandas/tests/extension/test_numpy.py index 4c8c774abbdc7..35e5abe9ce4e7 100644 --- a/pandas/tests/extension/test_numpy.py +++ b/pandas/tests/extension/test_numpy.py @@ -60,27 +60,6 @@ def dtype(request): return PandasDtype(np.dtype(request.param)) -def __init__(self, values, placement, ndim): - # libinternals.Block.__cinit__ gets called automatically before __init__, - # after which this __init__ is called - if not isinstance(placement, blocks.libinternals.BlockPlacement): - placement = blocks.libinternals.BlockPlacement(placement) - - # Maybe infer ndim from placement - if ndim is None: - if len(placement) != 1: - ndim = 1 - else: - ndim = 2 - - if isinstance(values, PandasArray) and values.ndim > ndim: - assert values.shape[0] == 1 - values = values[0] - - self.values = values - self.placement = placement - - @pytest.fixture def allow_in_pandas(monkeypatch): """ @@ -100,7 +79,6 @@ def allow_in_pandas(monkeypatch): """ with monkeypatch.context() as m: m.setattr(PandasArray, "_typ", "extension") - m.setattr(blocks.ExtensionBlock, "__init__", __init__) m.setattr(managers, "_extract_array", _extract_array_patched) m.setattr(blocks, "can_hold_element", _can_hold_element_patched) yield From 4d8bf6a51a2c066bc63360a7adb70ac32b4f5bde Mon Sep 17 00:00:00 2001 From: Brock Date: Tue, 13 Apr 2021 14:27:42 -0700 Subject: [PATCH 58/61] restore import --- pandas/core/internals/concat.py | 1 + 1 file changed, 1 insertion(+) diff --git a/pandas/core/internals/concat.py b/pandas/core/internals/concat.py index 7f7057ba3bab8..04c802eb4fe0f 100644 --- a/pandas/core/internals/concat.py +++ b/pandas/core/internals/concat.py @@ -45,6 +45,7 @@ from pandas.core.arrays import ( Categorical, DatetimeArray, + ExtensionArray, ) from pandas.core.construction import ensure_wrapped_if_datetimelike from pandas.core.internals.array_manager import ( From 2562b8fb7e65ee31a3991021097e4b1fee703900 Mon Sep 17 00:00:00 2001 From: Brock Date: Thu, 22 Apr 2021 16:44:39 -0700 Subject: [PATCH 59/61] revert removal of TDA check --- pandas/core/internals/construction.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/pandas/core/internals/construction.py b/pandas/core/internals/construction.py index 66f5cb9298db8..83ecdbce5fa80 100644 --- a/pandas/core/internals/construction.py +++ b/pandas/core/internals/construction.py @@ -57,6 +57,7 @@ from pandas.core.arrays import ( Categorical, ExtensionArray, + TimedeltaArray, ) from pandas.core.construction import ( extract_array, @@ -486,6 +487,11 @@ def treat_as_nested(data) -> bool: def _prep_ndarray(values, copy: bool = True) -> np.ndarray: + if isinstance(values, TimedeltaArray): + # On older numpy, np.asarray below apparently does not call __array__, + # so nanoseconds get dropped. + values = values._ndarray + if not isinstance(values, (np.ndarray, ABCSeries, Index)): if len(values) == 0: return np.empty((0, 0), dtype=object) From 96f0323c005d45530d6d4897db99b40684ab93d5 Mon Sep 17 00:00:00 2001 From: Brock Date: Fri, 23 Apr 2021 07:58:23 -0700 Subject: [PATCH 60/61] remove extra get_values_for_json --- pandas/core/internals/blocks.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index 4276aadd8edd6..5b78cd4a5accf 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -1778,10 +1778,6 @@ class DatetimeLikeBlock(NDArrayBackedExtensionBlock): is_numeric = False values: DatetimeArray | TimedeltaArray - def get_block_values_for_json(self): - # Not necessary to override, but helps perf - return self.values._ndarray - class DatetimeTZBlock(DatetimeLikeBlock): """ implement a datetime64 block with a tz attribute """ From 6fded81dd3741ba725d9c0c9e97fc2f026f3e47d Mon Sep 17 00:00:00 2001 From: Brock Date: Mon, 17 May 2021 08:51:32 -0700 Subject: [PATCH 61/61] whatsnew perf note --- doc/source/whatsnew/v1.3.0.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/doc/source/whatsnew/v1.3.0.rst b/doc/source/whatsnew/v1.3.0.rst index f6b5c30635980..ba0c4b99f861f 100644 --- a/doc/source/whatsnew/v1.3.0.rst +++ b/doc/source/whatsnew/v1.3.0.rst @@ -673,6 +673,7 @@ Performance improvements - Performance improvement for concatenation of data with type :class:`CategoricalDtype` (:issue:`40193`) - Performance improvement in :meth:`.GroupBy.cummin` and :meth:`.GroupBy.cummax` with nullable data types (:issue:`37493`) - Performance improvement in :meth:`Series.nunique` with nan values (:issue:`40865`) +- Performance improvement in :meth:`DataFrame.transpose`, :meth:`Series.unstack` with ``DatetimeTZDtype`` (:issue:`40149`) .. ---------------------------------------------------------------------------