diff --git a/doc/source/whatsnew/v0.24.0.rst b/doc/source/whatsnew/v0.24.0.rst index e770a9e3c47f8..7628c53cefa06 100644 --- a/doc/source/whatsnew/v0.24.0.rst +++ b/doc/source/whatsnew/v0.24.0.rst @@ -430,6 +430,7 @@ Backwards incompatible API changes - ``max_rows`` and ``max_cols`` parameters removed from :class:`HTMLFormatter` since truncation is handled by :class:`DataFrameFormatter` (:issue:`23818`) - :func:`read_csv` will now raise a ``ValueError`` if a column with missing values is declared as having dtype ``bool`` (:issue:`20591`) - The column order of the resultant :class:`DataFrame` from :meth:`MultiIndex.to_frame` is now guaranteed to match the :attr:`MultiIndex.names` order. (:issue:`22420`) +- Incorrectly passing a :class:`DatetimeIndex` to :meth:`MultiIndex.from_tuples`, rather than a sequence of tuples, now raises a ``TypeError`` rather than a ``ValueError`` (:issue:`24024`) - :func:`pd.offsets.generate_range` argument ``time_rule`` has been removed; use ``offset`` instead (:issue:`24157`) Percentage change on groupby @@ -1368,6 +1369,7 @@ Datetimelike - Bug in :class:`DataFrame` with ``datetime64[ns]`` dtype subtracting ``np.datetime64`` object with non-nanosecond unit failing to convert to nanoseconds (:issue:`18874`, :issue:`22163`) - Bug in :class:`DataFrame` comparisons against ``Timestamp``-like objects failing to raise ``TypeError`` for inequality checks with mismatched types (:issue:`8932`, :issue:`22163`) - Bug in :class:`DataFrame` with mixed dtypes including ``datetime64[ns]`` incorrectly raising ``TypeError`` on equality comparisons (:issue:`13128`, :issue:`22163`) +- Bug in :attr:`DataFrame.values` returning a :class:`DatetimeIndex` for a single-column ``DataFrame`` with tz-aware datetime values. 
Now a 2-D :class:`numpy.ndarray` of :class:`Timestamp` objects is returned (:issue:`24024`) - Bug in :meth:`DataFrame.eq` comparison against ``NaT`` incorrectly returning ``True`` or ``NaN`` (:issue:`15697`, :issue:`22163`) - Bug in :class:`DatetimeIndex` subtraction that incorrectly failed to raise ``OverflowError`` (:issue:`22492`, :issue:`22508`) - Bug in :class:`DatetimeIndex` incorrectly allowing indexing with ``Timedelta`` object (:issue:`20464`) @@ -1384,6 +1386,7 @@ Datetimelike - Bug in :func:`period_range` ignoring the frequency of ``start`` and ``end`` when those are provided as :class:`Period` objects (:issue:`20535`). - Bug in :class:`PeriodIndex` with attribute ``freq.n`` greater than 1 where adding a :class:`DateOffset` object would return incorrect results (:issue:`23215`) - Bug in :class:`Series` that interpreted string indices as lists of characters when setting datetimelike values (:issue:`23451`) +- Bug in :class:`DataFrame` when creating a new column from an ndarray of :class:`Timestamp` objects with timezones creating an object-dtype column, rather than datetime with timezone (:issue:`23932`) - Bug in :class:`Timestamp` constructor which would drop the frequency of an input :class:`Timestamp` (:issue:`22311`) - Bug in :class:`DatetimeIndex` where calling ``np.array(dtindex, dtype=object)`` would incorrectly return an array of ``long`` objects (:issue:`23524`) - Bug in :class:`Index` where passing a timezone-aware :class:`DatetimeIndex` and `dtype=object` would incorrectly raise a ``ValueError`` (:issue:`23524`) diff --git a/pandas/_libs/src/ujson/python/objToJSON.c b/pandas/_libs/src/ujson/python/objToJSON.c index 9662c59dddf4c..d0caeb3333548 100644 --- a/pandas/_libs/src/ujson/python/objToJSON.c +++ b/pandas/_libs/src/ujson/python/objToJSON.c @@ -228,6 +228,11 @@ static PyObject *get_values(PyObject *obj) { PRINTMARK(); if (values && !PyArray_CheckExact(values)) { + + if (PyObject_HasAttrString(values, "to_numpy")) { + values = 
PyObject_CallMethod(values, "to_numpy", NULL); + } + if (PyObject_HasAttrString(values, "values")) { PyObject *subvals = get_values(values); PyErr_Clear(); @@ -279,8 +284,8 @@ static PyObject *get_values(PyObject *obj) { repr = PyString_FromString(""); } - PyErr_Format(PyExc_ValueError, "%s or %s are not JSON serializable yet", - PyString_AS_STRING(repr), PyString_AS_STRING(typeRepr)); + PyErr_Format(PyExc_ValueError, "%R or %R are not JSON serializable yet", + repr, typeRepr); Py_DECREF(repr); Py_DECREF(typeRepr); diff --git a/pandas/core/arrays/datetimelike.py b/pandas/core/arrays/datetimelike.py index ab5621d857e89..d233e1d09a1e9 100644 --- a/pandas/core/arrays/datetimelike.py +++ b/pandas/core/arrays/datetimelike.py @@ -47,7 +47,7 @@ def cmp_method(self, other): if isinstance(other, ABCDataFrame): return NotImplemented - if isinstance(other, (np.ndarray, ABCIndexClass, ABCSeries)): + if isinstance(other, (np.ndarray, ABCIndexClass, ABCSeries, cls)): if other.ndim > 0 and len(self) != len(other): raise ValueError('Lengths must match to compare') @@ -1162,9 +1162,10 @@ def _addsub_offset_array(self, other, op): left = lib.values_from_object(self.astype('O')) res_values = op(left, np.array(other)) + kwargs = {} if not is_period_dtype(self): - return type(self)(res_values, freq='infer') - return self._from_sequence(res_values) + kwargs['freq'] = 'infer' + return self._from_sequence(res_values, **kwargs) def _time_shift(self, periods, freq=None): """ diff --git a/pandas/core/arrays/datetimes.py b/pandas/core/arrays/datetimes.py index 8b0565a36648f..f42930929747d 100644 --- a/pandas/core/arrays/datetimes.py +++ b/pandas/core/arrays/datetimes.py @@ -97,6 +97,9 @@ def _dt_array_cmp(cls, op): def wrapper(self, other): meth = getattr(dtl.DatetimeLikeArrayMixin, opname) + # TODO: return NotImplemented for Series / Index and let pandas unbox + # Right now, returning NotImplemented for Index fails because we + # go into the index implementation, which may be a bug? 
other = lib.item_from_zerodim(other) @@ -145,9 +148,16 @@ def wrapper(self, other): return ops.invalid_comparison(self, other, op) else: self._assert_tzawareness_compat(other) - if not hasattr(other, 'asi8'): - # ndarray, Series - other = type(self)(other) + if isinstance(other, (ABCIndexClass, ABCSeries)): + other = other.array + + if (is_datetime64_dtype(other) and + not is_datetime64_ns_dtype(other) or + not hasattr(other, 'asi8')): + # e.g. other.dtype == 'datetime64[s]' + # or an object-dtype ndarray + other = type(self)._from_sequence(other) + result = meth(self, other) o_mask = other._isnan @@ -171,10 +181,24 @@ class DatetimeArrayMixin(dtl.DatetimeLikeArrayMixin, dtl.TimelikeOps, dtl.DatelikeOps): """ - Assumes that subclass __new__/__init__ defines: - tz - _freq - _data + Pandas ExtensionArray for tz-naive or tz-aware datetime data. + + .. versionadded:: 0.24.0 + + Parameters + ---------- + values : Series, Index, DatetimeArray, ndarray + The datetime data. + + For DatetimeArray `values` (or a Series or Index boxing one), + `dtype` and `freq` will be extracted from `values`, with + precedence given to + + dtype : numpy.dtype or DatetimeTZDtype + Note that the only NumPy dtype allowed is 'datetime64[ns]'. + freq : str or Offset, optional + copy : bool, default False + Whether to copy the underlying array of values. 
""" _typ = "datetimearray" _scalar_type = Timestamp @@ -213,38 +237,84 @@ class DatetimeArrayMixin(dtl.DatetimeLikeArrayMixin, _dtype = None # type: Union[np.dtype, DatetimeTZDtype] _freq = None - @classmethod - def _simple_new(cls, values, freq=None, tz=None): - """ - we require the we have a dtype compat for the values - if we are passed a non-dtype compat, then coerce using the constructor - """ - assert isinstance(values, np.ndarray), type(values) + def __init__(self, values, dtype=_NS_DTYPE, freq=None, copy=False): + if isinstance(values, (ABCSeries, ABCIndexClass)): + values = values._values + + if isinstance(values, type(self)): + # validation + dtz = getattr(dtype, 'tz', None) + if dtz and values.tz is None: + dtype = DatetimeTZDtype(tz=dtype.tz) + elif dtz and values.tz: + if not timezones.tz_compare(dtz, values.tz): + msg = ( + "Timezone of the array and 'dtype' do not match. " + "'{}' != '{}'" + ) + raise TypeError(msg.format(dtz, values.tz)) + elif values.tz: + dtype = values.dtype + # freq = validate_values_freq(values, freq) + if freq is None: + freq = values.freq + values = values._data + + if not isinstance(values, np.ndarray): + msg = ( + "Unexpected type '{}'. 'values' must be a DatetimeArray " + "ndarray, or Series or Index containing one of those." + ) + raise ValueError(msg.format(type(values).__name__)) + if values.dtype == 'i8': # for compat with datetime/timedelta/period shared methods, # we can sometimes get here with int64 values. These represent # nanosecond UTC (or tz-naive) unix timestamps values = values.view(_NS_DTYPE) - assert values.dtype == 'M8[ns]', values.dtype + if values.dtype != _NS_DTYPE: + msg = ( + "The dtype of 'values' is incorrect. Must be 'datetime64[ns]'." + " Got {} instead." 
+ ) + raise ValueError(msg.format(values.dtype)) - result = object.__new__(cls) - result._data = values - result._freq = freq - if tz is None: - dtype = _NS_DTYPE - else: - tz = timezones.maybe_get_tz(tz) - tz = timezones.tz_standardize(tz) - dtype = DatetimeTZDtype('ns', tz) - result._dtype = dtype - return result + dtype = pandas_dtype(dtype) + _validate_dt64_dtype(dtype) - def __new__(cls, values, freq=None, tz=None, dtype=None, copy=False, - dayfirst=False, yearfirst=False, ambiguous='raise'): - return cls._from_sequence( - values, freq=freq, tz=tz, dtype=dtype, copy=copy, - dayfirst=dayfirst, yearfirst=yearfirst, ambiguous=ambiguous) + if freq == "infer": + msg = ( + "Frequency inference not allowed in DatetimeArray.__init__. " + "Use 'pd.array()' instead." + ) + raise ValueError(msg) + + if copy: + values = values.copy() + if freq: + freq = to_offset(freq) + if getattr(dtype, 'tz', None): + # https://github.com/pandas-dev/pandas/issues/18595 + # Ensure that we have a standard timezone for pytz objects. + # Without this, things like adding an array of timedeltas and + # a tz-aware Timestamp (with a tz specific to its datetime) will + # be incorrect(ish?) for the array as a whole + dtype = DatetimeTZDtype(tz=timezones.tz_standardize(dtype.tz)) + + self._data = values + self._dtype = dtype + self._freq = freq + + @classmethod + def _simple_new(cls, values, freq=None, tz=None): + """ + we require the we have a dtype compat for the values + if we are passed a non-dtype compat, then coerce using the constructor + """ + dtype = DatetimeTZDtype(tz=tz) if tz else _NS_DTYPE + + return cls(values, freq=freq, dtype=dtype) @classmethod def _from_sequence(cls, data, dtype=None, copy=False, @@ -459,8 +529,7 @@ def __array__(self, dtype=None): elif is_int64_dtype(dtype): return self.asi8 - # TODO: warn that conversion may be lossy? 
- return self._data.view(np.ndarray) # follow Index.__array__ + return self._data def __iter__(self): """ @@ -519,7 +588,7 @@ def astype(self, dtype, copy=True): @Appender(dtl.DatetimeLikeArrayMixin._validate_fill_value.__doc__) def _validate_fill_value(self, fill_value): - if isna(fill_value): + if isna(fill_value) or fill_value == iNaT: fill_value = iNaT elif isinstance(fill_value, (datetime, np.datetime64)): self._assert_tzawareness_compat(fill_value) @@ -1574,6 +1643,9 @@ def sequence_to_dt64ns(data, dtype=None, copy=False, # if dtype has an embedded tz, capture it tz = validate_tz_from_dtype(dtype, tz) + if isinstance(data, ABCIndexClass): + data = data._data + # By this point we are assured to have either a numpy array or Index data, copy = maybe_convert_dtype(data, copy) @@ -1590,12 +1662,15 @@ def sequence_to_dt64ns(data, dtype=None, copy=False, data, dayfirst=dayfirst, yearfirst=yearfirst) tz = maybe_infer_tz(tz, inferred_tz) + # `data` may have originally been a Categorical[datetime64[ns, tz]], + # so we need to handle these types. if is_datetime64tz_dtype(data): + # DatetimeArray -> ndarray tz = maybe_infer_tz(tz, data.tz) result = data._data elif is_datetime64_dtype(data): - # tz-naive DatetimeArray/Index or ndarray[datetime64] + # tz-naive DatetimeArray or ndarray[datetime64] data = getattr(data, "_data", data) if data.dtype != _NS_DTYPE: data = conversion.ensure_datetime64ns(data) @@ -1750,7 +1825,7 @@ def maybe_convert_dtype(data, copy): # GH#18664 preserve tz in going DTI->Categorical->DTI # TODO: cases where we need to do another pass through this func, # e.g. 
the categories are timedelta64s - data = data.categories.take(data.codes, fill_value=NaT) + data = data.categories.take(data.codes, fill_value=NaT)._values copy = False elif is_extension_type(data) and not is_datetime64tz_dtype(data): diff --git a/pandas/core/arrays/period.py b/pandas/core/arrays/period.py index 45a6081093aed..70da02f2ba0a1 100644 --- a/pandas/core/arrays/period.py +++ b/pandas/core/arrays/period.py @@ -179,8 +179,7 @@ def __init__(self, values, freq=None, dtype=None, copy=False): @classmethod def _simple_new(cls, values, freq=None, **kwargs): - # TODO(DatetimeArray): remove once all constructors are aligned. - # alias from PeriodArray.__init__ + # alias for PeriodArray.__init__ return cls(values, freq=freq, **kwargs) @classmethod diff --git a/pandas/core/arrays/timedeltas.py b/pandas/core/arrays/timedeltas.py index 78570be8dc07f..b747e2b6b096b 100644 --- a/pandas/core/arrays/timedeltas.py +++ b/pandas/core/arrays/timedeltas.py @@ -33,6 +33,8 @@ from . import datetimelike as dtl +_BAD_DTYPE = "dtype {dtype} cannot be converted to timedelta64[ns]" + def _to_m8(key): """ @@ -142,25 +144,56 @@ def dtype(self): # Constructors _attributes = ["freq"] - @classmethod - def _simple_new(cls, values, freq=None, dtype=_TD_DTYPE): - # `dtype` is passed by _shallow_copy in corner cases, should always - # be timedelta64[ns] if present - assert dtype == _TD_DTYPE - assert isinstance(values, np.ndarray), type(values) + def __init__(self, values, dtype=_TD_DTYPE, freq=None, copy=False): + if isinstance(values, (ABCSeries, ABCIndexClass)): + values = values._values + + if isinstance(values, type(self)): + values, freq, freq_infer = extract_values_freq(values, freq) + + if not isinstance(values, np.ndarray): + msg = ( + "Unexpected type '{}'. 'values' must be a TimedeltaArray " + "ndarray, or Series or Index containing one of those." 
+ ) + raise ValueError(msg.format(type(values).__name__)) if values.dtype == 'i8': - values = values.view('m8[ns]') + # for compat with datetime/timedelta/period shared methods, + # we can sometimes get here with int64 values. These represent + # nanosecond UTC (or tz-naive) unix timestamps + values = values.view(_TD_DTYPE) - assert values.dtype == 'm8[ns]' + if values.dtype != _TD_DTYPE: + raise TypeError(_BAD_DTYPE.format(dtype=values.dtype)) - result = object.__new__(cls) - result._data = values - result._freq = freq - return result + try: + dtype_mismatch = dtype != _TD_DTYPE + except TypeError: + raise TypeError(_BAD_DTYPE.format(dtype=dtype)) + else: + if dtype_mismatch: + raise TypeError(_BAD_DTYPE.format(dtype=dtype)) - def __new__(cls, values, freq=None, dtype=_TD_DTYPE, copy=False): - return cls._from_sequence(values, dtype=dtype, copy=copy, freq=freq) + if freq == "infer": + msg = ( + "Frequency inference not allowed in TimedeltaArray.__init__. " + "Use 'pd.array()' instead." + ) + raise ValueError(msg) + + if copy: + values = values.copy() + if freq: + freq = to_offset(freq) + + self._data = values + self._dtype = dtype + self._freq = freq + + @classmethod + def _simple_new(cls, values, freq=None, dtype=_TD_DTYPE): + return cls(values, dtype=dtype, freq=freq) @classmethod def _from_sequence(cls, data, dtype=_TD_DTYPE, copy=False, @@ -984,3 +1017,18 @@ def _generate_regular_range(start, end, periods, offset): data = np.arange(b, e, stride, dtype=np.int64) return data + + +def extract_values_freq(arr, freq): + # type: (TimedeltaArray, Offset) -> Tuple[ndarray, Offset, bool] + freq_infer = False + if freq is None: + freq = arr.freq + elif freq and arr.freq: + freq = to_offset(freq) + freq, freq_infer = dtl.validate_inferred_freq( + freq, arr.freq, + freq_infer=False + ) + values = arr._data + return values, freq, freq_infer diff --git a/pandas/core/base.py b/pandas/core/base.py index cc1bda620c215..c37ab48de7cb8 100644 --- a/pandas/core/base.py +++ 
b/pandas/core/base.py @@ -850,11 +850,9 @@ def array(self): """ result = self._values - if (is_datetime64_ns_dtype(result.dtype) or - is_datetime64tz_dtype(result.dtype)): + if is_datetime64_ns_dtype(result.dtype): from pandas.arrays import DatetimeArray result = DatetimeArray(result) - elif is_timedelta64_ns_dtype(result.dtype): from pandas.arrays import TimedeltaArray result = TimedeltaArray(result) @@ -950,14 +948,14 @@ def to_numpy(self, dtype=None, copy=False): array(['1999-12-31T23:00:00.000000000', '2000-01-01T23:00:00...'], dtype='datetime64[ns]') """ - if (is_extension_array_dtype(self.dtype) or - is_datetime64tz_dtype(self.dtype)): - # TODO(DatetimeArray): remove the second clause. - # TODO(GH-24345): Avoid potential double copy - result = np.asarray(self._values, dtype=dtype) - else: - result = self._values + if is_datetime64tz_dtype(self.dtype) and dtype is None: + # note: this is going to change very soon. + # I have a WIP PR making this unnecessary, but it's + # a bit out of scope for the DatetimeArray PR. + dtype = "object" + result = np.asarray(self._values, dtype=dtype) + # TODO(GH-24345): Avoid potential double copy if copy: result = result.copy() return result diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py index eae9eb97f35fe..8f26f7ac209b1 100644 --- a/pandas/core/dtypes/cast.py +++ b/pandas/core/dtypes/cast.py @@ -20,7 +20,9 @@ pandas_dtype) from .dtypes import ( DatetimeTZDtype, ExtensionDtype, PandasExtensionDtype, PeriodDtype) -from .generic import ABCDatetimeIndex, ABCPeriodIndex, ABCSeries +from .generic import ( + ABCDatetimeArray, ABCDatetimeIndex, ABCPeriodArray, ABCPeriodIndex, + ABCSeries) from .inference import is_list_like from .missing import isna, notna @@ -860,7 +862,9 @@ def maybe_infer_to_datetimelike(value, convert_dates=False): """ - if isinstance(value, (ABCDatetimeIndex, ABCPeriodIndex)): + # TODO: why not timedelta? 
+ if isinstance(value, (ABCDatetimeIndex, ABCPeriodIndex, + ABCDatetimeArray, ABCPeriodArray)): return value elif isinstance(value, ABCSeries): if isinstance(value._values, ABCDatetimeIndex): diff --git a/pandas/core/dtypes/common.py b/pandas/core/dtypes/common.py index e1141c6b6b3a8..293ce7d8e4aca 100644 --- a/pandas/core/dtypes/common.py +++ b/pandas/core/dtypes/common.py @@ -10,8 +10,7 @@ from pandas.core.dtypes.dtypes import ( CategoricalDtype, CategoricalDtypeType, DatetimeTZDtype, ExtensionDtype, - IntervalDtype, PandasExtensionDtype, PeriodDtype, _pandas_registry, - registry) + IntervalDtype, PandasExtensionDtype, PeriodDtype, registry) from pandas.core.dtypes.generic import ( ABCCategorical, ABCCategoricalIndex, ABCDateOffset, ABCDatetimeIndex, ABCIndexClass, ABCPeriodArray, ABCPeriodIndex, ABCSeries, ABCSparseArray, @@ -1984,7 +1983,7 @@ def pandas_dtype(dtype): return dtype # registered extension types - result = _pandas_registry.find(dtype) or registry.find(dtype) + result = registry.find(dtype) if result is not None: return result diff --git a/pandas/core/dtypes/concat.py b/pandas/core/dtypes/concat.py index 0501889d743d4..e6967ed2a4d3d 100644 --- a/pandas/core/dtypes/concat.py +++ b/pandas/core/dtypes/concat.py @@ -12,8 +12,8 @@ is_extension_array_dtype, is_interval_dtype, is_object_dtype, is_period_dtype, is_sparse, is_timedelta64_dtype) from pandas.core.dtypes.generic import ( - ABCDatetimeIndex, ABCPeriodIndex, ABCRangeIndex, ABCSparseDataFrame, - ABCTimedeltaIndex) + ABCDatetimeArray, ABCDatetimeIndex, ABCIndexClass, ABCPeriodIndex, + ABCRangeIndex, ABCSparseDataFrame, ABCTimedeltaIndex) from pandas import compat @@ -471,7 +471,15 @@ def _concat_datetimetz(to_concat, name=None): all inputs must be DatetimeIndex it is used in DatetimeIndex.append also """ - return to_concat[0]._concat_same_dtype(to_concat, name=name) + # Right now, internals will pass a List[DatetimeArray] here + # for reductions like quantile. 
I would like to disentangle + # all this before we get here. + sample = to_concat[0] + + if isinstance(sample, ABCIndexClass): + return sample._concat_same_dtype(to_concat, name=name) + elif isinstance(sample, ABCDatetimeArray): + return sample._concat_same_type(to_concat) def _concat_index_same_dtype(indexes, klass=None): diff --git a/pandas/core/dtypes/dtypes.py b/pandas/core/dtypes/dtypes.py index e0d0cf3393dd5..9e2564c4f825b 100644 --- a/pandas/core/dtypes/dtypes.py +++ b/pandas/core/dtypes/dtypes.py @@ -479,7 +479,8 @@ def _is_boolean(self): return is_bool_dtype(self.categories) -class DatetimeTZDtype(PandasExtensionDtype): +@register_extension_dtype +class DatetimeTZDtype(PandasExtensionDtype, ExtensionDtype): """ A np.dtype duck-typed class, suitable for holding a custom datetime with tz @@ -493,6 +494,7 @@ class DatetimeTZDtype(PandasExtensionDtype): str = '|M8[ns]' num = 101 base = np.dtype('M8[ns]') + na_value = NaT _metadata = ('unit', 'tz') _match = re.compile(r"(datetime64|M8)\[(?P.+), (?P.+)\]") _cache = {} @@ -570,8 +572,8 @@ def construct_array_type(cls): ------- type """ - from pandas import DatetimeIndex - return DatetimeIndex + from pandas.core.arrays import DatetimeArrayMixin + return DatetimeArrayMixin @classmethod def construct_from_string(cls, string): @@ -885,10 +887,3 @@ def is_dtype(cls, dtype): else: return False return super(IntervalDtype, cls).is_dtype(dtype) - - -# TODO(Extension): remove the second registry once all internal extension -# dtypes are real extension dtypes. 
-_pandas_registry = Registry() - -_pandas_registry.register(DatetimeTZDtype) diff --git a/pandas/core/dtypes/generic.py b/pandas/core/dtypes/generic.py index bbc447d6fa0da..134ec95729833 100644 --- a/pandas/core/dtypes/generic.py +++ b/pandas/core/dtypes/generic.py @@ -67,7 +67,8 @@ def _check(cls, inst): ("extension", "categorical", "periodarray", - "npy_extension", + "datetimearray", + "timedeltaarray", )) ABCPandasArray = create_pandas_abc_type("ABCPandasArray", "_typ", diff --git a/pandas/core/frame.py b/pandas/core/frame.py index ab5e2a14c7783..d6aa3117570af 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -78,6 +78,9 @@ from pandas.core import ops from pandas.core.accessor import CachedAccessor from pandas.core.arrays import Categorical, ExtensionArray +from pandas.core.arrays.datetimelike import ( + DatetimeLikeArrayMixin as DatetimeLikeArray +) from pandas.core.config import get_option from pandas.core.generic import NDFrame, _shared_docs from pandas.core.index import (Index, MultiIndex, ensure_index, @@ -4356,9 +4359,25 @@ def _maybe_casted_values(index, labels=None): values.fill(np.nan) else: values = values.take(labels) + + # TODO(https://github.com/pandas-dev/pandas/issues/24206) + # Push this into maybe_upcast_putmask? + # We can't pass EAs there right now. Looks a bit + # complicated. + # So we unbox the ndarray_values, op, re-box. 
+ values_type = type(values) + values_dtype = values.dtype + + if issubclass(values_type, DatetimeLikeArray): + values = values._data + if mask.any(): values, changed = maybe_upcast_putmask( values, mask, np.nan) + + if issubclass(values_type, DatetimeLikeArray): + values = values_type(values, dtype=values_dtype) + return values new_index = ibase.default_index(len(new_obj)) @@ -5314,7 +5333,6 @@ def extract_values(arr): arr = arr._values if needs_i8_conversion(arr): - # TODO(DatetimelikeArray): just use .asi8 if is_extension_array_dtype(arr.dtype): arr = arr.asi8 else: diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 1380c5caed1c9..a26daba49f5d1 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -25,7 +25,7 @@ is_signed_integer_dtype, is_timedelta64_dtype, is_unsigned_integer_dtype) import pandas.core.dtypes.concat as _concat from pandas.core.dtypes.generic import ( - ABCDataFrame, ABCDateOffset, ABCDatetimeIndex, ABCIndexClass, + ABCDataFrame, ABCDateOffset, ABCDatetimeArray, ABCIndexClass, ABCMultiIndex, ABCPandasArray, ABCPeriodIndex, ABCSeries, ABCTimedeltaArray, ABCTimedeltaIndex) from pandas.core.dtypes.missing import array_equivalent, isna @@ -568,9 +568,9 @@ def _shallow_copy(self, values=None, **kwargs): if not len(values) and 'dtype' not in kwargs: attributes['dtype'] = self.dtype - # _simple_new expects an ndarray - values = getattr(values, 'values', values) - if isinstance(values, ABCDatetimeIndex): + # _simple_new expects an the type of self._data + values = getattr(values, '_values', values) + if isinstance(values, ABCDatetimeArray): # `self.values` returns `self` for tz-aware, so we need to unwrap # more specifically values = values.asi8 diff --git a/pandas/core/indexes/datetimelike.py b/pandas/core/indexes/datetimelike.py index 50b2413167b32..daca4b5116027 100644 --- a/pandas/core/indexes/datetimelike.py +++ b/pandas/core/indexes/datetimelike.py @@ -71,6 +71,10 @@ class 
DatetimeIndexOpsMixin(ExtensionOpsMixin): _maybe_mask_results = ea_passthrough("_maybe_mask_results") __iter__ = ea_passthrough("__iter__") + @property + def _eadata(self): + return self._data + @property def freq(self): """ @@ -166,12 +170,15 @@ def _join_i8_wrapper(joinf, dtype, with_indexers=True): """ Create the join wrapper methods. """ + from pandas.core.arrays.datetimelike import DatetimeLikeArrayMixin @staticmethod def wrapper(left, right): - if isinstance(left, (np.ndarray, ABCIndex, ABCSeries)): + if isinstance(left, (np.ndarray, ABCIndex, ABCSeries, + DatetimeLikeArrayMixin)): left = left.view('i8') - if isinstance(right, (np.ndarray, ABCIndex, ABCSeries)): + if isinstance(right, (np.ndarray, ABCIndex, ABCSeries, + DatetimeLikeArrayMixin)): right = right.view('i8') results = joinf(left, right) if with_indexers: @@ -195,25 +202,27 @@ def _evaluate_compare(self, other, op): def _ensure_localized(self, arg, ambiguous='raise', nonexistent='raise', from_utc=False): # See DatetimeLikeArrayMixin._ensure_localized.__doc__ - if getattr(self, 'tz', None): # ensure_localized is only relevant for tz-aware DTI - from pandas.core.arrays import DatetimeArrayMixin as DatetimeArray - dtarr = DatetimeArray(self) - result = dtarr._ensure_localized(arg, - ambiguous=ambiguous, - nonexistent=nonexistent, - from_utc=from_utc) - return type(self)(result, name=self.name) + result = self._data._ensure_localized(arg, + ambiguous=ambiguous, + nonexistent=nonexistent, + from_utc=from_utc) + return type(self)._simple_new(result, name=self.name) return arg def _box_values_as_index(self): """ Return object Index which contains boxed values. 
""" + # XXX: this is broken (not called) for PeriodIndex, which doesn't + # define _box_values AFAICT from pandas.core.index import Index return Index(self._box_values(self.asi8), name=self.name, dtype=object) + def _box_values(self, values): + return self._data._box_values(values) + @Appender(_index_shared_docs['contains'] % _index_doc_kwargs) def __contains__(self, key): try: @@ -600,14 +609,8 @@ def _concat_same_dtype(self, to_concat, name): if not is_period_dtype(self): # reset freq attribs['freq'] = None - # TODO(DatetimeArray) - # - remove the .asi8 here - # - remove the _maybe_box_as_values - # - combine with the `else` block - new_data = self._eadata._concat_same_type(to_concat).asi8 - else: - new_data = type(self._values)._concat_same_type(to_concat) + new_data = type(self._values)._concat_same_type(to_concat).asi8 return self._simple_new(new_data, **attribs) def _maybe_box_as_values(self, values, **attribs): @@ -631,11 +634,6 @@ def astype(self, dtype, copy=True): return Index(new_values, dtype=new_values.dtype, name=self.name, copy=False) - @Appender(DatetimeLikeArrayMixin._time_shift.__doc__) - def _time_shift(self, periods, freq=None): - result = self._eadata._time_shift(periods, freq=freq) - return type(self)(result, name=self.name) - @deprecate_kwarg(old_arg_name='n', new_arg_name='periods') def shift(self, periods, freq=None): """ diff --git a/pandas/core/indexes/datetimes.py b/pandas/core/indexes/datetimes.py index 690a3db28fe83..a6a910f66359c 100644 --- a/pandas/core/indexes/datetimes.py +++ b/pandas/core/indexes/datetimes.py @@ -17,11 +17,12 @@ _NS_DTYPE, ensure_int64, is_float, is_integer, is_list_like, is_scalar, is_string_like) import pandas.core.dtypes.concat as _concat +from pandas.core.dtypes.dtypes import DatetimeTZDtype from pandas.core.dtypes.missing import isna from pandas.core.accessor import delegate_names from pandas.core.arrays.datetimes import ( - DatetimeArrayMixin as DatetimeArray, _to_m8) + DatetimeArrayMixin as DatetimeArray, 
_to_m8, validate_tz_from_dtype) from pandas.core.base import _shared_docs import pandas.core.common as com from pandas.core.indexes.base import Index @@ -40,10 +41,6 @@ def _new_DatetimeIndex(cls, d): """ This is called upon unpickling, rather than the default which doesn't have arguments and breaks __new__ """ - # data are already in UTC - # so need to localize - tz = d.pop('tz', None) - if "data" in d and not isinstance(d["data"], DatetimeIndex): # Avoid need to verify integrity by calling simple_new directly data = d.pop("data") @@ -56,8 +53,6 @@ def _new_DatetimeIndex(cls, d): warnings.simplefilter("ignore") result = cls.__new__(cls, verify_integrity=False, **d) - if tz is not None: - result = result.tz_localize('UTC').tz_convert(tz) return result @@ -306,7 +301,7 @@ def __new__(cls, data=None, data, dtype=dtype, copy=copy, tz=tz, freq=freq, dayfirst=dayfirst, yearfirst=yearfirst, ambiguous=ambiguous) - subarr = cls._simple_new(dtarr._data, name=name, + subarr = cls._simple_new(dtarr, name=name, freq=dtarr.freq, tz=dtarr.tz) return subarr @@ -317,20 +312,28 @@ def _simple_new(cls, values, name=None, freq=None, tz=None, dtype=None): if we are passed a non-dtype compat, then coerce using the constructor """ if isinstance(values, DatetimeArray): - values = DatetimeArray(values, freq=freq, tz=tz, dtype=dtype) + if tz: + tz = validate_tz_from_dtype(dtype, tz) + dtype = DatetimeTZDtype(tz=tz) + elif dtype is None: + dtype = _NS_DTYPE + + values = DatetimeArray(values, freq=freq, dtype=dtype) tz = values.tz freq = values.freq values = values._data # DatetimeArray._simple_new will accept either i8 or M8[ns] dtypes - assert isinstance(values, np.ndarray), type(values) - + if isinstance(values, DatetimeIndex): + values = values._data dtarr = DatetimeArray._simple_new(values, freq=freq, tz=tz) + assert isinstance(dtarr, DatetimeArray) + result = object.__new__(cls) - result._eadata = dtarr + result._data = dtarr result.name = name # For groupby perf. 
See note in indexes/base about _index_data - result._index_data = result._data + result._index_data = dtarr._data result._reset_identity() return result @@ -341,13 +344,17 @@ def dtype(self): return self._eadata.dtype @property - def _values(self): - # tz-naive -> ndarray - # tz-aware -> DatetimeIndex - if self.tz is not None: - return self._eadata - else: - return self.values + def tz(self): + # GH 18595 + return self._eadata.tz + + @tz.setter + def tz(self, value): + # GH 3746: Prevent localizing or converting the index by setting tz + raise AttributeError("Cannot directly set timezone. Use tz_localize() " + "or tz_convert() as appropriate") + + tzinfo = tz @property def size(self): @@ -418,7 +425,7 @@ def __setstate__(self, state): np.ndarray.__setstate__(data, state) dtarr = DatetimeArray(data) - self._eadata = dtarr + self._data = dtarr self._reset_identity() else: @@ -596,8 +603,6 @@ def _fast_union(self, other): def _wrap_setop_result(self, other, result): name = get_op_result_name(self, other) - if not timezones.tz_compare(self.tz, other.tz): - raise ValueError('Passed item and index have different timezone') return self._shallow_copy(result, name=name, freq=None, tz=self.tz) def intersection(self, other): @@ -1128,22 +1133,6 @@ def slice_indexer(self, start=None, end=None, step=None, kind=None): # -------------------------------------------------------------------- # Wrapping DatetimeArray - @property - def _data(self): - return self._eadata._data - - @property - def tz(self): - # GH#18595 - return self._eadata.tz - - @tz.setter - def tz(self, value): - # GH#3746; DatetimeArray will raise to disallow setting - self._eadata.tz = value - - tzinfo = tz - # Compat for frequency inference, see GH#23789 _is_monotonic_increasing = Index.is_monotonic_increasing _is_monotonic_decreasing = Index.is_monotonic_decreasing @@ -1536,7 +1525,7 @@ def date_range(start=None, end=None, periods=None, freq=None, tz=None, freq=freq, tz=tz, normalize=normalize, closed=closed, 
**kwargs) return DatetimeIndex._simple_new( - dtarr._data, tz=dtarr.tz, freq=dtarr.freq, name=name) + dtarr, tz=dtarr.tz, freq=dtarr.freq, name=name) def bdate_range(start=None, end=None, periods=None, freq='B', tz=None, diff --git a/pandas/core/indexes/period.py b/pandas/core/indexes/period.py index 4bd8f7407500b..5bc76ed210edb 100644 --- a/pandas/core/indexes/period.py +++ b/pandas/core/indexes/period.py @@ -282,10 +282,6 @@ def _simple_new(cls, values, name=None, freq=None, **kwargs): # ------------------------------------------------------------------------ # Data - @property - def _eadata(self): - return self._data - @property def values(self): return np.asarray(self) @@ -877,12 +873,6 @@ def __setstate__(self, state): _unpickle_compat = __setstate__ - def view(self, dtype=None, type=None): - # TODO(DatetimeArray): remove - if dtype is None or dtype is __builtins__['type'](self): - return self - return self._ndarray_values.view(dtype=dtype) - @property def flags(self): """ return the ndarray.flags for the underlying data """ diff --git a/pandas/core/indexes/timedeltas.py b/pandas/core/indexes/timedeltas.py index 0798dd6eee0c9..3a3b9ed97c8fe 100644 --- a/pandas/core/indexes/timedeltas.py +++ b/pandas/core/indexes/timedeltas.py @@ -209,7 +209,13 @@ def __new__(cls, data=None, unit=None, freq=None, start=None, end=None, 'collection of some kind, {data} was passed' .format(cls=cls.__name__, data=repr(data))) - if isinstance(data, TimedeltaIndex) and freq is None and name is None: + if isinstance(data, TimedeltaArray): + if copy: + data = data.copy() + return cls._simple_new(data, name=name, freq=freq) + + if (isinstance(data, TimedeltaIndex) and + freq is None and name is None): if copy: return data.copy() else: @@ -225,17 +231,17 @@ def __new__(cls, data=None, unit=None, freq=None, start=None, end=None, def _simple_new(cls, values, name=None, freq=None, dtype=_TD_DTYPE): # `dtype` is passed by _shallow_copy in corner cases, should always # be timedelta64[ns] if 
present - assert dtype == _TD_DTYPE - - assert isinstance(values, np.ndarray), type(values) - if values.dtype == 'i8': - values = values.view('m8[ns]') + if not isinstance(values, TimedeltaArray): + values = TimedeltaArray._simple_new(values, dtype=dtype, + freq=freq) + assert isinstance(values, TimedeltaArray), type(values) + assert dtype == _TD_DTYPE, dtype assert values.dtype == 'm8[ns]', values.dtype freq = to_offset(freq) tdarr = TimedeltaArray._simple_new(values, freq=freq) result = object.__new__(cls) - result._eadata = tdarr + result._data = tdarr result.name = name # For groupby perf. See note in indexes/base about _index_data result._index_data = tdarr._data @@ -278,10 +284,6 @@ def _format_native_types(self, na_rep='NaT', date_format=None, **kwargs): # ------------------------------------------------------------------- # Wrapping TimedeltaArray - @property - def _data(self): - return self._eadata._data - __mul__ = _make_wrapped_arith_op("__mul__") __rmul__ = _make_wrapped_arith_op("__rmul__") __floordiv__ = _make_wrapped_arith_op("__floordiv__") diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index 346f56968c963..7845a62bb7edb 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -35,11 +35,11 @@ import pandas.core.algorithms as algos from pandas.core.arrays import ( - Categorical, DatetimeArrayMixin as DatetimeArray, ExtensionArray) + Categorical, DatetimeArrayMixin as DatetimeArray, ExtensionArray, + TimedeltaArrayMixin as TimedeltaArray) from pandas.core.base import PandasObject import pandas.core.common as com from pandas.core.indexes.datetimes import DatetimeIndex -from pandas.core.indexes.timedeltas import TimedeltaIndex from pandas.core.indexing import check_setitem_lengths from pandas.core.internals.arrays import extract_array import pandas.core.missing as missing @@ -2169,7 +2169,7 @@ class DatetimeLikeBlockMixin(object): @property def _holder(self): - return DatetimeIndex + return 
DatetimeArray @property def _na_value(self): @@ -2179,15 +2179,32 @@ def _na_value(self): def fill_value(self): return tslibs.iNaT + def to_dense(self): + # TODO(DatetimeBlock): remove + return np.asarray(self.values) + def get_values(self, dtype=None): """ return object dtype as boxed values, such as Timestamps/Timedelta """ if is_object_dtype(dtype): - return lib.map_infer(self.values.ravel(), - self._box_func).reshape(self.values.shape) + values = self.values + + if self.ndim > 1: + values = values.ravel() + + values = lib.map_infer(values, self._box_func) + + if self.ndim > 1: + values = values.reshape(self.values.shape) + + return values return self.values + @property + def asi8(self): + return self.values.view('i8') + class TimeDeltaBlock(DatetimeLikeBlockMixin, IntBlock): __slots__ = () @@ -2198,13 +2215,15 @@ class TimeDeltaBlock(DatetimeLikeBlockMixin, IntBlock): def __init__(self, values, placement, ndim=None): if values.dtype != _TD_DTYPE: values = conversion.ensure_timedelta64ns(values) - + if isinstance(values, TimedeltaArray): + values = values._data + assert isinstance(values, np.ndarray), type(values) super(TimeDeltaBlock, self).__init__(values, placement=placement, ndim=ndim) @property def _holder(self): - return TimedeltaIndex + return TimedeltaArray @property def _box_func(self): @@ -2299,6 +2318,9 @@ def to_native_types(self, slicer=None, na_rep=None, quoting=None, dtype=object) return rvalues + def external_values(self, dtype=None): + return np.asarray(self.values.astype("timedelta64[ns]", copy=False)) + class BoolBlock(NumericBlock): __slots__ = () @@ -2771,6 +2793,11 @@ def _maybe_coerce_values(self, values): """ if values.dtype != _NS_DTYPE: values = conversion.ensure_datetime64ns(values) + + if isinstance(values, DatetimeArray): + values = values._data + + assert isinstance(values, np.ndarray), type(values) return values def _astype(self, dtype, **kwargs): @@ -2887,12 +2914,15 @@ def set(self, locs, values, check=False): self.values[locs] = 
values + def external_values(self): + return np.asarray(self.values.astype('datetime64[ns]', copy=False)) + -class DatetimeTZBlock(NonConsolidatableMixIn, DatetimeBlock): +class DatetimeTZBlock(ExtensionBlock, DatetimeBlock): """ implement a datetime64 block with a tz attribute """ __slots__ = () - _concatenator = staticmethod(_concat._concat_datetime) is_datetimetz = True + is_extension = True def __init__(self, values, placement, ndim=2, dtype=None): # XXX: This will end up calling _maybe_coerce_values twice @@ -2907,6 +2937,10 @@ def __init__(self, values, placement, ndim=2, dtype=None): super(DatetimeTZBlock, self).__init__(values, placement=placement, ndim=ndim) + @property + def _holder(self): + return DatetimeArray + def _maybe_coerce_values(self, values, dtype=None): """Input validation for values passed to __init__. Ensure that we have datetime64TZ, coercing if necessary. @@ -2928,7 +2962,7 @@ def _maybe_coerce_values(self, values, dtype=None): if dtype is not None: if isinstance(dtype, compat.string_types): dtype = DatetimeTZDtype.construct_from_string(dtype) - values = values._shallow_copy(tz=dtype.tz) + values = type(values)(values, dtype=dtype) if values.tz is None: raise ValueError("cannot create a DatetimeTZBlock without a tz") @@ -2939,7 +2973,7 @@ def _maybe_coerce_values(self, values, dtype=None): def is_view(self): """ return a boolean if I am possibly a view """ # check the ndarray values of the DatetimeIndex values - return self.values.values.base is not None + return self.values._data.base is not None def copy(self, deep=True): """ copy constructor """ @@ -2948,18 +2982,39 @@ def copy(self, deep=True): values = values.copy(deep=True) return self.make_block_same_class(values) - def external_values(self): - """ we internally represent the data as a DatetimeIndex, but for - external compat with ndarray, export as a ndarray of Timestamps + def get_values(self, dtype=None): """ - return self.values.astype('datetime64[ns]').values + Returns an 
ndarray of values. - def get_values(self, dtype=None): - # return object dtype as Timestamps with the zones + Parameters + ---------- + dtype : np.dtype + Only `object`-like dtypes are respected here (not sure + why). + + Returns + ------- + values : ndarray + When ``dtype=object``, then an object-dtype ndarray of + boxed values is returned. Otherwise, an M8[ns] ndarray + is returned. + + DatetimeArray is always 1-d. ``get_values`` will reshape + the return value to be the same dimensionality as the + block. + """ + values = self.values if is_object_dtype(dtype): - return lib.map_infer( - self.values.ravel(), self._box_func).reshape(self.values.shape) - return self.values + values = values._box_values(values._data) + + values = np.asarray(values) + + if self.ndim == 2: + # Ensure that our shape is correct for DataFrame. + # ExtensionArrays are always 1-D, even in a DataFrame when + # the analogous NumPy-backed column would be a 2-D ndarray. + values = values.reshape(1, -1) + return values def _slice(self, slicer): """ return a slice of my values """ @@ -2984,13 +3039,17 @@ def _try_coerce_args(self, values, other): base-type values, base-type other """ # asi8 is a view, needs copy - values = _block_shape(values.asi8, ndim=self.ndim) + values = _block_shape(values.view("i8"), ndim=self.ndim) if isinstance(other, ABCSeries): other = self._holder(other) if isinstance(other, bool): raise TypeError + elif is_datetime64_dtype(other): + # add the tz back + other = self._holder(other, dtype=self.dtype) + elif (is_null_datelike_scalar(other) or (lib.is_scalar(other) and isna(other))): other = tslibs.iNaT @@ -3021,11 +3080,12 @@ def _try_coerce_result(self, result): result = tslibs.Timestamp(result, tz=self.values.tz) if isinstance(result, np.ndarray): # allow passing of > 1dim if its trivial + if result.ndim > 1: result = result.reshape(np.prod(result.shape)) - # GH#24096 new values invalidates a frequency - result = self.values._shallow_copy(result, freq=None) + result = 
self._holder._simple_new(result, freq=None, + tz=self.values.tz) return result @@ -3033,32 +3093,6 @@ def _try_coerce_result(self, result): def _box_func(self): return lambda x: tslibs.Timestamp(x, tz=self.dtype.tz) - def shift(self, periods, axis=0, fill_value=None): - """ shift the block by periods """ - - # think about moving this to the DatetimeIndex. This is a non-freq - # (number of periods) shift ### - - N = len(self) - indexer = np.zeros(N, dtype=int) - if periods > 0: - indexer[periods:] = np.arange(N - periods) - else: - indexer[:periods] = np.arange(-periods, N) - - new_values = self.values.asi8.take(indexer) - - if isna(fill_value): - fill_value = tslibs.iNaT - if periods > 0: - new_values[:periods] = fill_value - else: - new_values[periods:] = fill_value - - new_values = self.values._shallow_copy(new_values) - return [self.make_block_same_class(new_values, - placement=self.mgr_locs)] - def diff(self, n, axis=0): """1st discrete difference @@ -3088,14 +3122,44 @@ def diff(self, n, axis=0): return [TimeDeltaBlock(new_values, placement=self.mgr_locs.indexer)] def concat_same_type(self, to_concat, placement=None): - """ - Concatenate list of single blocks of the same type. - """ - values = self._concatenator([blk.values for blk in to_concat], - axis=self.ndim - 1) - # not using self.make_block_same_class as values can be non-tz dtype - return make_block( - values, placement=placement or slice(0, len(values), 1)) + # need to handle concat([tz1, tz2]) here, since DatetimeArray + # only handles cases where all the tzs are the same. + # Instead of placing the condition here, it could also go into the + # is_uniform_join_units check, but I'm not sure what is better. 
+ if len({x.dtype for x in to_concat}) > 1: + values = _concat._concat_datetime([x.values for x in to_concat]) + placement = placement or slice(0, len(values), 1) + + if self.ndim > 1: + values = np.atleast_2d(values) + return ObjectBlock(values, ndim=self.ndim, placement=placement) + return super(DatetimeTZBlock, self).concat_same_type(to_concat, + placement) + + def fillna(self, value, limit=None, inplace=False, downcast=None): + # We support filling a DatetimeTZ with a `value` whose timezone + # is different by coercing to object. + try: + return super(DatetimeTZBlock, self).fillna( + value, limit, inplace, downcast + ) + except (ValueError, TypeError): + # different timezones, or a non-tz + return self.astype(object).fillna( + value, limit=limit, inplace=inplace, downcast=downcast + ) + + def setitem(self, indexer, value): + # https://github.com/pandas-dev/pandas/issues/24020 + # Need a dedicated setitem until #24020 (type promotion in setitem + # for extension arrays) is designed and implemented. 
+ try: + return super(DatetimeTZBlock, self).setitem(indexer, value) + except (ValueError, TypeError): + newb = make_block(self.values.astype(object), + placement=self.mgr_locs, + klass=ObjectBlock,) + return newb.setitem(indexer, value) # ----------------------------------------------------------------- diff --git a/pandas/core/internals/concat.py b/pandas/core/internals/concat.py index 2441c64518d59..067b95f9d8847 100644 --- a/pandas/core/internals/concat.py +++ b/pandas/core/internals/concat.py @@ -183,8 +183,8 @@ def get_reindexed_values(self, empty_dtype, upcasted_na): is_datetime64tz_dtype(empty_dtype)): if self.block is None: array = empty_dtype.construct_array_type() - missing_arr = array([fill_value], dtype=empty_dtype) - return missing_arr.repeat(self.shape[1]) + return array(np.full(self.shape[1], fill_value), + dtype=empty_dtype) pass elif getattr(self.block, 'is_categorical', False): pass diff --git a/pandas/core/internals/construction.py b/pandas/core/internals/construction.py index b18b966406bbb..b3c893c7d84be 100644 --- a/pandas/core/internals/construction.py +++ b/pandas/core/internals/construction.py @@ -589,7 +589,7 @@ def sanitize_array(data, index, dtype=None, copy=False, # everything else in this block must also handle ndarray's, # becuase we've unwrapped PandasArray into an ndarray. - if dtype is not None and not data.dtype.is_dtype(dtype): + if dtype is not None: subarr = data.astype(dtype) if copy: diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py index f1372a1fe2f51..d50f9c3e65ebd 100644 --- a/pandas/core/internals/managers.py +++ b/pandas/core/internals/managers.py @@ -750,8 +750,13 @@ def as_array(self, transpose=False, items=None): else: mgr = self - if self._is_single_block or not self.is_mixed_type: - arr = mgr.blocks[0].get_values() + if self._is_single_block and mgr.blocks[0].is_datetimetz: + # TODO(Block.get_values): Make DatetimeTZBlock.get_values + # always be object dtype. 
Some callers seem to want the + # DatetimeArray (previously DTI) + arr = mgr.blocks[0].get_values(dtype=object) + elif self._is_single_block or not self.is_mixed_type: + arr = np.asarray(mgr.blocks[0].get_values()) else: arr = mgr._interleave() diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py index a894b8788f8d8..cec594a13b3d3 100644 --- a/pandas/io/pytables.py +++ b/pandas/io/pytables.py @@ -2713,10 +2713,11 @@ def write_array(self, key, value, items=None): raise NotImplementedError('Cannot store a category dtype in ' 'a HDF5 dataset that uses format=' '"fixed". Use format="table".') - if not empty_array: - value = value.T - transposed = True + if hasattr(value, 'T'): + # ExtensionArrays (1d) may not have transpose. + value = value.T + transposed = True if self._filters is not None: atom = None diff --git a/pandas/tests/arithmetic/test_timedelta64.py b/pandas/tests/arithmetic/test_timedelta64.py index dea4940eb3180..12ed174d6cc53 100644 --- a/pandas/tests/arithmetic/test_timedelta64.py +++ b/pandas/tests/arithmetic/test_timedelta64.py @@ -1476,7 +1476,7 @@ def test_tdi_rmul_arraylike(self, other, box_with_array): tdi = TimedeltaIndex(['1 Day'] * 10) expected = timedelta_range('1 days', '10 days') - expected._eadata._freq = None + expected._eadata.freq = None tdi = tm.box_expected(tdi, box) expected = tm.box_expected(expected, xbox) diff --git a/pandas/tests/arrays/test_datetimes.py b/pandas/tests/arrays/test_datetimes.py index 48b64c2968219..1375969c961fd 100644 --- a/pandas/tests/arrays/test_datetimes.py +++ b/pandas/tests/arrays/test_datetimes.py @@ -28,13 +28,26 @@ def test_mismatched_timezone_raises(self): arr = DatetimeArray(np.array(['2000-01-01T06:00:00'], dtype='M8[ns]'), dtype=DatetimeTZDtype(tz='US/Central')) dtype = DatetimeTZDtype(tz='US/Eastern') - with pytest.raises(TypeError, match='data is already tz-aware'): + with pytest.raises(TypeError, match='Timezone of the array'): DatetimeArray(arr, dtype=dtype) + def test_non_array_raises(self): 
+ with pytest.raises(ValueError, match='list'): + DatetimeArray([1, 2, 3]) + + def test_other_type_raises(self): + with pytest.raises(ValueError, + match="The dtype of 'values' is incorrect.*bool"): + DatetimeArray(np.array([1, 2, 3], dtype='bool')) + def test_incorrect_dtype_raises(self): with pytest.raises(ValueError, match="Unexpected value for 'dtype'."): DatetimeArray(np.array([1, 2, 3], dtype='i8'), dtype='category') + def test_freq_infer_raises(self): + with pytest.raises(ValueError, match='Frequency inference'): + DatetimeArray(np.array([1, 2, 3], dtype='i8'), freq="infer") + def test_copy(self): data = np.array([1, 2, 3], dtype='M8[ns]') arr = DatetimeArray(data, copy=False) @@ -128,7 +141,7 @@ def test_repeat_preserves_tz(self): repeated = arr.repeat([1, 1]) # preserves tz and values, but not freq - expected = DatetimeArray(arr.asi8, freq=None, tz=arr.tz) + expected = DatetimeArray(arr.asi8, freq=None, dtype=arr.dtype) tm.assert_equal(repeated, expected) def test_value_counts_preserves_tz(self): @@ -153,8 +166,10 @@ def test_fillna_preserves_tz(self, method): arr[2] = pd.NaT fill_val = dti[1] if method == 'pad' else dti[3] - expected = DatetimeArray([dti[0], dti[1], fill_val, dti[3], dti[4]], - freq=None, tz='US/Central') + expected = DatetimeArray._from_sequence( + [dti[0], dti[1], fill_val, dti[3], dti[4]], + freq=None, tz='US/Central' + ) result = arr.fillna(method=method) tm.assert_extension_array_equal(result, expected) diff --git a/pandas/tests/arrays/test_timedeltas.py b/pandas/tests/arrays/test_timedeltas.py index 1221d920f2e91..08ef27297cca5 100644 --- a/pandas/tests/arrays/test_timedeltas.py +++ b/pandas/tests/arrays/test_timedeltas.py @@ -9,6 +9,10 @@ class TestTimedeltaArrayConstructor(object): + def test_non_array_raises(self): + with pytest.raises(ValueError, match='list'): + TimedeltaArray([1, 2, 3]) + def test_other_type_raises(self): with pytest.raises(TypeError, match="dtype bool cannot be converted"): @@ -17,13 +21,15 @@ def 
test_other_type_raises(self): def test_incorrect_dtype_raises(self): # TODO: why TypeError for 'category' but ValueError for i8? with pytest.raises(TypeError, - match='data type "category" not understood'): + match=r'category cannot be converted ' + r'to timedelta64\[ns\]'): TimedeltaArray(np.array([1, 2, 3], dtype='i8'), dtype='category') - with pytest.raises(ValueError, - match=r"Only timedelta64\[ns\] dtype is valid"): + with pytest.raises(TypeError, + match=r"dtype int64 cannot be converted " + r"to timedelta64\[ns\]"): TimedeltaArray(np.array([1, 2, 3], dtype='i8'), - dtype=np.dtype(int)) + dtype=np.dtype("int64")) def test_copy(self): data = np.array([1, 2, 3], dtype='m8[ns]') @@ -40,8 +46,6 @@ def test_from_sequence_dtype(self): msg = r"Only timedelta64\[ns\] dtype is valid" with pytest.raises(ValueError, match=msg): TimedeltaArray._from_sequence([], dtype=object) - with pytest.raises(ValueError, match=msg): - TimedeltaArray([], dtype=object) def test_abs(self): vals = np.array([-3600 * 10**9, 'NaT', 7200 * 10**9], dtype='m8[ns]') diff --git a/pandas/tests/dtypes/test_dtypes.py b/pandas/tests/dtypes/test_dtypes.py index 77dc04e9453a9..aa29473ddf130 100644 --- a/pandas/tests/dtypes/test_dtypes.py +++ b/pandas/tests/dtypes/test_dtypes.py @@ -9,7 +9,7 @@ from pandas.core.dtypes.dtypes import ( DatetimeTZDtype, PeriodDtype, - IntervalDtype, CategoricalDtype, registry, _pandas_registry) + IntervalDtype, CategoricalDtype, registry) from pandas.core.dtypes.common import ( is_categorical_dtype, is_categorical, is_datetime64tz_dtype, is_datetimetz, @@ -798,20 +798,13 @@ def test_update_dtype_errors(self, bad_dtype): @pytest.mark.parametrize('dtype', [ CategoricalDtype, IntervalDtype, + DatetimeTZDtype, PeriodDtype, ]) def test_registry(dtype): assert dtype in registry.dtypes -@pytest.mark.parametrize('dtype', [ - DatetimeTZDtype, -]) -def test_pandas_registry(dtype): - assert dtype not in registry.dtypes - assert dtype in _pandas_registry.dtypes - - 
@pytest.mark.parametrize('dtype, expected', [ ('int64', None), ('interval', IntervalDtype()), @@ -819,18 +812,12 @@ def test_pandas_registry(dtype): ('interval[datetime64[ns]]', IntervalDtype('datetime64[ns]')), ('period[D]', PeriodDtype('D')), ('category', CategoricalDtype()), + ('datetime64[ns, US/Eastern]', DatetimeTZDtype('ns', 'US/Eastern')), ]) def test_registry_find(dtype, expected): assert registry.find(dtype) == expected -@pytest.mark.parametrize('dtype, expected', [ - ('datetime64[ns, US/Eastern]', DatetimeTZDtype('ns', 'US/Eastern')), -]) -def test_pandas_registry_find(dtype, expected): - assert _pandas_registry.find(dtype) == expected - - @pytest.mark.parametrize('dtype, expected', [ (str, False), (int, False), diff --git a/pandas/tests/extension/arrow/bool.py b/pandas/tests/extension/arrow/bool.py index f8e357e162232..025c4cacd8fa1 100644 --- a/pandas/tests/extension/arrow/bool.py +++ b/pandas/tests/extension/arrow/bool.py @@ -76,6 +76,14 @@ def __getitem__(self, item): def __len__(self): return len(self._data) + def astype(self, dtype, copy=True): + # needed to fix this astype for the Series constructor. 
+ if isinstance(dtype, type(self.dtype)) and dtype == self.dtype: + if copy: + return self.copy() + return self + return super(ArrowBoolArray, self).astype(dtype, copy) + @property def dtype(self): return self._dtype @@ -102,9 +110,9 @@ def take(self, indices, allow_fill=False, fill_value=None): def copy(self, deep=False): if deep: - return copy.deepcopy(self._data) + return type(self)(copy.deepcopy(self._data)) else: - return copy.copy(self._data) + return type(self)(copy.copy(self._data)) def _concat_same_type(cls, to_concat): chunks = list(itertools.chain.from_iterable(x._data.chunks diff --git a/pandas/tests/extension/json/array.py b/pandas/tests/extension/json/array.py index d58b7ddf29123..bd50584406312 100644 --- a/pandas/tests/extension/json/array.py +++ b/pandas/tests/extension/json/array.py @@ -157,6 +157,12 @@ def astype(self, dtype, copy=True): # NumPy has issues when all the dicts are the same length. # np.array([UserDict(...), UserDict(...)]) fails, # but np.array([{...}, {...}]) works, so cast. 
+ + # needed to add this check for the Series constructor + if isinstance(dtype, type(self.dtype)) and dtype == self.dtype: + if copy: + return self.copy() + return self return np.array([dict(x) for x in self], dtype=dtype, copy=copy) def unique(self): diff --git a/pandas/tests/extension/test_common.py b/pandas/tests/extension/test_common.py index 2bc4bf5df2298..db3f3b80bca6b 100644 --- a/pandas/tests/extension/test_common.py +++ b/pandas/tests/extension/test_common.py @@ -77,14 +77,6 @@ def test_astype_no_copy(): assert arr is not result -@pytest.mark.parametrize('dtype', [ - dtypes.DatetimeTZDtype('ns', 'US/Central'), -]) -def test_is_not_extension_array_dtype(dtype): - assert not isinstance(dtype, dtypes.ExtensionDtype) - assert not is_extension_array_dtype(dtype) - - @pytest.mark.parametrize('dtype', [ dtypes.CategoricalDtype(), dtypes.IntervalDtype(), diff --git a/pandas/tests/extension/test_datetime.py b/pandas/tests/extension/test_datetime.py new file mode 100644 index 0000000000000..7c4491d6edbcf --- /dev/null +++ b/pandas/tests/extension/test_datetime.py @@ -0,0 +1,237 @@ +import numpy as np +import pytest + +from pandas.core.dtypes.dtypes import DatetimeTZDtype + +import pandas as pd +from pandas.core.arrays import DatetimeArrayMixin as DatetimeArray +from pandas.tests.extension import base + + +@pytest.fixture(params=["US/Central"]) +def dtype(request): + return DatetimeTZDtype(unit="ns", tz=request.param) + + +@pytest.fixture +def data(dtype): + data = DatetimeArray(pd.date_range("2000", periods=100, tz=dtype.tz), + dtype=dtype) + return data + + +@pytest.fixture +def data_missing(dtype): + return DatetimeArray( + np.array(['NaT', '2000-01-01'], dtype='datetime64[ns]'), + dtype=dtype + ) + + +@pytest.fixture +def data_for_sorting(dtype): + a = pd.Timestamp('2000-01-01') + b = pd.Timestamp('2000-01-02') + c = pd.Timestamp('2000-01-03') + return DatetimeArray(np.array([b, c, a], dtype='datetime64[ns]'), + dtype=dtype) + + +@pytest.fixture +def 
data_missing_for_sorting(dtype): + a = pd.Timestamp('2000-01-01') + b = pd.Timestamp('2000-01-02') + return DatetimeArray(np.array([b, 'NaT', a], dtype='datetime64[ns]'), + dtype=dtype) + + +@pytest.fixture +def data_for_grouping(dtype): + """ + Expected to be like [B, B, NA, NA, A, A, B, C] + + Where A < B < C and NA is missing + """ + a = pd.Timestamp('2000-01-01') + b = pd.Timestamp('2000-01-02') + c = pd.Timestamp('2000-01-03') + na = 'NaT' + return DatetimeArray(np.array([b, b, na, na, a, a, b, c], + dtype='datetime64[ns]'), + dtype=dtype) + + +@pytest.fixture +def na_cmp(): + def cmp(a, b): + return a is pd.NaT and a is b + return cmp + + +@pytest.fixture +def na_value(): + return pd.NaT + + +# ---------------------------------------------------------------------------- +class BaseDatetimeTests(object): + pass + + +# ---------------------------------------------------------------------------- +# Tests +class TestDatetimeDtype(BaseDatetimeTests, base.BaseDtypeTests): + pass + + +class TestConstructors(BaseDatetimeTests, base.BaseConstructorsTests): + pass + + +class TestGetitem(BaseDatetimeTests, base.BaseGetitemTests): + pass + + +class TestMethods(BaseDatetimeTests, base.BaseMethodsTests): + @pytest.mark.skip(reason="Incorrect expected") + def test_value_counts(self, all_data, dropna): + pass + + def test_combine_add(self, data_repeated): + # Timestamp.__add__(Timestamp) not defined + pass + + +class TestInterface(BaseDatetimeTests, base.BaseInterfaceTests): + + def test_array_interface(self, data): + if data.tz: + # np.asarray(DTA) is currently always tz-naive. 
+ pytest.skip("GH-23569") + else: + super(TestInterface, self).test_array_interface(data) + + +class TestArithmeticOps(BaseDatetimeTests, base.BaseArithmeticOpsTests): + implements = {'__sub__', '__rsub__'} + + def test_arith_series_with_scalar(self, data, all_arithmetic_operators): + if all_arithmetic_operators in self.implements: + s = pd.Series(data) + self.check_opname(s, all_arithmetic_operators, s.iloc[0], + exc=None) + else: + # ... but not the rest. + super(TestArithmeticOps, self).test_arith_series_with_scalar( + data, all_arithmetic_operators + ) + + def test_add_series_with_extension_array(self, data): + # Datetime + Datetime not implemented + s = pd.Series(data) + msg = 'cannot add DatetimeArray(Mixin)? and DatetimeArray(Mixin)?' + with pytest.raises(TypeError, match=msg): + s + data + + def test_arith_series_with_array(self, data, all_arithmetic_operators): + if all_arithmetic_operators in self.implements: + s = pd.Series(data) + self.check_opname(s, all_arithmetic_operators, s.iloc[0], + exc=None) + else: + # ... but not the rest. + super(TestArithmeticOps, self).test_arith_series_with_scalar( + data, all_arithmetic_operators + ) + + def test_error(self, data, all_arithmetic_operators): + pass + + @pytest.mark.xfail(reason="different implementation", strict=False) + def test_direct_arith_with_series_returns_not_implemented(self, data): + # Right now, we have trouble with this. Returning NotImplemented + # fails other tests like + # tests/arithmetic/test_datetime64::TestTimestampSeriesArithmetic:: + # test_dt64_seris_add_intlike + return super( + TestArithmeticOps, + self + ).test_direct_arith_with_series_returns_not_implemented(data) + + +class TestCasting(BaseDatetimeTests, base.BaseCastingTests): + pass + + +class TestComparisonOps(BaseDatetimeTests, base.BaseComparisonOpsTests): + + def _compare_other(self, s, data, op_name, other): + # the base test is not appropriate for us. We raise on comparison + # with (some) integers, depending on the value. 
+ pass + + @pytest.mark.xfail(reason="different implementation", strict=False) + def test_direct_arith_with_series_returns_not_implemented(self, data): + return super( + TestComparisonOps, + self + ).test_direct_arith_with_series_returns_not_implemented(data) + + +class TestMissing(BaseDatetimeTests, base.BaseMissingTests): + pass + + +class TestReshaping(BaseDatetimeTests, base.BaseReshapingTests): + + @pytest.mark.skip(reason="We have DatetimeTZBlock") + def test_concat(self, data, in_frame): + pass + + def test_concat_mixed_dtypes(self, data): + # concat(Series[datetimetz], Series[category]) uses a + # plain np.array(values) on the DatetimeArray, which + # drops the tz. + super(TestReshaping, self).test_concat_mixed_dtypes(data) + + @pytest.mark.parametrize("obj", ["series", "frame"]) + def test_unstack(self, obj): + # GH-13287: can't use base test, since building the expected fails. + data = DatetimeArray._from_sequence(['2000', '2001', '2002', '2003'], + tz='US/Central') + index = pd.MultiIndex.from_product(([['A', 'B'], ['a', 'b']]), + names=['a', 'b']) + + if obj == "series": + ser = pd.Series(data, index=index) + expected = pd.DataFrame({ + "A": data.take([0, 1]), + "B": data.take([2, 3]) + }, index=pd.Index(['a', 'b'], name='b')) + expected.columns.name = 'a' + + else: + ser = pd.DataFrame({"A": data, "B": data}, index=index) + expected = pd.DataFrame( + {("A", "A"): data.take([0, 1]), + ("A", "B"): data.take([2, 3]), + ("B", "A"): data.take([0, 1]), + ("B", "B"): data.take([2, 3])}, + index=pd.Index(['a', 'b'], name='b') + ) + expected.columns.names = [None, 'a'] + + result = ser.unstack(0) + self.assert_equal(result, expected) + + +class TestSetitem(BaseDatetimeTests, base.BaseSetitemTests): + pass + + +class TestGroupby(BaseDatetimeTests, base.BaseGroupbyTests): + pass + + +class TestPrinting(BaseDatetimeTests, base.BasePrintingTests): + pass diff --git a/pandas/tests/frame/test_indexing.py b/pandas/tests/frame/test_indexing.py index 
6e006c1707604..418046e42d581 100644 --- a/pandas/tests/frame/test_indexing.py +++ b/pandas/tests/frame/test_indexing.py @@ -3244,10 +3244,10 @@ def test_setitem(self): # are copies) b1 = df._data.blocks[1] b2 = df._data.blocks[2] - assert b1.values.equals(b2.values) - if b1.values.values.base is not None: + tm.assert_extension_array_equal(b1.values, b2.values) + if b1.values._data.base is not None: # base being None suffices to assure a copy was made - assert id(b1.values.values.base) != id(b2.values.values.base) + assert id(b1.values._data.base) != id(b2.values._data.base) # with nan df2 = df.copy() diff --git a/pandas/tests/frame/test_timezones.py b/pandas/tests/frame/test_timezones.py index cd93f3a1148dd..f124a4c3f3570 100644 --- a/pandas/tests/frame/test_timezones.py +++ b/pandas/tests/frame/test_timezones.py @@ -12,12 +12,47 @@ from pandas.core.dtypes.dtypes import DatetimeTZDtype +import pandas as pd from pandas import DataFrame, Series from pandas.core.indexes.datetimes import date_range import pandas.util.testing as tm class TestDataFrameTimezones(object): + + def test_frame_values_with_tz(self): + tz = "US/Central" + df = DataFrame({"A": date_range('2000', periods=4, tz=tz)}) + result = df.values + expected = np.array([ + [pd.Timestamp('2000-01-01', tz=tz)], + [pd.Timestamp('2000-01-02', tz=tz)], + [pd.Timestamp('2000-01-03', tz=tz)], + [pd.Timestamp('2000-01-04', tz=tz)], + ]) + tm.assert_numpy_array_equal(result, expected) + + # two columns, homogenous + + df = df.assign(B=df.A) + result = df.values + expected = np.concatenate([expected, expected], axis=1) + tm.assert_numpy_array_equal(result, expected) + + # three columns, heterogenous + est = "US/Eastern" + df = df.assign(C=df.A.dt.tz_convert(est)) + + new = np.array([ + [pd.Timestamp('2000-01-01T01:00:00', tz=est)], + [pd.Timestamp('2000-01-02T01:00:00', tz=est)], + [pd.Timestamp('2000-01-03T01:00:00', tz=est)], + [pd.Timestamp('2000-01-04T01:00:00', tz=est)], + ]) + expected = 
np.concatenate([expected, new], axis=1) + result = df.values + tm.assert_numpy_array_equal(result, expected) + def test_frame_from_records_utc(self): rec = {'datum': 1.5, 'begin_time': datetime(2006, 4, 27, tzinfo=pytz.utc)} diff --git a/pandas/tests/indexes/timedeltas/test_construction.py b/pandas/tests/indexes/timedeltas/test_construction.py index a4e925f6611f9..b9bbfaff06215 100644 --- a/pandas/tests/indexes/timedeltas/test_construction.py +++ b/pandas/tests/indexes/timedeltas/test_construction.py @@ -26,7 +26,7 @@ def test_int64_nocopy(self): # and copy=False arr = np.arange(10, dtype=np.int64) tdi = TimedeltaIndex(arr, copy=False) - assert tdi._data.base is arr + assert tdi._data._data.base is arr def test_infer_from_tdi(self): # GH#23539 diff --git a/pandas/tests/internals/test_internals.py b/pandas/tests/internals/test_internals.py index 26cd39c4b807c..b9196971d2e53 100644 --- a/pandas/tests/internals/test_internals.py +++ b/pandas/tests/internals/test_internals.py @@ -11,8 +11,13 @@ from distutils.version import LooseVersion import itertools from pandas import (Index, MultiIndex, DataFrame, DatetimeIndex, - Series, Categorical, TimedeltaIndex, SparseArray) + Series, Categorical, SparseArray) + from pandas.compat import OrderedDict, lrange +from pandas.core.arrays import ( + DatetimeArrayMixin as DatetimeArray, + TimedeltaArrayMixin as TimedeltaArray, +) from pandas.core.internals import (SingleBlockManager, make_block, BlockManager) import pandas.core.algorithms as algos @@ -290,7 +295,7 @@ def test_make_block_same_class(self): block = create_block('M8[ns, US/Eastern]', [3]) with tm.assert_produces_warning(DeprecationWarning, check_stacklevel=False): - block.make_block_same_class(block.values.values, + block.make_block_same_class(block.values, dtype=block.values.dtype) @@ -451,7 +456,7 @@ def test_copy(self, mgr): assert cp_blk.values.base is blk.values.base else: # DatetimeTZBlock has DatetimeIndex values - assert cp_blk.values.values.base is 
blk.values.values.base + assert cp_blk.values._data.base is blk.values._data.base cp = mgr.copy(deep=True) for blk, cp_blk in zip(mgr.blocks, cp.blocks): @@ -460,7 +465,7 @@ def test_copy(self, mgr): # some blocks it is an array (e.g. datetimetz), but was copied assert cp_blk.equals(blk) if not isinstance(cp_blk.values, np.ndarray): - assert cp_blk.values.values.base is not blk.values.values.base + assert cp_blk.values._data.base is not blk.values._data.base else: assert cp_blk.values.base is None and blk.values.base is None @@ -1258,9 +1263,9 @@ def test_binop_other(self, op, value, dtype): @pytest.mark.parametrize('typestr, holder', [ ('category', Categorical), - ('M8[ns]', DatetimeIndex), - ('M8[ns, US/Central]', DatetimeIndex), - ('m8[ns]', TimedeltaIndex), + ('M8[ns]', DatetimeArray), + ('M8[ns, US/Central]', DatetimeArray), + ('m8[ns]', TimedeltaArray), ('sparse', SparseArray), ]) def test_holder(typestr, holder): diff --git a/pandas/tests/io/test_parquet.py b/pandas/tests/io/test_parquet.py index 9034b964033ed..d985ca4eb67ea 100644 --- a/pandas/tests/io/test_parquet.py +++ b/pandas/tests/io/test_parquet.py @@ -8,6 +8,7 @@ import pytest from pandas.compat import PY3 +import pandas.util._test_decorators as td import pandas as pd from pandas.util import testing as tm @@ -469,6 +470,7 @@ def test_partition_cols_supported(self, pa, df_full): class TestParquetFastParquet(Base): + @td.skip_if_no('fastparquet', min_version="0.2.1") def test_basic(self, fp, df_full): df = df_full diff --git a/pandas/tests/reshape/test_concat.py b/pandas/tests/reshape/test_concat.py index 051462c5e9fc6..3997aade16892 100644 --- a/pandas/tests/reshape/test_concat.py +++ b/pandas/tests/reshape/test_concat.py @@ -1015,13 +1015,17 @@ def test_append_empty_frame_to_series_with_dateutil_tz(self): s = Series({'date': date, 'a': 1.0, 'b': 2.0}) df = DataFrame(columns=['c', 'd']) result = df.append(s, ignore_index=True) + # n.b. it's not clear to me that expected is correct here. 
+ # It's possible that the `date` column should have + # datetime64[ns, tz] dtype for both result and expected. + # that would be more consistent with new columns having + # their own dtype (float for a and b, datetime64ns, tz for date). expected = DataFrame([[np.nan, np.nan, 1., 2., date]], - columns=['c', 'd', 'a', 'b', 'date']) + columns=['c', 'd', 'a', 'b', 'date'], + dtype=object) # These columns get cast to object after append - object_cols = ['c', 'd', 'date'] - expected.loc[:, object_cols] = expected.loc[:, object_cols].astype( - object - ) + expected['a'] = expected['a'].astype(float) + expected['b'] = expected['b'].astype(float) assert_frame_equal(result, expected) diff --git a/pandas/tests/series/test_block_internals.py b/pandas/tests/series/test_block_internals.py index ccfb169cc2f8d..e74b32181ce0f 100644 --- a/pandas/tests/series/test_block_internals.py +++ b/pandas/tests/series/test_block_internals.py @@ -16,14 +16,14 @@ def test_setitem_invalidates_datetime_index_freq(self): ts = dti[1] ser = pd.Series(dti) assert ser._values is not dti - assert ser._values._data.base is not dti._data.base + assert ser._values._data.base is not dti._data._data.base assert dti.freq == 'D' ser.iloc[1] = pd.NaT assert ser._values.freq is None # check that the DatetimeIndex was not altered in place assert ser._values is not dti - assert ser._values._data.base is not dti._data.base + assert ser._values._data.base is not dti._data._data.base assert dti[1] == ts assert dti.freq == 'D' @@ -33,9 +33,10 @@ def test_dt64tz_setitem_does_not_mutate_dti(self): ts = dti[0] ser = pd.Series(dti) assert ser._values is not dti - assert ser._values._data.base is not dti._data.base + assert ser._values._data.base is not dti._data._data.base assert ser._data.blocks[0].values is not dti - assert ser._data.blocks[0].values._data.base is not dti._data.base + assert (ser._data.blocks[0].values._data.base + is not dti._data._data.base) ser[::3] = pd.NaT assert ser[0] is pd.NaT diff --git 
a/pandas/tests/test_base.py b/pandas/tests/test_base.py index 85650a9b0df0d..dbdbb0bc238a9 100644 --- a/pandas/tests/test_base.py +++ b/pandas/tests/test_base.py @@ -16,6 +16,7 @@ from pandas.core.dtypes.common import ( is_datetime64_dtype, is_datetime64tz_dtype, is_object_dtype, is_timedelta64_dtype, needs_i8_conversion) +from pandas.core.dtypes.dtypes import DatetimeTZDtype import pandas as pd from pandas import ( @@ -365,7 +366,9 @@ def test_value_counts_unique_nunique(self): else: expected_index = Index(values[::-1]) idx = o.index.repeat(range(1, len(o) + 1)) - rep = np.repeat(values, range(1, len(o) + 1)) + # take-based repeat + indices = np.repeat(np.arange(len(o)), range(1, len(o) + 1)) + rep = values.take(indices) o = klass(rep, index=idx, name='a') # check values has the same dtype as the original @@ -1154,14 +1157,32 @@ def test_iter_box(self): (np.array([0, 1], dtype=np.int64), np.ndarray, 'int64'), (np.array(['a', 'b']), np.ndarray, 'object'), (pd.Categorical(['a', 'b']), pd.Categorical, 'category'), - (pd.DatetimeIndex(['2017', '2018']), np.ndarray, 'datetime64[ns]'), (pd.DatetimeIndex(['2017', '2018'], tz="US/Central"), DatetimeArray, 'datetime64[ns, US/Central]'), - (pd.TimedeltaIndex([10**10]), np.ndarray, 'm8[ns]'), + (pd.PeriodIndex([2018, 2019], freq='A'), pd.core.arrays.PeriodArray, pd.core.dtypes.dtypes.PeriodDtype("A-DEC")), (pd.IntervalIndex.from_breaks([0, 1, 2]), pd.core.arrays.IntervalArray, 'interval'), + + # This test is currently failing for datetime64[ns] and timedelta64[ns]. + # The NumPy type system is sufficient for representing these types, so + # we just use NumPy for Series / DataFrame columns of these types (so + # we get consolidation and so on). + # However, DatetimeIndex and TimedeltaIndex use the DateLikeArray + # abstraction to for code reuse. 
+ # At the moment, we've judged that allowing this test to fail is more + # practical that overriding Series._values to special case + # Series[M8[ns]] and Series[m8[ns]] to return a DateLikeArray. + pytest.param( + pd.DatetimeIndex(['2017', '2018']), np.ndarray, 'datetime64[ns]', + marks=[pytest.mark.xfail(reason="datetime _values", strict=True)] + ), + pytest.param( + pd.TimedeltaIndex([10**10]), np.ndarray, 'm8[ns]', + marks=[pytest.mark.xfail(reason="timedelta _values", strict=True)] + ), + ]) def test_values_consistent(array, expected_type, dtype): l_values = pd.Series(array)._values @@ -1218,7 +1239,13 @@ def test_numpy_array_all_dtypes(any_numpy_dtype): (pd.core.arrays.integer_array([0, np.nan]), '_data'), (pd.core.arrays.IntervalArray.from_breaks([0, 1]), '_left'), (pd.SparseArray([0, 1]), '_sparse_values'), - # TODO: DatetimeArray(add) + (DatetimeArray(np.array([1, 2], dtype="datetime64[ns]")), "_data"), + # tz-aware Datetime + (DatetimeArray(np.array(['2000-01-01T12:00:00', + '2000-01-02T12:00:00'], + dtype='M8[ns]'), + dtype=DatetimeTZDtype(tz="US/Central")), + '_data'), ]) @pytest.mark.parametrize('box', [pd.Series, pd.Index]) def test_array(array, attr, box): @@ -1249,7 +1276,22 @@ def test_array_multiindex_raises(): (pd.core.arrays.IntervalArray.from_breaks([0, 1, 2]), np.array([pd.Interval(0, 1), pd.Interval(1, 2)], dtype=object)), (pd.SparseArray([0, 1]), np.array([0, 1], dtype=np.int64)), - # TODO: DatetimeArray(add) + + # tz-naive datetime + (DatetimeArray(np.array(['2000', '2001'], dtype='M8[ns]')), + np.array(['2000', '2001'], dtype='M8[ns]')), + + # tz-aware stays tz`-aware + (DatetimeArray(np.array(['2000-01-01T06:00:00', + '2000-01-02T06:00:00'], + dtype='M8[ns]'), + dtype=DatetimeTZDtype(tz='US/Central')), + np.array([pd.Timestamp('2000-01-01', tz='US/Central'), + pd.Timestamp('2000-01-02', tz='US/Central')])), + + # Timedelta + (TimedeltaArray(np.array([0, 3600000000000], dtype='i8'), freq='H'), + np.array([0, 3600000000000], 
dtype='m8[ns]')), ]) @pytest.mark.parametrize('box', [pd.Series, pd.Index]) def test_to_numpy(array, expected, box): @@ -1290,13 +1332,18 @@ def test_to_numpy_dtype(as_series): obj = pd.DatetimeIndex(['2000', '2001'], tz=tz) if as_series: obj = pd.Series(obj) - result = obj.to_numpy(dtype=object) + + # preserve tz by default + result = obj.to_numpy() expected = np.array([pd.Timestamp('2000', tz=tz), pd.Timestamp('2001', tz=tz)], dtype=object) tm.assert_numpy_array_equal(result, expected) - result = obj.to_numpy() + result = obj.to_numpy(dtype="object") + tm.assert_numpy_array_equal(result, expected) + + result = obj.to_numpy(dtype="M8[ns]") expected = np.array(['2000-01-01T05', '2001-01-01T05'], dtype='M8[ns]') tm.assert_numpy_array_equal(result, expected) diff --git a/pandas/tests/test_panel.py b/pandas/tests/test_panel.py index 5539778e1d187..ba0ad72e624f7 100644 --- a/pandas/tests/test_panel.py +++ b/pandas/tests/test_panel.py @@ -469,7 +469,8 @@ def test_delitem_and_pop(self): def test_setitem(self): lp = self.panel.filter(['ItemA', 'ItemB']).to_frame() - with pytest.raises(ValueError): + + with pytest.raises(TypeError): self.panel['ItemE'] = lp # DataFrame diff --git a/pandas/util/testing.py b/pandas/util/testing.py index 3c902ce7dc0d8..a6ba62bbdea1e 100644 --- a/pandas/util/testing.py +++ b/pandas/util/testing.py @@ -1313,6 +1313,13 @@ def assert_series_equal(left, right, check_dtype=True, elif is_interval_dtype(left) or is_interval_dtype(right): assert_interval_array_equal(left.array, right.array) + elif (is_extension_array_dtype(left.dtype) and + is_datetime64tz_dtype(left.dtype)): + # .values is an ndarray, but ._values is the ExtensionArray. 
+ # TODO: Use .array + assert is_extension_array_dtype(right.dtype) + return assert_extension_array_equal(left._values, right._values) + elif (is_extension_array_dtype(left) and not is_categorical_dtype(left) and is_extension_array_dtype(right) and not is_categorical_dtype(right)): return assert_extension_array_equal(left.array, right.array)