From 41f09d899c4eaa726f0f0f7ffbc55d924a5dcab7 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Sat, 3 Feb 2018 14:13:57 -0600 Subject: [PATCH 01/36] REF/Clean: Internal / External values --- doc/source/internals.rst | 15 +++++ pandas/core/base.py | 48 +++++++++++--- pandas/core/dtypes/concat.py | 15 +++-- pandas/core/indexes/base.py | 65 ++++++++++++------- pandas/core/indexes/category.py | 25 +++++-- pandas/core/indexes/datetimelike.py | 2 +- pandas/core/indexes/datetimes.py | 9 +++ pandas/core/indexes/multi.py | 38 ++++++----- pandas/core/indexes/numeric.py | 2 +- pandas/core/indexes/period.py | 42 +++++++----- pandas/core/series.py | 4 +- pandas/io/formats/format.py | 2 +- pandas/io/pytables.py | 2 +- pandas/plotting/_converter.py | 6 +- pandas/tests/indexes/common.py | 6 +- .../tests/indexes/period/test_construction.py | 4 +- pandas/tests/indexes/period/test_period.py | 6 +- pandas/tests/indexes/period/test_tools.py | 2 +- pandas/tests/test_base.py | 65 ++++++++++++++++++- 19 files changed, 265 insertions(+), 93 deletions(-) diff --git a/doc/source/internals.rst b/doc/source/internals.rst index ee4df879d9478..29aaed318b802 100644 --- a/doc/source/internals.rst +++ b/doc/source/internals.rst @@ -89,6 +89,21 @@ not check (or care) whether the levels themselves are sorted. Fortunately, the constructors ``from_tuples`` and ``from_arrays`` ensure that this is true, but if you compute the levels and labels yourself, please be careful. +Values +~~~~~~ + +Pandas extends NumPy's type system in a few places, so we have multiple notions of "values" floating around. +For 1-D containers (``Index`` classes and ``Series``) we have the following convention: + +* ``cls._ndarray_values`` is *always* an ``ndarray`` +* ``cls._values`` refers to the "best possible" array. This could be an ``ndarray``, ``ExtensionArray``, or + an ``Index`` subclass (note: we're in the process of removing the index subclasses here so that it's + always an ``ndarray`` or ``ExtensionArray``). + +So, for example, ``Series[category]._values`` is a ``Categorical``, while ``Series[category]._ndarray_values`` is +the underlying ndarray. + + ..
_ref-subclassing-pandas: Subclassing pandas Data Structures diff --git a/pandas/core/base.py b/pandas/core/base.py index d5b204dba063e..52b1f82e8824d 100644 --- a/pandas/core/base.py +++ b/pandas/core/base.py @@ -7,7 +7,8 @@ import numpy as np from pandas.core.dtypes.missing import isna -from pandas.core.dtypes.generic import ABCDataFrame, ABCSeries, ABCIndexClass +from pandas.core.dtypes.generic import ( + ABCDataFrame, ABCSeries, ABCIndexClass, ABCDatetimeIndex) from pandas.core.dtypes.common import ( is_object_dtype, is_list_like, @@ -706,7 +707,7 @@ def transpose(self, *args, **kwargs): @property def shape(self): """ return a tuple of the shape of the underlying data """ - return self._values.shape + return self._ndarray_values.shape @property def ndim(self): @@ -734,22 +735,22 @@ def data(self): @property def itemsize(self): """ return the size of the dtype of the item of the underlying data """ - return self._values.itemsize + return self._ndarray_values.itemsize @property def nbytes(self): """ return the number of bytes in the underlying data """ - return self._values.nbytes + return self._ndarray_values.nbytes @property def strides(self): """ return the strides of the underlying data """ - return self._values.strides + return self._ndarray_values.strides @property def size(self): """ return the number of elements in the underlying data """ - return self._values.size + return self._ndarray_values.size @property def flags(self): @@ -763,9 +764,34 @@ def base(self): """ return self.values.base + @property + def _ndarray_values(self): + """The data as an ndarray. See '_values' for more.""" + # type: () -> np.ndarray + return self.values + @property def _values(self): - """ the internal implementation """ + # type: () -> Union[ExtensionArray, Index] + # TODO: remove index types as they become is extension arrays + """ The best array representation. + + This is an ndarray, ExtensionArray, or Index subclass. This differs + from '._ndarray_values', which always returns an ndarray. It may differ + from the public '.values' + + index | values | _values + ----------------- | -------------- -| ---------- + CategoricalIndex | Categorical | Categorical + DatetimeIndex[tz] | ndarray[M8ns] | DTI[tz] + PeriodIndex | ndarray[Period] | ndarray[Pd] (soon PeriodArray) + IntervalIndex | ndarray[IV] | ndarray[IV] (soon IntervalArray) + + See Also + -------- + values + _ndarray_values + """ return self.values @property @@ -816,7 +842,7 @@ def tolist(self): if is_datetimelike(self): return [com._maybe_box_datetimelike(x) for x in self._values] else: - return self._values.tolist() + return self._ndarray_values.tolist() def __iter__(self): """ @@ -973,8 +999,12 @@ def value_counts(self, normalize=False, sort=True, ascending=False, @Appender(_shared_docs['unique'] % _indexops_doc_kwargs) def unique(self): values = self._values - + if isinstance(values, ABCDatetimeIndex): + values = values._ndarray_values + # TODO: Make unique part of the ExtensionArray interface. + # else, this could be surprising. 
if hasattr(values, 'unique'): + result = values.unique() else: from pandas.core.algorithms import unique1d diff --git a/pandas/core/dtypes/concat.py b/pandas/core/dtypes/concat.py index ddecbe85087d8..a49a2680e4daa 100644 --- a/pandas/core/dtypes/concat.py +++ b/pandas/core/dtypes/concat.py @@ -480,7 +480,7 @@ def _concat_datetimetz(to_concat, name=None): def _concat_index_same_dtype(indexes, klass=None): klass = klass if klass is not None else indexes[0].__class__ - return klass(np.concatenate([x._values for x in indexes])) + return klass(np.concatenate([x._ndarray_values for x in indexes])) def _concat_index_asobject(to_concat, name=None): @@ -498,9 +498,16 @@ def _concat_index_asobject(to_concat, name=None): attribs = self._get_attributes_dict() attribs['name'] = name - to_concat = [x._values if isinstance(x, Index) else x - for x in to_concat] - return self._shallow_copy_with_infer(np.concatenate(to_concat), **attribs) + arrays = [] + for x in to_concat: + if is_categorical_dtype(x): + arrays.append(np.asarray(x, dtype=object)) + elif isinstance(x, Index): + arrays.append(x._values) + else: + arrays.append(x) + + return self._shallow_copy_with_infer(np.concatenate(arrays), **attribs) def _concat_sparse(to_concat, axis=0, typs=None): diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 1e1bb0d49b3df..450e0f47ef6ff 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -392,7 +392,7 @@ def _simple_new(cls, values, name=None, dtype=None, **kwargs): values = np.array(values, copy=False) if is_object_dtype(values): values = cls(values, name=name, dtype=dtype, - **kwargs)._values + **kwargs)._ndarray_values result = object.__new__(cls) result._data = values @@ -644,7 +644,7 @@ def ravel(self, order='C'): -------- numpy.ndarray.ravel """ - return self._values.ravel(order=order) + return self._ndarray_values.ravel(order=order) # construction helpers @classmethod @@ -1577,7 +1577,7 @@ def _constructor(self): @cache_readonly def _engine(self): # property, for now, slow to look up - return self._engine_type(lambda: self._values, len(self)) + return self._engine_type(lambda: self._ndarray_values, len(self)) def _validate_index_level(self, level): """ @@ -2208,27 +2208,37 @@ def union(self, other): other = other.astype('O') return this.union(other) + if is_categorical_dtype(self): + lvals = self.values + else: + lvals = self._ndarray_values + + if is_categorical_dtype(other): + rvals = other.values + else: + rvals = other._ndarray_values + if self.is_monotonic and other.is_monotonic: try: - result = self._outer_indexer(self._values, other._values)[0] + result = self._outer_indexer(lvals, rvals)[0] except TypeError: # incomparable objects - result = list(self._values) + result = list(lvals) # worth making this faster? 
a very unusual case - value_set = set(self._values) - result.extend([x for x in other._values if x not in value_set]) + value_set = set(lvals) + result.extend([x for x in rvals if x not in value_set]) else: indexer = self.get_indexer(other) indexer, = (indexer == -1).nonzero() if len(indexer) > 0: - other_diff = algos.take_nd(other._values, indexer, + other_diff = algos.take_nd(rvals, indexer, allow_fill=False) - result = _concat._concat_compat((self._values, other_diff)) + result = _concat._concat_compat((lvals, other_diff)) try: - self._values[0] < other_diff[0] + lvals[0] < other_diff[0] except TypeError as e: warnings.warn("%s, sort order is undefined for " "incomparable objects" % e, RuntimeWarning, @@ -2240,7 +2250,7 @@ def union(self, other): result.sort() else: - result = self._values + result = lvals try: result = np.sort(result) @@ -2293,18 +2303,21 @@ def intersection(self, other): if self.is_monotonic and other.is_monotonic: try: - result = self._inner_indexer(self._values, other._values)[0] + result = self._inner_indexer(self._ndarray_values, + other._ndarray_values)[0] return self._wrap_union_result(other, result) except TypeError: pass try: - indexer = Index(other._values).get_indexer(self._values) + indexer = Index(other._ndarray_values).get_indexer( + self._ndarray_values) indexer = indexer.take((indexer != -1).nonzero()[0]) except Exception: # duplicates indexer = algos.unique1d( - Index(other._values).get_indexer_non_unique(self._values)[0]) + Index(other._ndarray_values).get_indexer_non_unique( + self._ndarray_values)[0]) indexer = indexer[indexer != -1] taken = other.take(indexer) @@ -2680,7 +2693,7 @@ def get_indexer(self, target, method=None, limit=None, tolerance=None): raise ValueError('limit argument only valid if doing pad, ' 'backfill or nearest reindexing') - indexer = self._engine.get_indexer(target._values) + indexer = self._engine.get_indexer(target._ndarray_values) return _ensure_platform_int(indexer) @@ -2696,12 +2709,13 @@ def _get_fill_indexer(self, target, method, limit=None, tolerance=None): if self.is_monotonic_increasing and target.is_monotonic_increasing: method = (self._engine.get_pad_indexer if method == 'pad' else self._engine.get_backfill_indexer) - indexer = method(target._values, limit) + indexer = method(target._ndarray_values, limit) else: indexer = self._get_fill_indexer_searchsorted(target, method, limit) if tolerance is not None: - indexer = self._filter_indexer_tolerance(target._values, indexer, + indexer = self._filter_indexer_tolerance(target._ndarray_values, + indexer, tolerance) return indexer @@ -2792,7 +2806,7 @@ def get_indexer_non_unique(self, target): self = Index(self.asi8) tgt_values = target.asi8 else: - tgt_values = target._values + tgt_values = target._ndarray_values indexer, missing = self._engine.get_indexer_non_unique(tgt_values) return _ensure_platform_int(indexer), missing @@ -3227,16 +3241,17 @@ def _join_multi(self, other, how, return_indexers=True): def _join_non_unique(self, other, how='left', return_indexers=False): from pandas.core.reshape.merge import _get_join_indexers - left_idx, right_idx = _get_join_indexers([self._values], - [other._values], how=how, + left_idx, right_idx = _get_join_indexers([self._ndarray_values], + [other._ndarray_values], + how=how, sort=True) left_idx = _ensure_platform_int(left_idx) right_idx = _ensure_platform_int(right_idx) - join_index = np.asarray(self._values.take(left_idx)) + join_index = np.asarray(self._ndarray_values.take(left_idx)) mask = left_idx == -1 - 
np.putmask(join_index, mask, other._values.take(right_idx)) + np.putmask(join_index, mask, other._ndarray_values.take(right_idx)) join_index = self._wrap_joined_index(join_index, other) @@ -3383,8 +3398,8 @@ def _join_monotonic(self, other, how='left', return_indexers=False): else: return ret_index - sv = self._values - ov = other._values + sv = self._ndarray_values + ov = other._ndarray_values if self.is_unique and other.is_unique: # We can perform much better than the general case @@ -3736,7 +3751,7 @@ def insert(self, loc, item): item = self._na_value _self = np.asarray(self) - item = self._coerce_scalar_to_index(item)._values + item = self._coerce_scalar_to_index(item)._ndarray_values idx = np.concatenate((_self[:loc], item, _self[loc:])) return self._shallow_copy_with_infer(idx) diff --git a/pandas/core/indexes/category.py b/pandas/core/indexes/category.py index 2c7be2b21f959..5b01f7d2cbe95 100644 --- a/pandas/core/indexes/category.py +++ b/pandas/core/indexes/category.py @@ -227,7 +227,7 @@ def _is_dtype_compat(self, other): """ if is_categorical_dtype(other): if isinstance(other, CategoricalIndex): - other = other._values + other = other.values if not other.is_dtype_equal(self): raise TypeError("categories must match existing categories " "when appending") @@ -293,6 +293,23 @@ def values(self): """ return the underlying data, which is a Categorical """ return self._data + @property + def _values(self): + return self._data + + @property + def _ndarray_values(self): + return self._data.codes + + @property + def itemsize(self): + return self.values.itemsize + + @property + def nbytes(self): + """ return the number of bytes in the underlying data """ + return self.values.nbytes + def get_values(self): """ return the underlying data as an ndarray """ return self._data.get_values() @@ -386,8 +403,8 @@ def is_monotonic_decreasing(self): def unique(self, level=None): if level is not None: self._validate_index_level(level) - result = base.IndexOpsMixin.unique(self) - # CategoricalIndex._shallow_copy uses keeps original categories + result = self.values.unique() + # CategoricalIndex._shallow_copy keeps original categories # and ordered if not otherwise specified return self._shallow_copy(result, categories=result.categories, ordered=result.ordered) @@ -762,7 +779,7 @@ def _evaluate_compare(self, other): def _delegate_method(self, name, *args, **kwargs): """ method delegation to the ._values """ - method = getattr(self._values, name) + method = getattr(self.values, name) if 'inplace' in kwargs: raise ValueError("cannot use inplace with CategoricalIndex") res = method(*args, **kwargs) diff --git a/pandas/core/indexes/datetimelike.py b/pandas/core/indexes/datetimelike.py index 8e77c7a7fa48c..94500a58edd4c 100644 --- a/pandas/core/indexes/datetimelike.py +++ b/pandas/core/indexes/datetimelike.py @@ -389,7 +389,7 @@ def sort_values(self, return_indexer=False, ascending=True): sorted_index = self.take(_as) return sorted_index, _as else: - sorted_values = np.sort(self._values) + sorted_values = np.sort(self._ndarray_values) attribs = self._get_attributes_dict() freq = attribs['freq'] diff --git a/pandas/core/indexes/datetimes.py b/pandas/core/indexes/datetimes.py index e09fa87477122..c32d7ce930a7c 100644 --- a/pandas/core/indexes/datetimes.py +++ b/pandas/core/indexes/datetimes.py @@ -678,6 +678,15 @@ def _assert_tzawareness_compat(self, other): raise TypeError('Cannot compare tz-naive and tz-aware ' 'datetime-like objects') + @property + def _values(self): + # tz-naive -> ndarray + # tz-aware -> 
DatetimeIndex + if self.tz is not None: + return self + else: + return self.values + @property def tzinfo(self): """ diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py index 510f7245cebd8..1478012aa9dbe 100644 --- a/pandas/core/indexes/multi.py +++ b/pandas/core/indexes/multi.py @@ -799,9 +799,11 @@ def values(self): box = hasattr(lev, '_box_values') # Try to minimize boxing. if box and len(lev) > len(lab): - taken = lev._box_values(algos.take_1d(lev._values, lab)) + taken = lev._box_values(algos.take_1d(lev._values, + lab)) elif box: - taken = algos.take_1d(lev._box_values(lev._values), lab, + taken = algos.take_1d(lev._box_values(lev._ndarray_values), + lab, fill_value=_get_na_value(lev.dtype.type)) else: taken = algos.take_1d(np.asarray(lev._values), lab) @@ -1317,7 +1319,7 @@ def from_tuples(cls, tuples, sortorder=None, names=None): arrays = [[]] * len(names) elif isinstance(tuples, (np.ndarray, Index)): if isinstance(tuples, Index): - tuples = tuples._values + tuples = tuples._ndarray_values arrays = list(lib.tuples_to_object_array(tuples).T) elif isinstance(tuples, list): @@ -2410,7 +2412,7 @@ def convert_indexer(start, stop, step, indexer=indexer, labels=labels): mapper = Series(indexer) indexer = labels.take(_ensure_platform_int(indexer)) result = Series(Index(indexer).isin(r).nonzero()[0]) - m = result.map(mapper)._values + m = result.map(mapper)._ndarray_values else: m = np.zeros(len(labels), dtype=bool) @@ -2569,7 +2571,7 @@ def _update_indexer(idxr, indexer=indexer): else: from .numeric import Int64Index # no matches we are done - return Int64Index([])._values + return Int64Index([])._ndarray_values elif com.is_null_slice(k): # empty slice @@ -2589,8 +2591,8 @@ def _update_indexer(idxr, indexer=indexer): # empty indexer if indexer is None: - return Int64Index([])._values - return indexer._values + return Int64Index([])._ndarray_values + return indexer._ndarray_values def truncate(self, before=None, after=None): """ @@ -2639,7 +2641,7 @@ def equals(self, other): if not isinstance(other, MultiIndex): other_vals = com._values_from_object(_ensure_index(other)) - return array_equivalent(self._values, other_vals) + return array_equivalent(self._ndarray_values, other_vals) if self.nlevels != other.nlevels: return False @@ -2650,13 +2652,15 @@ def equals(self, other): for i in range(self.nlevels): slabels = self.labels[i] slabels = slabels[slabels != -1] - svalues = algos.take_nd(np.asarray(self.levels[i]._values), - slabels, allow_fill=False) + svalues = algos.take_nd( + np.asarray(self.levels[i]._values), + slabels, allow_fill=False) olabels = other.labels[i] olabels = olabels[olabels != -1] - ovalues = algos.take_nd(np.asarray(other.levels[i]._values), - olabels, allow_fill=False) + ovalues = algos.take_nd( + np.asarray(other.levels[i]._values), + olabels, allow_fill=False) # since we use NaT both datetime64 and timedelta64 # we can have a situation where a level is typed say @@ -2704,7 +2708,8 @@ def union(self, other): if len(other) == 0 or self.equals(other): return self - uniq_tuples = lib.fast_unique_multiple([self._values, other._values]) + uniq_tuples = lib.fast_unique_multiple([self._ndarray_values, + other._ndarray_values]) return MultiIndex.from_arrays(lzip(*uniq_tuples), sortorder=0, names=result_names) @@ -2726,8 +2731,8 @@ def intersection(self, other): if self.equals(other): return self - self_tuples = self._values - other_tuples = other._values + self_tuples = self._ndarray_values + other_tuples = other._ndarray_values uniq_tuples = 
sorted(set(self_tuples) & set(other_tuples)) if len(uniq_tuples) == 0: return MultiIndex(levels=[[]] * self.nlevels, @@ -2756,7 +2761,8 @@ def difference(self, other): labels=[[]] * self.nlevels, names=result_names, verify_integrity=False) - difference = sorted(set(self._values) - set(other._values)) + difference = sorted(set(self._ndarray_values) - + set(other._ndarray_values)) if len(difference) == 0: return MultiIndex(levels=[[]] * self.nlevels, diff --git a/pandas/core/indexes/numeric.py b/pandas/core/indexes/numeric.py index b02aee0495d8c..a4558116bfa63 100644 --- a/pandas/core/indexes/numeric.py +++ b/pandas/core/indexes/numeric.py @@ -378,7 +378,7 @@ def equals(self, other): if (not is_dtype_equal(self.dtype, other.dtype) or self.shape != other.shape): return False - left, right = self._values, other._values + left, right = self._ndarray_values, other._ndarray_values return ((left == right) | (self._isnan & other._isnan)).all() except (TypeError, ValueError): return False diff --git a/pandas/core/indexes/period.py b/pandas/core/indexes/period.py index 1f8542ed5ee60..c8b7d6063e378 100644 --- a/pandas/core/indexes/period.py +++ b/pandas/core/indexes/period.py @@ -54,7 +54,7 @@ def _field_accessor(name, alias, docstring=None): def f(self): base, mult = _gfc(self.freq) - result = get_period_field_arr(alias, self._values, base) + result = get_period_field_arr(alias, self._ndarray_values, base) return Index(result, name=self.name) f.__name__ = name f.__doc__ = docstring @@ -82,7 +82,7 @@ def _period_index_cmp(opname, cls, nat_result=False): def wrapper(self, other): if isinstance(other, Period): - func = getattr(self._values, opname) + func = getattr(self._ndarray_values, opname) other_base, _ = _gfc(other.freq) if other.freq != self.freq: msg = _DIFFERENT_FREQ_INDEX.format(self.freqstr, other.freqstr) @@ -94,7 +94,8 @@ def wrapper(self, other): msg = _DIFFERENT_FREQ_INDEX.format(self.freqstr, other.freqstr) raise IncompatibleFrequency(msg) - result = getattr(self._values, opname)(other._values) + op = getattr(self._ndarray_values, opname) + result = op(other._ndarray_values) mask = self._isnan | other._isnan if mask.any(): @@ -102,11 +103,11 @@ def wrapper(self, other): return result elif other is tslib.NaT: - result = np.empty(len(self._values), dtype=bool) + result = np.empty(len(self._ndarray_values), dtype=bool) result.fill(nat_result) else: other = Period(other, freq=self.freq) - func = getattr(self._values, opname) + func = getattr(self._ndarray_values, opname) result = func(other.ordinal) if self.hasnans: @@ -275,11 +276,11 @@ def __new__(cls, data=None, ordinal=None, freq=None, start=None, end=None, if isinstance(data, PeriodIndex): if freq is None or freq == data.freq: # no freq change freq = data.freq - data = data._values + data = data._ndarray_values else: base1, _ = _gfc(data.freq) base2, _ = _gfc(freq) - data = period.period_asfreq_arr(data._values, + data = period.period_asfreq_arr(data._ndarray_values, base1, base2, 1) return cls._simple_new(data, name=name, freq=freq) @@ -374,7 +375,7 @@ def _shallow_copy(self, values=None, freq=None, **kwargs): if freq is None: freq = self.freq if values is None: - values = self._values + values = self._ndarray_values return super(PeriodIndex, self)._shallow_copy(values=values, freq=freq, **kwargs) @@ -407,7 +408,7 @@ def __contains__(self, key): @property def asi8(self): - return self._values.view('i8') + return self._ndarray_values.view('i8') @cache_readonly def _int64index(self): @@ -419,6 +420,12 @@ def values(self): @property def 
_values(self): + # TODO: return PeriodArray + return self.values + + @property + def _ndarray_values(self): + # Ordinals return self._data def __array__(self, dtype=None): @@ -489,13 +496,15 @@ def asof_locs(self, where, mask): if isinstance(where_idx, DatetimeIndex): where_idx = PeriodIndex(where_idx.values, freq=self.freq) - locs = self._values[mask].searchsorted(where_idx._values, side='right') + locs = self._ndarray_values[mask].searchsorted( + where_idx._ndarray_values, side='right') locs = np.where(locs > 0, locs - 1, 0) result = np.arange(len(self))[mask].take(locs) first = mask.argmax() - result[(locs == 0) & (where_idx._values < self._values[first])] = -1 + result[(locs == 0) & (where_idx._ndarray_values < + self._ndarray_values[first])] = -1 return result @@ -523,7 +532,8 @@ def searchsorted(self, value, side='left', sorter=None): elif isinstance(value, compat.string_types): value = Period(value, freq=self.freq).ordinal - return self._values.searchsorted(value, side=side, sorter=sorter) + return self._ndarray_values.searchsorted(value, side=side, + sorter=sorter) @property def is_all_dates(self): @@ -664,7 +674,7 @@ def to_timestamp(self, freq=None, how='start'): base, mult = _gfc(freq) new_data = self.asfreq(freq, how) - new_data = period.periodarr_to_dt64arr(new_data._values, base) + new_data = period.periodarr_to_dt64arr(new_data._ndarray_values, base) return DatetimeIndex(new_data, freq='infer', name=self.name) def _maybe_convert_timedelta(self, other): @@ -744,7 +754,7 @@ def shift(self, n): ------- shifted : PeriodIndex """ - values = self._values + n * self.freq.n + values = self._ndarray_values + n * self.freq.n if self.hasnans: values[self._isnan] = tslib.iNaT return self._shallow_copy(values=values) @@ -775,7 +785,7 @@ def get_value(self, series, key): grp = resolution.Resolution.get_freq_group(reso) freqn = resolution.get_freq_group(self.freq) - vals = self._values + vals = self._ndarray_values # if our data is higher resolution than requested key, slice if grp < freqn: @@ -786,7 +796,7 @@ def get_value(self, series, key): if ord2 < vals[0] or ord1 > vals[-1]: raise KeyError(key) - pos = np.searchsorted(self._values, [ord1, ord2]) + pos = np.searchsorted(self._ndarray_values, [ord1, ord2]) key = slice(pos[0], pos[1] + 1) return series[key] elif grp == freqn: diff --git a/pandas/core/series.py b/pandas/core/series.py index e4b8979d6393a..b0ad76d12f1d9 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -1303,7 +1303,9 @@ def unique(self): # to return an object array of tz-aware Timestamps # TODO: it must return DatetimeArray with tz in pandas 2.0 - result = result.astype(object).values + # XXX: This surely will have issues around DST boundaries. 
+ result = (DatetimeIndex(result, tz='UTC').tz_convert(self.dtype.tz) + .astype(object).values) return result diff --git a/pandas/io/formats/format.py b/pandas/io/formats/format.py index 269c81b380b5e..bbeb9e162452d 100644 --- a/pandas/io/formats/format.py +++ b/pandas/io/formats/format.py @@ -1884,7 +1884,7 @@ def _format(x): vals = self.values if isinstance(vals, Index): - vals = vals._values + vals = vals._ndarray_values elif isinstance(vals, ABCSparseArray): vals = vals.values diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py index 0d833807602e1..2437b7d396e84 100644 --- a/pandas/io/pytables.py +++ b/pandas/io/pytables.py @@ -4430,7 +4430,7 @@ def _convert_index(index, encoding=None, format_type=None): elif isinstance(index, (Int64Index, PeriodIndex)): atom = _tables().Int64Col() # avoid to store ndarray of Period objects - return IndexCol(index._values, 'integer', atom, + return IndexCol(index._ndarray_values, 'integer', atom, freq=getattr(index, 'freq', None), index_name=index_name) diff --git a/pandas/plotting/_converter.py b/pandas/plotting/_converter.py index 07163615c6ba4..9ca06475290e4 100644 --- a/pandas/plotting/_converter.py +++ b/pandas/plotting/_converter.py @@ -249,11 +249,11 @@ def _convert_1d(values, units, axis): is_float(values)): return get_datevalue(values, axis.freq) if isinstance(values, PeriodIndex): - return values.asfreq(axis.freq)._values + return values.asfreq(axis.freq)._ndarray_values if isinstance(values, Index): return values.map(lambda x: get_datevalue(x, axis.freq)) if is_period_arraylike(values): - return PeriodIndex(values, freq=axis.freq)._values + return PeriodIndex(values, freq=axis.freq)._ndarray_values if isinstance(values, (list, tuple, np.ndarray, Index)): return [get_datevalue(x, axis.freq) for x in values] return values @@ -642,7 +642,7 @@ def _daily_finder(vmin, vmax, freq): info = np.zeros(span, dtype=[('val', np.int64), ('maj', bool), ('min', bool), ('fmt', '|S20')]) - info['val'][:] = dates_._values + info['val'][:] = dates_._ndarray_values info['fmt'][:] = '' info['maj'][[0, -1]] = True # .. 
and set some shortcuts diff --git a/pandas/tests/indexes/common.py b/pandas/tests/indexes/common.py index 8948c5f79900d..2d8d70aa2ac84 100644 --- a/pandas/tests/indexes/common.py +++ b/pandas/tests/indexes/common.py @@ -314,7 +314,8 @@ def test_ensure_copied_data(self): # .values an object array of Period, thus copied result = index_type(ordinal=index.asi8, copy=False, **init_kwargs) - tm.assert_numpy_array_equal(index._values, result._values, + tm.assert_numpy_array_equal(index._ndarray_values, + result._ndarray_values, check_same='same') elif isinstance(index, IntervalIndex): # checked in test_interval.py @@ -323,7 +324,8 @@ def test_ensure_copied_data(self): result = index_type(index.values, copy=False, **init_kwargs) tm.assert_numpy_array_equal(index.values, result.values, check_same='same') - tm.assert_numpy_array_equal(index._values, result._values, + tm.assert_numpy_array_equal(index._ndarray_values, + result._ndarray_values, check_same='same') def test_copy_and_deepcopy(self, indices): diff --git a/pandas/tests/indexes/period/test_construction.py b/pandas/tests/indexes/period/test_construction.py index 639a9272c3808..eca80d17b1dc3 100644 --- a/pandas/tests/indexes/period/test_construction.py +++ b/pandas/tests/indexes/period/test_construction.py @@ -119,8 +119,8 @@ def test_constructor_fromarraylike(self): tm.assert_index_equal(PeriodIndex(idx.values), idx) tm.assert_index_equal(PeriodIndex(list(idx.values)), idx) - pytest.raises(ValueError, PeriodIndex, idx._values) - pytest.raises(ValueError, PeriodIndex, list(idx._values)) + pytest.raises(ValueError, PeriodIndex, idx._ndarray_values) + pytest.raises(ValueError, PeriodIndex, list(idx._ndarray_values)) pytest.raises(TypeError, PeriodIndex, data=Period('2007', freq='A')) diff --git a/pandas/tests/indexes/period/test_period.py b/pandas/tests/indexes/period/test_period.py index 6fc7fa5486f82..e3b1256fa0584 100644 --- a/pandas/tests/indexes/period/test_period.py +++ b/pandas/tests/indexes/period/test_period.py @@ -205,7 +205,7 @@ def test_values(self): tm.assert_numpy_array_equal(idx.values, exp) tm.assert_numpy_array_equal(idx.get_values(), exp) exp = np.array([], dtype=np.int64) - tm.assert_numpy_array_equal(idx._values, exp) + tm.assert_numpy_array_equal(idx._ndarray_values, exp) idx = pd.PeriodIndex(['2011-01', pd.NaT], freq='M') @@ -213,7 +213,7 @@ def test_values(self): tm.assert_numpy_array_equal(idx.values, exp) tm.assert_numpy_array_equal(idx.get_values(), exp) exp = np.array([492, -9223372036854775808], dtype=np.int64) - tm.assert_numpy_array_equal(idx._values, exp) + tm.assert_numpy_array_equal(idx._ndarray_values, exp) idx = pd.PeriodIndex(['2011-01-01', pd.NaT], freq='D') @@ -222,7 +222,7 @@ def test_values(self): tm.assert_numpy_array_equal(idx.values, exp) tm.assert_numpy_array_equal(idx.get_values(), exp) exp = np.array([14975, -9223372036854775808], dtype=np.int64) - tm.assert_numpy_array_equal(idx._values, exp) + tm.assert_numpy_array_equal(idx._ndarray_values, exp) def test_period_index_length(self): pi = PeriodIndex(freq='A', start='1/1/2001', end='12/1/2009') diff --git a/pandas/tests/indexes/period/test_tools.py b/pandas/tests/indexes/period/test_tools.py index 0e72cadb5d494..f5a62371ae799 100644 --- a/pandas/tests/indexes/period/test_tools.py +++ b/pandas/tests/indexes/period/test_tools.py @@ -22,7 +22,7 @@ class TestPeriodRepresentation(object): def _check_freq(self, freq, base_date): rng = PeriodIndex(start=base_date, periods=10, freq=freq) exp = np.arange(10, dtype=np.int64) - 
tm.assert_numpy_array_equal(rng._values, exp) + tm.assert_numpy_array_equal(rng.asi8, exp) def test_annual(self): diff --git a/pandas/tests/test_base.py b/pandas/tests/test_base.py index df2547fc7b0da..5a67aa3f989ae 100644 --- a/pandas/tests/test_base.py +++ b/pandas/tests/test_base.py @@ -338,8 +338,9 @@ def test_ops(self): if not isinstance(o, PeriodIndex): expected = getattr(o.values, op)() else: - expected = pd.Period(ordinal=getattr(o._values, op)(), - freq=o.freq) + expected = pd.Period( + ordinal=getattr(o._ndarray_values, op)(), + freq=o.freq) try: assert result == expected except TypeError: @@ -450,7 +451,7 @@ def test_value_counts_unique_nunique_null(self): for orig in self.objs: o = orig.copy() klass = type(o) - values = o._values + values = o._ndarray_values if not self._allow_na_ops(o): continue @@ -1175,3 +1176,61 @@ def test_iter_box(self): assert isinstance(res, pd.Period) assert res.freq == 'M' assert res == exp + + +@pytest.mark.parametrize('arr, expected', [ + (pd.DatetimeIndex(['2017', '2017']), pd.DatetimeIndex(['2017'])), + (pd.DatetimeIndex(['2017', '2017'], tz='US/Eastern'), + pd.DatetimeIndex(['2017'], tz='US/Eastern')), +]) +def test_unique_datetime_index(arr, expected): + result = arr.unique() + + if isinstance(expected, np.ndarray): + tm.assert_numpy_array_equal(result, expected) + if isinstance(expected, pd.Series): + tm.assert_series_equal(result, expected) + if isinstance(expected, pd.DatetimeIndex): + tm.assert_index_equal(result, expected) + + +@pytest.mark.parametrize('arr, expected', [ + (pd.Series(pd.DatetimeIndex(['2017', '2017'])), + np.array(['2017'], dtype='M8[ns]')), + (pd.Series(pd.DatetimeIndex(['2017', '2017'], tz='US/Eastern')), + np.array([pd.Timestamp('2017', tz="US/Eastern")], dtype=object)), +]) +def test_unique_datetime_series(arr, expected): + result = arr.unique() + + if isinstance(expected, np.ndarray): + tm.assert_numpy_array_equal(result, expected) + if isinstance(expected, pd.Series): + tm.assert_series_equal(result, expected) + if isinstance(expected, pd.DatetimeIndex): + tm.assert_index_equal(result, expected) + + +@pytest.mark.parametrize('array, expected_type', [ + (np.array([0, 1]), np.ndarray), + (np.array(['a', 'b']), np.ndarray), + (pd.Categorical(['a', 'b']), pd.Categorical), + (pd.DatetimeIndex(['2017', '2018']), np.ndarray), + (pd.PeriodIndex([2018, 2019], freq='A'), np.ndarray), + (pd.IntervalIndex.from_breaks([0, 1, 2]), np.ndarray), + (pd.DatetimeIndex(['2017', '2018'], tz="US/Central"), pd.DatetimeIndex), +]) +def test_values_consistent(array, expected_type): + l_values = pd.Series(array)._values + r_values = pd.Index(array)._values + assert type(l_values) is expected_type + assert type(l_values) is type(r_values) + + if isinstance(l_values, np.ndarray): + tm.assert_numpy_array_equal(l_values, r_values) + elif isinstance(l_values, pd.Index): + tm.assert_index_equal(l_values, r_values) + elif pd.api.types.is_categorical(l_values): + tm.assert_categorical_equal(l_values, r_values) + else: + raise TypeError("Unexpected type {}".format(type(l_values))) From 29cfd7c22dd0b5b67c44144f1520f0bce8bf0e74 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Tue, 6 Feb 2018 14:34:22 -0600 Subject: [PATCH 02/36] Move to index base --- pandas/core/base.py | 24 ------------------------ pandas/core/indexes/base.py | 24 ++++++++++++++++++++++++ 2 files changed, 24 insertions(+), 24 deletions(-) diff --git a/pandas/core/base.py b/pandas/core/base.py index 52b1f82e8824d..ab4c969810c93 100644 --- a/pandas/core/base.py +++ 
b/pandas/core/base.py @@ -770,30 +770,6 @@ def _ndarray_values(self): # type: () -> np.ndarray return self.values - @property - def _values(self): - # type: () -> Union[ExtensionArray, Index] - # TODO: remove index types as they become is extension arrays - """ The best array representation. - - This is an ndarray, ExtensionArray, or Index subclass. This differs - from '._ndarray_values', which always returns an ndarray. It may differ - from the public '.values' - - index | values | _values - ----------------- | -------------- -| ---------- - CategoricalIndex | Categorical | Categorical - DatetimeIndex[tz] | ndarray[M8ns] | DTI[tz] - PeriodIndex | ndarray[Period] | ndarray[Pd] (soon PeriodArray) - IntervalIndex | ndarray[IV] | ndarray[IV] (soon IntervalArray) - - See Also - -------- - values - _ndarray_values - """ - return self.values - @property def empty(self): return not self.size diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 450e0f47ef6ff..d84c4dcb58f83 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -574,6 +574,30 @@ def values(self): """ return the underlying data as an ndarray """ return self._data.view(np.ndarray) + @property + def _values(self): + # type: () -> Union[ExtensionArray, Index] + # TODO: remove index types as they become is extension arrays + """The best array representation. + + This is an ndarray, ExtensionArray, or Index subclass. This differs + from '._ndarray_values', which always returns an ndarray. It may differ + from the public '.values' + + index | values | _values + ----------------- | -------------- -| ---------- + CategoricalIndex | Categorical | Categorical + DatetimeIndex[tz] | ndarray[M8ns] | DTI[tz] + PeriodIndex | ndarray[Period] | ndarray[Pd] (soon PeriodArray) + IntervalIndex | ndarray[IV] | ndarray[IV] (soon IntervalArray) + + See Also + -------- + values + _ndarray_values + """ + return self.values + def get_values(self): """ return the underlying data as an ndarray """ return self.values From 3185f4e08fdde6736a02edb52da2647cae8d599c Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Wed, 7 Feb 2018 13:17:40 -0600 Subject: [PATCH 03/36] Cleanup unique handling --- pandas/core/base.py | 4 +--- pandas/core/indexes/datetimes.py | 12 ++++++++++++ pandas/core/series.py | 4 +--- 3 files changed, 14 insertions(+), 6 deletions(-) diff --git a/pandas/core/base.py b/pandas/core/base.py index ab4c969810c93..7a8b5f9b608c7 100644 --- a/pandas/core/base.py +++ b/pandas/core/base.py @@ -975,10 +975,8 @@ def value_counts(self, normalize=False, sort=True, ascending=False, @Appender(_shared_docs['unique'] % _indexops_doc_kwargs) def unique(self): values = self._values - if isinstance(values, ABCDatetimeIndex): - values = values._ndarray_values + # TODO: Make unique part of the ExtensionArray interface. - # else, this could be surprising. if hasattr(values, 'unique'): result = values.unique() diff --git a/pandas/core/indexes/datetimes.py b/pandas/core/indexes/datetimes.py index c32d7ce930a7c..d749f8aec50cd 100644 --- a/pandas/core/indexes/datetimes.py +++ b/pandas/core/indexes/datetimes.py @@ -1095,6 +1095,18 @@ def snap(self, freq='S'): # we know it conforms; skip check return DatetimeIndex(snapped, freq=freq, verify_integrity=False) + def unique(self, level=None): + # Override here since IndexOpsMixin.unique uses self._values.unique + # For DatetimeIndex with TZ, that's a DatetimeIndex -> recursion error + # So we extract the tz-naive DatetimeIndex, unique that, and wrap the + # result with out TZ. 
+ if self.tz is not None: + naive = type(self)(self._ndarray_values, copy=False) + else: + naive = self + result = super(DatetimeIndex, naive).unique(level=level) + return self._simple_new(result, name=self.name, tz=self.tz, freq=self.freq) + def union(self, other): """ Specialized union for DatetimeIndex objects. If combine diff --git a/pandas/core/series.py b/pandas/core/series.py index b0ad76d12f1d9..e4b8979d6393a 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -1303,9 +1303,7 @@ def unique(self): # to return an object array of tz-aware Timestamps # TODO: it must return DatetimeArray with tz in pandas 2.0 - # XXX: This surely will have issues around DST boundaries. - result = (DatetimeIndex(result, tz='UTC').tz_convert(self.dtype.tz) - .astype(object).values) + result = result.astype(object).values return result From 476f75d3b8cf07fb9965a1fa96dcdf932a01bde8 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Wed, 7 Feb 2018 14:29:02 -0600 Subject: [PATCH 04/36] Simplify object concat --- pandas/core/dtypes/concat.py | 11 +---------- 1 file changed, 1 insertion(+), 10 deletions(-) diff --git a/pandas/core/dtypes/concat.py b/pandas/core/dtypes/concat.py index a49a2680e4daa..d6b55d03ebccd 100644 --- a/pandas/core/dtypes/concat.py +++ b/pandas/core/dtypes/concat.py @@ -493,20 +493,11 @@ def _concat_index_asobject(to_concat, name=None): to_concat = [x.astype(object) if isinstance(x, klasses) else x for x in to_concat] - from pandas import Index self = to_concat[0] attribs = self._get_attributes_dict() attribs['name'] = name - arrays = [] - for x in to_concat: - if is_categorical_dtype(x): - arrays.append(np.asarray(x, dtype=object)) - elif isinstance(x, Index): - arrays.append(x._values) - else: - arrays.append(x) - + arrays = [np.array(x, copy=False, dtype=object) for x in to_concat] return self._shallow_copy_with_infer(np.concatenate(arrays), **attribs) From b15ee5a000003e42bf65389308c7277b6461fd05 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Wed, 7 Feb 2018 14:38:58 -0600 Subject: [PATCH 05/36] Use values for intersection I think eventually we'll want to use ndarray_values for this, but it'll require a bit more work to support. Currently, using ndarray_values causes occasional failures on categorical.
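To make the categorical failure mode mentioned above concrete, here is a small illustrative sketch using only public pandas API (it is not code from this series): a categorical's integer codes are only meaningful relative to its own categories, so two indexes holding equal values can carry different codes, and matching on the codes alone pairs up the wrong elements.

    import pandas as pd

    a = pd.CategoricalIndex(['a', 'b'], categories=['a', 'b'])
    b = pd.CategoricalIndex(['a', 'b'], categories=['b', 'a'])

    # The values agree ...
    assert list(a) == list(b)            # both ['a', 'b']

    # ... but the codes do not, so an indexer built purely on the codes
    # (what _ndarray_values exposes for categoricals) would mis-align them.
    print(list(a.codes), list(b.codes))  # [0, 1] vs [1, 0]

Working from _values (the Categorical itself) keeps the categories in the comparison, which is why this commit steps back to _values for the intersection path.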
--- pandas/core/indexes/base.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index dd4c8ac2e86a3..70c0c822fb5e8 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -2347,8 +2347,7 @@ def intersection(self, other): if self.is_monotonic and other.is_monotonic: try: - result = self._inner_indexer(self._ndarray_values, - other._ndarray_values)[0] + result = self._inner_indexer(self._values, other._values)[0] return self._wrap_union_result(other, result) except TypeError: pass From 659073f8a67e513267048d467da715c60d885c51 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Wed, 7 Feb 2018 15:14:32 -0600 Subject: [PATCH 06/36] hmm --- pandas/core/indexes/base.py | 22 +++++++++++++++++++++- pandas/core/indexes/category.py | 17 +++++++++++++++++ 2 files changed, 38 insertions(+), 1 deletion(-) diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 70c0c822fb5e8..260016661a735 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -2310,6 +2310,24 @@ def _wrap_union_result(self, other, result): name = self.name if self.name == other.name else None return self.__class__(result, name=name) + def _ensure_join(self, values): + """Ensure that the 'values' are ready for our join indexer. + + The default join indexers are object, so this just returns 'values'. + This is called before calling those. + + + Parameters + ---------- + values : array-like + + Returns + ------- + values : ndarray + Expected to have the correct type for self.inner_indexer + """ + return values + def intersection(self, other): """ Form the intersection of two Index objects. @@ -2347,7 +2365,9 @@ def intersection(self, other): if self.is_monotonic and other.is_monotonic: try: - result = self._inner_indexer(self._values, other._values)[0] + lvals = self._ensure_join(self._ndarray_values) + rvals = self._ensure_join(other._ndarray_values) + result = self._inner_indexer(lvals, rvals)[0] return self._wrap_union_result(other, result) except TypeError: pass diff --git a/pandas/core/indexes/category.py b/pandas/core/indexes/category.py index 5b01f7d2cbe95..48cdd28911487 100644 --- a/pandas/core/indexes/category.py +++ b/pandas/core/indexes/category.py @@ -1,5 +1,6 @@ import numpy as np from pandas._libs import index as libindex +from pandas._libs import join as libjoin from pandas import compat from pandas.compat.numpy import function as nv @@ -8,6 +9,8 @@ from pandas.core.dtypes.common import ( is_categorical_dtype, _ensure_platform_int, + _ensure_int32, + _ensure_int64, is_list_like, is_interval_dtype, is_scalar) @@ -214,6 +217,14 @@ def _shallow_copy(self, values=None, categories=None, ordered=None, values=values, categories=categories, ordered=ordered, **kwargs) + @cache_readonly + def _inner_indexer(self): + if self.codes.dtype.itemsize <= 4: + # int8, int16, int32 + return libjoin.inner_join_indexer_int32 + else: + return libjoin.inner_join_indexer_int64 + def _is_dtype_compat(self, other): """ *this is an internal non-public method* @@ -787,6 +798,12 @@ def _delegate_method(self, name, *args, **kwargs): return res return CategoricalIndex(res, name=self.name) + def _ensure_join(self, values): + if self.codes.dtype.itemsize <= 4: + return _ensure_int32(values) + else: + return _ensure_int64(values) + @classmethod def _add_accessors(cls): """ add in Categorical accessor methods """ From 9b8d2a51857a4d8c78ce09c6e54097ab9eddbb08 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Thu, 8 
Feb 2018 08:54:19 -0600 Subject: [PATCH 07/36] Additional testing --- pandas/tests/test_base.py | 29 ++++++++++++++++++++--------- 1 file changed, 20 insertions(+), 9 deletions(-) diff --git a/pandas/tests/test_base.py b/pandas/tests/test_base.py index 5a67aa3f989ae..0dbced114ce51 100644 --- a/pandas/tests/test_base.py +++ b/pandas/tests/test_base.py @@ -1211,16 +1211,17 @@ def test_unique_datetime_series(arr, expected): tm.assert_index_equal(result, expected) -@pytest.mark.parametrize('array, expected_type', [ - (np.array([0, 1]), np.ndarray), - (np.array(['a', 'b']), np.ndarray), - (pd.Categorical(['a', 'b']), pd.Categorical), - (pd.DatetimeIndex(['2017', '2018']), np.ndarray), - (pd.PeriodIndex([2018, 2019], freq='A'), np.ndarray), - (pd.IntervalIndex.from_breaks([0, 1, 2]), np.ndarray), - (pd.DatetimeIndex(['2017', '2018'], tz="US/Central"), pd.DatetimeIndex), +@pytest.mark.parametrize('array, expected_type, dtype', [ + (np.array([0, 1]), np.ndarray, 'int64'), + (np.array(['a', 'b']), np.ndarray, 'object'), + (pd.Categorical(['a', 'b']), pd.Categorical, 'category'), + (pd.DatetimeIndex(['2017', '2018']), np.ndarray, 'datetime64[ns]'), + (pd.PeriodIndex([2018, 2019], freq='A'), np.ndarray, 'object'), + (pd.IntervalIndex.from_breaks([0, 1, 2]), np.ndarray, 'object'), + (pd.DatetimeIndex(['2017', '2018'], tz="US/Central"), pd.DatetimeIndex, + 'datetime64[ns, US/Central]'), ]) -def test_values_consistent(array, expected_type): +def test_values_consistent(array, expected_type, dtype): l_values = pd.Series(array)._values r_values = pd.Index(array)._values assert type(l_values) is expected_type @@ -1234,3 +1235,13 @@ def test_values_consistent(array, expected_type): tm.assert_categorical_equal(l_values, r_values) else: raise TypeError("Unexpected type {}".format(type(l_values))) + + assert l_values.dtype == dtype + assert r_values.dtype == dtype + + +def test_values_periodindex(): + arr = pd.period_range("2017", periods=4, freq='D') + result = arr._values + expected = np.array(arr.astype(object)) + tm.assert_numpy_array_equal(result, expected) From 9fbac2959dc34f64133b44fa8274189abcc07655 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Thu, 8 Feb 2018 13:55:34 -0600 Subject: [PATCH 08/36] More tests --- pandas/tests/test_base.py | 26 ++++++++++++++++++++++++-- 1 file changed, 24 insertions(+), 2 deletions(-) diff --git a/pandas/tests/test_base.py b/pandas/tests/test_base.py index 0dbced114ce51..94449663b580b 100644 --- a/pandas/tests/test_base.py +++ b/pandas/tests/test_base.py @@ -1216,10 +1216,11 @@ def test_unique_datetime_series(arr, expected): (np.array(['a', 'b']), np.ndarray, 'object'), (pd.Categorical(['a', 'b']), pd.Categorical, 'category'), (pd.DatetimeIndex(['2017', '2018']), np.ndarray, 'datetime64[ns]'), - (pd.PeriodIndex([2018, 2019], freq='A'), np.ndarray, 'object'), - (pd.IntervalIndex.from_breaks([0, 1, 2]), np.ndarray, 'object'), (pd.DatetimeIndex(['2017', '2018'], tz="US/Central"), pd.DatetimeIndex, 'datetime64[ns, US/Central]'), + (pd.TimedeltaIndex([10**10]), np.ndarray, 'm8[ns]'), + (pd.PeriodIndex([2018, 2019], freq='A'), np.ndarray, 'object'), + (pd.IntervalIndex.from_breaks([0, 1, 2]), np.ndarray, 'object'), ]) def test_values_consistent(array, expected_type, dtype): l_values = pd.Series(array)._values @@ -1245,3 +1246,24 @@ def test_values_periodindex(): result = arr._values expected = np.array(arr.astype(object)) tm.assert_numpy_array_equal(result, expected) + + +@pytest.mark.parametrize('array, expected', [ + (np.array([0, 1]), np.array([0, 1])), + 
(np.array(['0', '1']), np.array(['0', '1'], dtype=object)), + (pd.Categorical(['a', 'a']), np.array([0, 0], dtype='int8')), + (pd.DatetimeIndex(['2017-01-01T00:00:00']), + np.array(['2017-01-01T00:00:00'], dtype='M8[ns]')), + (pd.DatetimeIndex(['2017-01-01T00:00:00'], tz="US/Eastern"), + np.array(['2017-01-01T05:00:00'], dtype='M8[ns]')), + (pd.TimedeltaIndex([10**10]), np.array([10**10], dtype='m8[ns]')), + pytest.mark.xfail(reason='PeriodArray not implemented')(( + pd.PeriodIndex(['2017', '2018'], freq='D'), + np.array([17167, 17532]), + )), +]) +def test_ndarray_values(array, expected): + l_values = pd.Series(array)._ndarray_values + r_values = pd.Index(array)._ndarray_values + tm.assert_numpy_array_equal(l_values, r_values) + tm.assert_numpy_array_equal(l_values, expected) From 55305dc197cf7444aa50eab3ba426d5b7244672a Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Thu, 8 Feb 2018 14:29:08 -0600 Subject: [PATCH 09/36] ndarray_values --- pandas/core/base.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/pandas/core/base.py b/pandas/core/base.py index 62f237e253c96..dd950a7b8ff00 100644 --- a/pandas/core/base.py +++ b/pandas/core/base.py @@ -772,6 +772,11 @@ def base(self): def _ndarray_values(self): """The data as an ndarray. See '_values' for more.""" # type: () -> np.ndarray + from pandas.core.dtypes.common import is_categorical_dtype + + if is_categorical_dtype(self): + return self._values.codes + return self.values @property From 0e637086e1e89ed7c580e5b731b030d524431a34 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Thu, 8 Feb 2018 15:01:28 -0600 Subject: [PATCH 10/36] API: Default ExtensionArray.astype (cherry picked from commit 943a915562b72bed147c857de927afa0daf31c1a) (cherry picked from commit fbf0a0672380e210d3cb3c527fa8045a204d81be) --- pandas/core/arrays/base.py | 30 +++++++++++++++++ pandas/tests/extension_arrays/test_common.py | 34 ++++++++++++++++++++ 2 files changed, 64 insertions(+) create mode 100644 pandas/tests/extension_arrays/test_common.py diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py index 1556b653819a6..8c3d033dffba7 100644 --- a/pandas/core/arrays/base.py +++ b/pandas/core/arrays/base.py @@ -1,4 +1,6 @@ """An interface for extending pandas with custom arrays.""" +import numpy as np + from pandas.errors import AbstractMethodError _not_implemented_message = "{} does not implement {}." @@ -138,6 +140,34 @@ def nbytes(self): # ------------------------------------------------------------------------ # Additional Methods # ------------------------------------------------------------------------ + def astype(self, dtype, copy=True): + """Cast to a NumPy array with 'dtype'. + + The default implementation only allows casting to 'object' dtype. + + Parameters + ---------- + dtype : str or dtype + Typecode or data-type to which the array is cast. + copy : bool, default True + Whether to copy the data, even if not necessary. If False, + a copy is made only if the old dtype does not match the + new dtype. + + Returns + ------- + array : ndarray + NumPy ndarray with 'dtype' for its dtype. + """ + np_dtype = np.dtype(dtype) + + if np_dtype != 'object': + msg = ("{} can only be coerced to 'object' dtype, " + "not '{}'.").format(type(self).__name__, dtype) + raise ValueError(msg) + + return np.array(self, dtype=np_dtype, copy=copy) + def isna(self): # type: () -> np.ndarray """Boolean NumPy array indicating if each value is missing. 
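For authors of third-party extension arrays, the intended division of labour with this default astype is roughly the following sketch. MyFloatArray and its float64 storage are hypothetical, invented purely for illustration; the only behaviour taken from this patch is that the base-class astype handles casting to object dtype and raises for anything else, so a subclass overrides it only where it can do better.

    import numpy as np
    from pandas.core.arrays import ExtensionArray

    class MyFloatArray(ExtensionArray):        # hypothetical subclass
        def __init__(self, data):
            self.data = np.asarray(data, dtype='float64')

        def __array__(self, dtype=None):
            # Lets np.array(self, dtype=object) in the default astype work.
            return self.data

        def astype(self, dtype, copy=True):
            if np.dtype(dtype) == np.dtype(object):
                # Defer to the object-dtype default added in this patch.
                return super(MyFloatArray, self).astype(dtype, copy=copy)
            # Otherwise cast the backing ndarray directly.
            return self.data.astype(dtype, copy=copy)

    arr = MyFloatArray([1.0, 2.0])
    arr.astype('float32')   # fast path through the backing ndarray
    arr.astype(object)      # default path, returns an object ndarray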
diff --git a/pandas/tests/extension_arrays/test_common.py b/pandas/tests/extension_arrays/test_common.py new file mode 100644 index 0000000000000..7feb7fdf09ec6 --- /dev/null +++ b/pandas/tests/extension_arrays/test_common.py @@ -0,0 +1,34 @@ +import numpy as np + +import pandas.util.testing as tm +from pandas.core.arrays import ExtensionArray + + +class DummyArray(ExtensionArray): + + def __init__(self, data): + self.data = data + + def __array__(self, dtype): + return self.data + + +def test_astype(): + arr = DummyArray(np.array([1, 2, 3])) + expected = np.array([1, 2, 3], dtype=object) + + result = arr.astype(object) + tm.assert_numpy_array_equal(result, expected) + + result = arr.astype('object') + tm.assert_numpy_array_equal(result, expected) + + +def test_astype_raises(): + arr = DummyArray(np.array([1, 2, 3])) + + xpr = ("DummyArray can only be coerced to 'object' dtype, not " + "''") + + with tm.assert_raises_regex(ValueError, xpr): + arr.astype(int) From fbbbc8a08b9bfe66cbe06621795163d65dbd3c77 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Thu, 8 Feb 2018 15:22:43 -0600 Subject: [PATCH 11/36] Simplify concat_as_object --- pandas/core/dtypes/concat.py | 10 +++++++--- pandas/tests/indexes/test_category.py | 8 ++++++++ 2 files changed, 15 insertions(+), 3 deletions(-) diff --git a/pandas/core/dtypes/concat.py b/pandas/core/dtypes/concat.py index d6b55d03ebccd..b36dc03bbc82b 100644 --- a/pandas/core/dtypes/concat.py +++ b/pandas/core/dtypes/concat.py @@ -488,8 +488,11 @@ def _concat_index_asobject(to_concat, name=None): concat all inputs as object. DatetimeIndex, TimedeltaIndex and PeriodIndex are converted to object dtype before concatenation """ + from pandas import Index + from pandas.core.arrays import ExtensionArray - klasses = ABCDatetimeIndex, ABCTimedeltaIndex, ABCPeriodIndex + klasses = (ABCDatetimeIndex, ABCTimedeltaIndex, ABCPeriodIndex, + ExtensionArray) to_concat = [x.astype(object) if isinstance(x, klasses) else x for x in to_concat] @@ -497,8 +500,9 @@ def _concat_index_asobject(to_concat, name=None): attribs = self._get_attributes_dict() attribs['name'] = name - arrays = [np.array(x, copy=False, dtype=object) for x in to_concat] - return self._shallow_copy_with_infer(np.concatenate(arrays), **attribs) + to_concat = [x._values if isinstance(x, Index) else x + for x in to_concat] + return self._shallow_copy_with_infer(np.concatenate(to_concat), **attribs) def _concat_sparse(to_concat, axis=0, typs=None): diff --git a/pandas/tests/indexes/test_category.py b/pandas/tests/indexes/test_category.py index c2e40c79f8914..e9fddfde90348 100644 --- a/pandas/tests/indexes/test_category.py +++ b/pandas/tests/indexes/test_category.py @@ -353,6 +353,14 @@ def test_append(self): expected = Index(list('caaabbca')) tm.assert_index_equal(result, expected, exact=True) + def test_append_to_another(self): + # hits _concat_index_asobject + fst = Index(['a', 'b']) + snd = CategoricalIndex(['d', 'e']) + result = fst.append(snd) + expected = Index(['a', 'b', 'd', 'e']) + tm.assert_index_equal(result, expected) + def test_insert(self): ci = self.create_index() From 46a0a49352a1242077e616056f802b0ce35eb8d9 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Thu, 8 Feb 2018 15:46:36 -0600 Subject: [PATCH 12/36] Py2 compat (cherry picked from commit b20e12cae68dd86ff51597464045656763d369f7) --- pandas/tests/extension_arrays/test_common.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/pandas/tests/extension_arrays/test_common.py 
b/pandas/tests/extension_arrays/test_common.py index 7feb7fdf09ec6..f19754482b04f 100644 --- a/pandas/tests/extension_arrays/test_common.py +++ b/pandas/tests/extension_arrays/test_common.py @@ -27,8 +27,10 @@ def test_astype(): def test_astype_raises(): arr = DummyArray(np.array([1, 2, 3])) + # type int for py2 + # class int for py3 xpr = ("DummyArray can only be coerced to 'object' dtype, not " - "''") + "'<.* 'int'>'") with tm.assert_raises_regex(ValueError, xpr): arr.astype(int) From 2c4445a365d19979b400295ce6a7c671396cb0da Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Thu, 8 Feb 2018 16:30:11 -0600 Subject: [PATCH 13/36] Set-ops ugliness --- pandas/core/indexes/base.py | 52 +++++++++++++-------------------- pandas/core/indexes/category.py | 6 ---- 2 files changed, 21 insertions(+), 37 deletions(-) diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 260016661a735..3ce3ecce1c140 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -31,12 +31,14 @@ is_object_dtype, is_categorical_dtype, is_interval_dtype, + is_period_dtype, is_bool, is_bool_dtype, is_signed_integer_dtype, is_unsigned_integer_dtype, is_integer_dtype, is_float_dtype, is_datetime64_any_dtype, + is_datetime64tz_dtype, is_timedelta64_dtype, needs_i8_conversion, is_iterator, is_list_like, @@ -2252,15 +2254,15 @@ def union(self, other): other = other.astype('O') return this.union(other) - if is_categorical_dtype(self): - lvals = self.values - else: + # TODO: setops-refactor, clean all this up + if is_period_dtype(self) or is_datetime64tz_dtype(self): lvals = self._ndarray_values - - if is_categorical_dtype(other): - rvals = other.values else: + lvals = self._values + if is_period_dtype(other) or is_datetime64tz_dtype(other): rvals = other._ndarray_values + else: + rvals = other._values if self.is_monotonic and other.is_monotonic: try: @@ -2310,24 +2312,6 @@ def _wrap_union_result(self, other, result): name = self.name if self.name == other.name else None return self.__class__(result, name=name) - def _ensure_join(self, values): - """Ensure that the 'values' are ready for our join indexer. - - The default join indexers are object, so this just returns 'values'. - This is called before calling those. - - - Parameters - ---------- - values : array-like - - Returns - ------- - values : ndarray - Expected to have the correct type for self.inner_indexer - """ - return values - def intersection(self, other): """ Form the intersection of two Index objects. 
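For orientation, the user-facing contract that this lvals/rvals branching in union (mirrored in intersection just below) is meant to preserve looks roughly like the following usage sketch; it uses public API only and is not a test from this patch:

    import pandas as pd

    left = pd.date_range('2018-01-01', periods=3, tz='US/Central')
    right = pd.date_range('2018-01-02', periods=3, tz='US/Central')

    # However the values are matched internally (period ordinals, UTC
    # timestamps, or plain object values), the results should keep their
    # index type and dtype.
    left.union(right)         # tz-aware DatetimeIndex
    left.intersection(right)  # tz-aware DatetimeIndex

    p = pd.period_range('2018-01', periods=3, freq='M')
    q = pd.period_range('2018-02', periods=3, freq='M')
    p.union(q)                # PeriodIndex
    p.intersection(q)         # PeriodIndex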
@@ -2363,24 +2347,30 @@ def intersection(self, other): other = other.astype('O') return this.intersection(other) + # TODO: setops-refactor, clean all this up + if is_period_dtype(self): + lvals = self._ndarray_values + else: + lvals = self._values + if is_period_dtype(other): + rvals = other._ndarray_values + else: + rvals = other._values + if self.is_monotonic and other.is_monotonic: try: - lvals = self._ensure_join(self._ndarray_values) - rvals = self._ensure_join(other._ndarray_values) result = self._inner_indexer(lvals, rvals)[0] return self._wrap_union_result(other, result) except TypeError: pass try: - indexer = Index(other._ndarray_values).get_indexer( - self._ndarray_values) + indexer = Index(rvals).get_indexer(lvals) indexer = indexer.take((indexer != -1).nonzero()[0]) except Exception: - # duplicates + # duplicateters indexer = algos.unique1d( - Index(other._ndarray_values).get_indexer_non_unique( - self._ndarray_values)[0]) + Index(rvals).get_indexer_non_unique(lvals)[0]) indexer = indexer[indexer != -1] taken = other.take(indexer) diff --git a/pandas/core/indexes/category.py b/pandas/core/indexes/category.py index 4381b35f6cb86..93ed2507cb829 100644 --- a/pandas/core/indexes/category.py +++ b/pandas/core/indexes/category.py @@ -805,12 +805,6 @@ def _delegate_method(self, name, *args, **kwargs): return res return CategoricalIndex(res, name=self.name) - def _ensure_join(self, values): - if self.codes.dtype.itemsize <= 4: - return _ensure_int32(values) - else: - return _ensure_int64(values) - @classmethod def _add_accessors(cls): """ add in Categorical accessor methods """ From 5612cda29f77b5865df92bb97c6e7a2abde6bcb6 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Thu, 8 Feb 2018 16:46:02 -0600 Subject: [PATCH 14/36] better docstrings --- pandas/core/base.py | 9 ++++++++- pandas/core/indexes/base.py | 27 ++++++++++++++++++--------- 2 files changed, 26 insertions(+), 10 deletions(-) diff --git a/pandas/core/base.py b/pandas/core/base.py index dd950a7b8ff00..744d448b16682 100644 --- a/pandas/core/base.py +++ b/pandas/core/base.py @@ -770,7 +770,14 @@ def base(self): @property def _ndarray_values(self): - """The data as an ndarray. See '_values' for more.""" + """The data as an ndarray, possibly losing information. + + The expectation is that this is cheap to compute. + + - categorical -> codes + + See '_values' for more. + """ # type: () -> np.ndarray from pandas.core.dtypes.common import is_categorical_dtype diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 3ce3ecce1c140..afefa5de2477e 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -603,15 +603,24 @@ def _values(self): """The best array representation. This is an ndarray, ExtensionArray, or Index subclass. This differs - from '._ndarray_values', which always returns an ndarray. It may differ - from the public '.values' - - index | values | _values - ----------------- | -------------- -| ---------- - CategoricalIndex | Categorical | Categorical - DatetimeIndex[tz] | ndarray[M8ns] | DTI[tz] - PeriodIndex | ndarray[Period] | ndarray[Pd] (soon PeriodArray) - IntervalIndex | ndarray[IV] | ndarray[IV] (soon IntervalArray) + from ``_ndarray_values``, which always returns an ndarray. + + Both ``_values`` and ``_ndarray_values`` are consistent between + ``Series`` and ``Index``. + + It may differ from the public '.values' method. 
+ + index | values | _values | _ndarray_values | + ----------------- | -------------- -| ----------- | --------------- | + CategoricalIndex | Categorical | Categorical | codes | + DatetimeIndex[tz] | ndarray[M8ns] | DTI[tz] | datetime@UTC | + + In the near-future, we'll implement two more. + + index | values | _values | _ndarray_values | + ----------------- | --------------- | ----------- | --------------- | + PeriodIndex | ndarray[object] | PeriodArray | ordinals | + IntervalIndex | ndarray[object] | IVArray | ndarray[object] | See Also -------- From b012c1967b6de548b999514fe4b560ba9b7ee635 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Thu, 8 Feb 2018 17:03:15 -0600 Subject: [PATCH 15/36] tolist --- pandas/core/base.py | 3 +++ pandas/core/indexes/base.py | 8 ++++---- 2 files changed, 7 insertions(+), 4 deletions(-) diff --git a/pandas/core/base.py b/pandas/core/base.py index 744d448b16682..f3b0fb9dbe142 100644 --- a/pandas/core/base.py +++ b/pandas/core/base.py @@ -14,6 +14,7 @@ is_list_like, is_scalar, is_datetimelike, + is_categorical_dtype, is_extension_type) from pandas.util._validators import validate_bool_kwarg @@ -833,6 +834,8 @@ def tolist(self): if is_datetimelike(self): return [com._maybe_box_datetimelike(x) for x in self._values] + elif is_categorical_dtype(self): + return self.values.tolist() else: return self._ndarray_values.tolist() diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index afefa5de2477e..9eb0ac1276280 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -617,10 +617,10 @@ def _values(self): In the near-future, we'll implement two more. - index | values | _values | _ndarray_values | - ----------------- | --------------- | ----------- | --------------- | - PeriodIndex | ndarray[object] | PeriodArray | ordinals | - IntervalIndex | ndarray[object] | IVArray | ndarray[object] | + index | values | _values | ndarray_values | + ----------------- | --------------- | ----------- | -------------- | + PeriodIndex | ndarray[object] | PeriodArray + IntervalIndex | IntervalArray | ndarray[Interval] See Also -------- From d49e6aa649a0b02ce612b9d18b663668ade6485a Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Thu, 8 Feb 2018 17:05:46 -0600 Subject: [PATCH 16/36] linting --- pandas/core/indexes/datetimes.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pandas/core/indexes/datetimes.py b/pandas/core/indexes/datetimes.py index 788005531efe1..22ce690b3d420 100644 --- a/pandas/core/indexes/datetimes.py +++ b/pandas/core/indexes/datetimes.py @@ -1105,7 +1105,8 @@ def unique(self, level=None): else: naive = self result = super(DatetimeIndex, naive).unique(level=level) - return self._simple_new(result, name=self.name, tz=self.tz, freq=self.freq) + return self._simple_new(result, name=self.name, tz=self.tz, + freq=self.freq) def union(self, other): """ From d7d31eecc1411f9d68755bd86f80b2a97a34776e Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Fri, 9 Feb 2018 08:21:51 -0600 Subject: [PATCH 17/36] Moved dtypes (cherry picked from commit d1362271bca8a7b183f3241e5c2f040c422118b8) --- pandas/tests/dtypes/test_dtypes.py | 32 +------------------- pandas/tests/extension_arrays/test_common.py | 29 ++++++++++++++++++ 2 files changed, 30 insertions(+), 31 deletions(-) diff --git a/pandas/tests/dtypes/test_dtypes.py b/pandas/tests/dtypes/test_dtypes.py index eca4dd4cf2106..d800a7b92b559 100644 --- a/pandas/tests/dtypes/test_dtypes.py +++ b/pandas/tests/dtypes/test_dtypes.py @@ -10,14 +10,12 @@ Series, Categorical, 
CategoricalIndex, IntervalIndex, date_range) from pandas.compat import string_types -from pandas.core.arrays import ExtensionArray from pandas.core.dtypes.dtypes import ( DatetimeTZDtype, PeriodDtype, - IntervalDtype, CategoricalDtype, ExtensionDtype) + IntervalDtype, CategoricalDtype) from pandas.core.dtypes.common import ( is_categorical_dtype, is_categorical, is_datetime64tz_dtype, is_datetimetz, - is_extension_array_dtype, is_period_dtype, is_period, is_dtype_equal, is_datetime64_ns_dtype, is_datetime64_dtype, is_interval_dtype, @@ -744,31 +742,3 @@ def test_categorical_categories(self): tm.assert_index_equal(c1.categories, pd.Index(['a', 'b'])) c1 = CategoricalDtype(CategoricalIndex(['a', 'b'])) tm.assert_index_equal(c1.categories, pd.Index(['a', 'b'])) - - -class DummyArray(ExtensionArray): - pass - - -class DummyDtype(ExtensionDtype): - pass - - -class TestExtensionArrayDtype(object): - - @pytest.mark.parametrize('values', [ - pd.Categorical([]), - pd.Categorical([]).dtype, - pd.Series(pd.Categorical([])), - DummyDtype(), - DummyArray(), - ]) - def test_is_extension_array_dtype(self, values): - assert is_extension_array_dtype(values) - - @pytest.mark.parametrize('values', [ - np.array([]), - pd.Series(np.array([])), - ]) - def test_is_not_extension_array_dtype(self, values): - assert not is_extension_array_dtype(values) diff --git a/pandas/tests/extension_arrays/test_common.py b/pandas/tests/extension_arrays/test_common.py index f19754482b04f..1fc4526aff951 100644 --- a/pandas/tests/extension_arrays/test_common.py +++ b/pandas/tests/extension_arrays/test_common.py @@ -1,7 +1,15 @@ import numpy as np +import pytest +import pandas as pd import pandas.util.testing as tm from pandas.core.arrays import ExtensionArray +from pandas.core.dtypes.common import is_extension_array_dtype +from pandas.core.dtypes.dtypes import ExtensionDtype + + +class DummyDtype(ExtensionDtype): + pass class DummyArray(ExtensionArray): @@ -13,7 +21,28 @@ def __array__(self, dtype): return self.data +class TestExtensionArrayDtype(object): + + @pytest.mark.parametrize('values', [ + pd.Categorical([]), + pd.Categorical([]).dtype, + pd.Series(pd.Categorical([])), + DummyDtype(), + DummyArray(np.array([1, 2])), + ]) + def test_is_extension_array_dtype(self, values): + assert is_extension_array_dtype(values) + + @pytest.mark.parametrize('values', [ + np.array([]), + pd.Series(np.array([])), + ]) + def test_is_not_extension_array_dtype(self, values): + assert not is_extension_array_dtype(values) + + def test_astype(): + arr = DummyArray(np.array([1, 2, 3])) expected = np.array([1, 2, 3], dtype=object) From 7b89f1b3dc80c23d02c8b57c9c5d94cd491082c8 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Fri, 9 Feb 2018 08:36:44 -0600 Subject: [PATCH 18/36] clean --- pandas/tests/extension_arrays/test_common.py | 65 -------------------- 1 file changed, 65 deletions(-) delete mode 100644 pandas/tests/extension_arrays/test_common.py diff --git a/pandas/tests/extension_arrays/test_common.py b/pandas/tests/extension_arrays/test_common.py deleted file mode 100644 index 1fc4526aff951..0000000000000 --- a/pandas/tests/extension_arrays/test_common.py +++ /dev/null @@ -1,65 +0,0 @@ -import numpy as np -import pytest - -import pandas as pd -import pandas.util.testing as tm -from pandas.core.arrays import ExtensionArray -from pandas.core.dtypes.common import is_extension_array_dtype -from pandas.core.dtypes.dtypes import ExtensionDtype - - -class DummyDtype(ExtensionDtype): - pass - - -class DummyArray(ExtensionArray): - - def 
__init__(self, data):
-        self.data = data
-
-    def __array__(self, dtype):
-        return self.data
-
-
-class TestExtensionArrayDtype(object):
-
-    @pytest.mark.parametrize('values', [
-        pd.Categorical([]),
-        pd.Categorical([]).dtype,
-        pd.Series(pd.Categorical([])),
-        DummyDtype(),
-        DummyArray(np.array([1, 2])),
-    ])
-    def test_is_extension_array_dtype(self, values):
-        assert is_extension_array_dtype(values)
-
-    @pytest.mark.parametrize('values', [
-        np.array([]),
-        pd.Series(np.array([])),
-    ])
-    def test_is_not_extension_array_dtype(self, values):
-        assert not is_extension_array_dtype(values)
-
-
-def test_astype():
-
-    arr = DummyArray(np.array([1, 2, 3]))
-    expected = np.array([1, 2, 3], dtype=object)
-
-    result = arr.astype(object)
-    tm.assert_numpy_array_equal(result, expected)
-
-    result = arr.astype('object')
-    tm.assert_numpy_array_equal(result, expected)
-
-
-def test_astype_raises():
-    arr = DummyArray(np.array([1, 2, 3]))
-
-    # type int for py2
-    # class int for py3
-    xpr = ("DummyArray can only be coerced to 'object' dtype, not "
-           "'<.* 'int'>'")
-
-    with tm.assert_raises_regex(ValueError, xpr):
-        arr.astype(int)

From b0dbffd72376d88bfc1dd8d4d89c890978686d4e Mon Sep 17 00:00:00 2001
From: Tom Augspurger
Date: Fri, 9 Feb 2018 09:34:39 -0600
Subject: [PATCH 19/36] cleanup

---
 pandas/core/indexes/base.py     | 10 +++---
 pandas/core/indexes/category.py |  2 +-
 pandas/core/indexes/multi.py    |  2 +-
 pandas/tests/test_base.py       | 56 ++++++++++++++++++++++++++++-----
 4 files changed, 56 insertions(+), 14 deletions(-)

diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py
index 9eb0ac1276280..d8b4a65a91ecc 100644
--- a/pandas/core/indexes/base.py
+++ b/pandas/core/indexes/base.py
@@ -613,14 +613,14 @@ def _values(self):
         index             | values          | _values     | _ndarray_values |
         ----------------- | -------------- -| ----------- | --------------- |
         CategoricalIndex  | Categorical     | Categorical | codes           |
-        DatetimeIndex[tz] | ndarray[M8ns]   | DTI[tz]     | datetime@UTC    |
+        DatetimeIndex[tz] | ndarray[M8ns]   | DTI[tz]     | ndarray[M8ns]   |
 
         In the near-future, we'll implement two more.
 
-        index             | values          | _values     | ndarray_values |
-        ----------------- | --------------- | ----------- | -------------- |
-        PeriodIndex       | ndarray[object] | PeriodArray
-        IntervalIndex     | IntervalArray   | ndarray[Interval]
+        index             | values          | _values     | _ndarray_values |
+        ----------------- | --------------- | ----------- | --------------- |
+        PeriodIndex       | ndarray[object] | PeriodArray | ndarray[int]    |
+        IntervalIndex     | ndarray[object] | IVArray     | ndarray[object] |
 
         See Also
         --------
diff --git a/pandas/core/indexes/category.py b/pandas/core/indexes/category.py
index 93ed2507cb829..166832cbe6bb1 100644
--- a/pandas/core/indexes/category.py
+++ b/pandas/core/indexes/category.py
@@ -797,7 +797,7 @@ def _evaluate_compare(self, other):
 
     def _delegate_method(self, name, *args, **kwargs):
         """ method delegation to the ._values """
-        method = getattr(self.values, name)
+        method = getattr(self._values, name)
         if 'inplace' in kwargs:
             raise ValueError("cannot use inplace with CategoricalIndex")
         res = method(*args, **kwargs)
diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py
index 1478012aa9dbe..a257a1ba26128 100644
--- a/pandas/core/indexes/multi.py
+++ b/pandas/core/indexes/multi.py
@@ -799,7 +799,7 @@ def values(self):
             box = hasattr(lev, '_box_values')
             # Try to minimize boxing.
if box and len(lev) > len(lab): - taken = lev._box_values(algos.take_1d(lev._values, + taken = lev._box_values(algos.take_1d(lev._ndarray_values, lab)) elif box: taken = algos.take_1d(lev._box_values(lev._ndarray_values), diff --git a/pandas/tests/test_base.py b/pandas/tests/test_base.py index 94449663b580b..66ec2d37c680e 100644 --- a/pandas/tests/test_base.py +++ b/pandas/tests/test_base.py @@ -1241,13 +1241,6 @@ def test_values_consistent(array, expected_type, dtype): assert r_values.dtype == dtype -def test_values_periodindex(): - arr = pd.period_range("2017", periods=4, freq='D') - result = arr._values - expected = np.array(arr.astype(object)) - tm.assert_numpy_array_equal(result, expected) - - @pytest.mark.parametrize('array, expected', [ (np.array([0, 1]), np.array([0, 1])), (np.array(['0', '1']), np.array(['0', '1'], dtype=object)), @@ -1267,3 +1260,52 @@ def test_ndarray_values(array, expected): r_values = pd.Index(array)._ndarray_values tm.assert_numpy_array_equal(l_values, r_values) tm.assert_numpy_array_equal(l_values, expected) + + +def test_values_multiindex_datetimesindex(): + # Test to ensure we hit the boxing / nobox part of MI.values + ints = np.arange(10**18, 10**18 + 5) + naive = pd.DatetimeIndex(ints) + aware = pd.DatetimeIndex(ints, tz='US/Central') + + idx = pd.MultiIndex.from_arrays([naive, aware]) + result = idx.values + + outer = pd.DatetimeIndex([x[0] for x in result]) + tm.assert_index_equal(outer, naive) + + inner = pd.DatetimeIndex([x[1] for x in result]) + tm.assert_index_equal(inner, aware) + + # n_lev > n_lab + result = idx[:2].values + + outer = pd.DatetimeIndex([x[0] for x in result]) + tm.assert_index_equal(outer, naive[:2]) + + inner = pd.DatetimeIndex([x[1] for x in result]) + tm.assert_index_equal(inner, aware[:2]) + + +def test_values_multiindex_datetimesindex(): + # Test to ensure we hit the boxing / nobox part of MI.values + ints = np.arange(2007, 2012) + pidx = pd.PeriodIndex(ints, freq='D') + + idx = pd.MultiIndex.from_arrays([ints, pidx]) + result = idx.values + + outer = pd.Int64Index([x[0] for x in result]) + tm.assert_index_equal(outer, pd.Int64Index(ints)) + + inner = pd.PeriodIndex([x[1] for x in result]) + tm.assert_index_equal(inner, pidx) + + # n_lev > n_lab + result = idx[:2].values + + outer = pd.Int64Index([x[0] for x in result]) + tm.assert_index_equal(outer, pd.Int64Index(ints[:2])) + + inner = pd.PeriodIndex([x[1] for x in result]) + tm.assert_index_equal(inner, pidx[:2]) From 66b936f00b72e3152df807e6e5913f1111084cef Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Fri, 9 Feb 2018 09:42:37 -0600 Subject: [PATCH 20/36] NumPy compat --- pandas/tests/test_base.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/test_base.py b/pandas/tests/test_base.py index 66ec2d37c680e..e649667e3dda1 100644 --- a/pandas/tests/test_base.py +++ b/pandas/tests/test_base.py @@ -1196,7 +1196,7 @@ def test_unique_datetime_index(arr, expected): @pytest.mark.parametrize('arr, expected', [ (pd.Series(pd.DatetimeIndex(['2017', '2017'])), - np.array(['2017'], dtype='M8[ns]')), + np.array(['2017-01-01T00:00:00'], dtype='M8[ns]')), (pd.Series(pd.DatetimeIndex(['2017', '2017'], tz='US/Eastern')), np.array([pd.Timestamp('2017', tz="US/Eastern")], dtype=object)), ]) From 32ee0eff6893bd02ed1469330054b0c37914306e Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Fri, 9 Feb 2018 10:10:15 -0600 Subject: [PATCH 21/36] Use base _values for CategoricalIndex --- pandas/core/indexes/category.py | 4 ---- 1 file changed, 4 deletions(-) diff 
--git a/pandas/core/indexes/category.py b/pandas/core/indexes/category.py
index 166832cbe6bb1..f03f8571121f0 100644
--- a/pandas/core/indexes/category.py
+++ b/pandas/core/indexes/category.py
@@ -304,10 +304,6 @@ def values(self):
         """ return the underlying data, which is a Categorical """
         return self._data
 
-    @property
-    def _values(self):
-        return self._data
-
     @property
     def _ndarray_values(self):
         return self._data.codes

From a9882e23defc47272f941932c4ce53af9b5ba0e6 Mon Sep 17 00:00:00 2001
From: Tom Augspurger
Date: Fri, 9 Feb 2018 10:10:34 -0600
Subject: [PATCH 22/36] Update dev docs

---
 doc/source/internals.rst | 24 ++++++++++++++----------
 1 file changed, 14 insertions(+), 10 deletions(-)

diff --git a/doc/source/internals.rst b/doc/source/internals.rst
index 29aaed318b802..957f82fd9eba7 100644
--- a/doc/source/internals.rst
+++ b/doc/source/internals.rst
@@ -92,16 +92,20 @@ if you compute the levels and labels yourself, please be careful.
 Values
 ~~~~~~
 
-Pandas extends NumPy's type system in a few places, so we have multiple notions of "values" floating around.
-For 1-D containers (``Index`` classes and ``Series``) we have the following convention:
-
-* ``cls._ndarray_values`` is *always* and ``ndarray``
-* ``cls._values`` refers is the "best possible" array. This could be an ``ndarray``, ``ExtensionArray``, or
-  in ``Index`` subclass (note: we're in the process of removing the index subclasses here so that it's
-  always an ``ndarray`` or ``ExtensionArray``).
-
-So, for example, ``Series[category]._values`` is a ``Categorical``, while ``Series[category]._ndarray_values`` is
-the underlying ndarray.
+Pandas extends NumPy's type system with custom types, like ``Categorical`` or
+datetimes with a timezone, so we have multiple notions of "values". For 1-D
+containers (``Index`` classes and ``Series``) we have the following convention:
+
+* ``cls._ndarray_values`` is *always* a NumPy ``ndarray``. Ideally,
+  ``_ndarray_values`` is cheap to compute. For example, for a ``Categorical``,
+  this returns the codes, not the array of objects.
+* ``cls._values`` refers to the "best possible" array. This could be an
+  ``ndarray``, ``ExtensionArray``, or an ``Index`` subclass (note: we're in the
+  process of removing the index subclasses here so that it's always an
+  ``ndarray`` or ``ExtensionArray``).
+
+So, for example, ``Series[category]._values`` is a ``Categorical``, while
+``Series[category]._ndarray_values`` is the underlying codes.
 
 
 
 .. 
_ref-subclassing-pandas: From 242562108b099b4e7a205541ee15b9272dcb5265 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Fri, 9 Feb 2018 10:59:22 -0600 Subject: [PATCH 23/36] cleanup --- pandas/core/dtypes/cast.py | 2 +- pandas/core/indexes/category.py | 13 +------------ pandas/core/indexes/multi.py | 8 +++----- pandas/core/indexes/period.py | 5 ----- 4 files changed, 5 insertions(+), 23 deletions(-) diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py index b2816343fc8eb..55919fb2bea0d 100644 --- a/pandas/core/dtypes/cast.py +++ b/pandas/core/dtypes/cast.py @@ -927,7 +927,7 @@ def try_timedelta(v): # will try first with a string & object conversion from pandas import to_timedelta try: - return to_timedelta(v)._values.reshape(shape) + return to_timedelta(v)._ndarray_values.reshape(shape) except Exception: return v.reshape(shape) diff --git a/pandas/core/indexes/category.py b/pandas/core/indexes/category.py index f03f8571121f0..5aa940499a368 100644 --- a/pandas/core/indexes/category.py +++ b/pandas/core/indexes/category.py @@ -1,6 +1,5 @@ import numpy as np from pandas._libs import index as libindex -from pandas._libs import join as libjoin from pandas import compat from pandas.compat.numpy import function as nv @@ -9,8 +8,6 @@ from pandas.core.dtypes.common import ( is_categorical_dtype, _ensure_platform_int, - _ensure_int32, - _ensure_int64, is_list_like, is_interval_dtype, is_scalar) @@ -217,14 +214,6 @@ def _shallow_copy(self, values=None, categories=None, ordered=None, values=values, categories=categories, ordered=ordered, **kwargs) - @cache_readonly - def _inner_indexer(self): - if self.codes.dtype.itemsize <= 4: - # int8, int16, int32 - return libjoin.inner_join_indexer_int32 - else: - return libjoin.inner_join_indexer_int64 - def _is_dtype_compat(self, other): """ *this is an internal non-public method* @@ -238,7 +227,7 @@ def _is_dtype_compat(self, other): """ if is_categorical_dtype(other): if isinstance(other, CategoricalIndex): - other = other.values + other = other._values if not other.is_dtype_equal(self): raise TypeError("categories must match existing categories " "when appending") diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py index a257a1ba26128..907bbb2e8762e 100644 --- a/pandas/core/indexes/multi.py +++ b/pandas/core/indexes/multi.py @@ -2507,6 +2507,7 @@ def get_locs(self, seq): MultiIndex.slice_locs : Get slice location given start label(s) and end label(s). 
""" + from .numeric import Int64Index # must be lexsorted to at least as many levels true_slices = [i for (i, s) in enumerate(com.is_true_slices(seq)) if s] @@ -2532,7 +2533,6 @@ def _convert_to_indexer(r): "that is not the same length as the " "index") r = r.nonzero()[0] - from .numeric import Int64Index return Int64Index(r) def _update_indexer(idxr, indexer=indexer): @@ -2569,7 +2569,6 @@ def _update_indexer(idxr, indexer=indexer): if indexers is not None: indexer = _update_indexer(indexers, indexer=indexer) else: - from .numeric import Int64Index # no matches we are done return Int64Index([])._ndarray_values @@ -2652,9 +2651,8 @@ def equals(self, other): for i in range(self.nlevels): slabels = self.labels[i] slabels = slabels[slabels != -1] - svalues = algos.take_nd( - np.asarray(self.levels[i]._values), - slabels, allow_fill=False) + svalues = algos.take_nd(np.asarray(self.levels[i]._values), + slabels, allow_fill=False) olabels = other.labels[i] olabels = olabels[olabels != -1] diff --git a/pandas/core/indexes/period.py b/pandas/core/indexes/period.py index c8b7d6063e378..e90d3827fe84e 100644 --- a/pandas/core/indexes/period.py +++ b/pandas/core/indexes/period.py @@ -418,11 +418,6 @@ def _int64index(self): def values(self): return self.astype(object).values - @property - def _values(self): - # TODO: return PeriodArray - return self.values - @property def _ndarray_values(self): # Ordinals From 170d0c7959a54276fff730b002195f46ec64de63 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Fri, 9 Feb 2018 13:33:49 -0600 Subject: [PATCH 24/36] Linting --- pandas/core/base.py | 3 +-- pandas/tests/test_base.py | 4 ++-- 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/pandas/core/base.py b/pandas/core/base.py index f3b0fb9dbe142..01dba132e00c5 100644 --- a/pandas/core/base.py +++ b/pandas/core/base.py @@ -7,8 +7,7 @@ import numpy as np from pandas.core.dtypes.missing import isna -from pandas.core.dtypes.generic import ( - ABCDataFrame, ABCSeries, ABCIndexClass, ABCDatetimeIndex) +from pandas.core.dtypes.generic import ABCDataFrame, ABCSeries, ABCIndexClass from pandas.core.dtypes.common import ( is_object_dtype, is_list_like, diff --git a/pandas/tests/test_base.py b/pandas/tests/test_base.py index e649667e3dda1..31fa278f906f5 100644 --- a/pandas/tests/test_base.py +++ b/pandas/tests/test_base.py @@ -1262,7 +1262,7 @@ def test_ndarray_values(array, expected): tm.assert_numpy_array_equal(l_values, expected) -def test_values_multiindex_datetimesindex(): +def test_values_multiindex_datetimeindex(): # Test to ensure we hit the boxing / nobox part of MI.values ints = np.arange(10**18, 10**18 + 5) naive = pd.DatetimeIndex(ints) @@ -1287,7 +1287,7 @@ def test_values_multiindex_datetimesindex(): tm.assert_index_equal(inner, aware[:2]) -def test_values_multiindex_datetimesindex(): +def test_values_multiindex_periodindex(): # Test to ensure we hit the boxing / nobox part of MI.values ints = np.arange(2007, 2012) pidx = pd.PeriodIndex(ints, freq='D') From 402620f3ca75d14dd203f809226ec528113ae54c Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Fri, 9 Feb 2018 13:35:24 -0600 Subject: [PATCH 25/36] Precision in tests --- pandas/tests/test_base.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/tests/test_base.py b/pandas/tests/test_base.py index 31fa278f906f5..ce1e3d492741d 100644 --- a/pandas/tests/test_base.py +++ b/pandas/tests/test_base.py @@ -1212,7 +1212,7 @@ def test_unique_datetime_series(arr, expected): @pytest.mark.parametrize('array, expected_type, 
dtype', [ - (np.array([0, 1]), np.ndarray, 'int64'), + (np.array([0, 1], dtype=np.int64), np.ndarray, 'int64'), (np.array(['a', 'b']), np.ndarray, 'object'), (pd.Categorical(['a', 'b']), pd.Categorical, 'category'), (pd.DatetimeIndex(['2017', '2018']), np.ndarray, 'datetime64[ns]'), @@ -1242,7 +1242,7 @@ def test_values_consistent(array, expected_type, dtype): @pytest.mark.parametrize('array, expected', [ - (np.array([0, 1]), np.array([0, 1])), + (np.array([0, 1], dtype=np.int64), np.array([0, 1], dtype=np.int64)), (np.array(['0', '1']), np.array(['0', '1'], dtype=object)), (pd.Categorical(['a', 'a']), np.array([0, 0], dtype='int8')), (pd.DatetimeIndex(['2017-01-01T00:00:00']), From 815d202e96e910a64a292f6815737447ffdc1847 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Sun, 11 Feb 2018 14:13:50 -0600 Subject: [PATCH 26/36] Push _ndarray_values to ExtensionArray Now IndexOpsMixin._ndarray_values will dispatch all the way down to the EA. Subclasses like Categorical can override it as they see fit. --- pandas/core/arrays/base.py | 12 ++++++++++++ pandas/core/arrays/categorical.py | 4 ++++ pandas/core/base.py | 15 ++++++--------- pandas/core/dtypes/common.py | 2 +- pandas/core/indexes/category.py | 4 ---- 5 files changed, 23 insertions(+), 14 deletions(-) diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py index 553e1e0ac2066..e618dc6b69b2d 100644 --- a/pandas/core/arrays/base.py +++ b/pandas/core/arrays/base.py @@ -266,3 +266,15 @@ def _can_hold_na(self): Setting this to false will optimize some operations like fillna. """ return True + + @property + def _ndarray_values(self): + # type: () -> np.ndarray + """Internal pandas method for lossy conversion to a NumPy ndarray. + + This method is not part of the pandas interface. + + The expectation is that this is cheap to compute, and is primarily + used for interacting with our indexers. + """ + return np.array(self) diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index 62c6a6b16cbe9..8d2cf9d2b2f92 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -410,6 +410,10 @@ def dtype(self): """The :class:`~pandas.api.types.CategoricalDtype` for this instance""" return self._dtype + @property + def _ndarray_values(self): + return self.codes + @property def _constructor(self): return Categorical diff --git a/pandas/core/base.py b/pandas/core/base.py index 01dba132e00c5..0e70e3eb64fcb 100644 --- a/pandas/core/base.py +++ b/pandas/core/base.py @@ -14,7 +14,8 @@ is_scalar, is_datetimelike, is_categorical_dtype, - is_extension_type) + is_extension_type, + is_extension_array_dtype) from pandas.util._validators import validate_bool_kwarg @@ -772,18 +773,14 @@ def base(self): def _ndarray_values(self): """The data as an ndarray, possibly losing information. - The expectation is that this is cheap to compute. + The expectation is that this is cheap to compute, and is primarily + used for interacting with our indexers. - categorical -> codes - - See '_values' for more. 
""" # type: () -> np.ndarray - from pandas.core.dtypes.common import is_categorical_dtype - - if is_categorical_dtype(self): - return self._values.codes - + if is_extension_array_dtype(self): + return self.values._ndarray_values return self.values @property diff --git a/pandas/core/dtypes/common.py b/pandas/core/dtypes/common.py index c66e7fcfc6978..c2b71bc316fe8 100644 --- a/pandas/core/dtypes/common.py +++ b/pandas/core/dtypes/common.py @@ -1709,7 +1709,7 @@ def is_extension_array_dtype(arr_or_dtype): from pandas.core.arrays import ExtensionArray # we want to unpack series, anything else? - if isinstance(arr_or_dtype, ABCSeries): + if isinstance(arr_or_dtype, (ABCIndexClass, ABCSeries)): arr_or_dtype = arr_or_dtype._values return isinstance(arr_or_dtype, (ExtensionDtype, ExtensionArray)) diff --git a/pandas/core/indexes/category.py b/pandas/core/indexes/category.py index 5aa940499a368..d71b7ea774f52 100644 --- a/pandas/core/indexes/category.py +++ b/pandas/core/indexes/category.py @@ -293,10 +293,6 @@ def values(self): """ return the underlying data, which is a Categorical """ return self._data - @property - def _ndarray_values(self): - return self._data.codes - @property def itemsize(self): return self.values.itemsize From a727b217f42e959f9ebb355e911f3ec641db0b49 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Sun, 11 Feb 2018 14:27:46 -0600 Subject: [PATCH 27/36] Clean up tolist --- pandas/core/base.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/pandas/core/base.py b/pandas/core/base.py index 0e70e3eb64fcb..0b4c03d6b4b25 100644 --- a/pandas/core/base.py +++ b/pandas/core/base.py @@ -830,10 +830,8 @@ def tolist(self): if is_datetimelike(self): return [com._maybe_box_datetimelike(x) for x in self._values] - elif is_categorical_dtype(self): - return self.values.tolist() else: - return self._ndarray_values.tolist() + return self._values.tolist() def __iter__(self): """ From f368c29d6a45832f95181a8a6e8b7411d87763c7 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Sun, 11 Feb 2018 14:33:46 -0600 Subject: [PATCH 28/36] Move test locations --- .../tests/indexes/datetimes/test_datetime.py | 15 ++++ pandas/tests/indexes/test_multi.py | 48 +++++++++++ pandas/tests/test_base.py | 82 ------------------- 3 files changed, 63 insertions(+), 82 deletions(-) diff --git a/pandas/tests/indexes/datetimes/test_datetime.py b/pandas/tests/indexes/datetimes/test_datetime.py index a75ace2933b71..e9176e749564e 100644 --- a/pandas/tests/indexes/datetimes/test_datetime.py +++ b/pandas/tests/indexes/datetimes/test_datetime.py @@ -469,3 +469,18 @@ def test_factorize_dst(self): arr, res = obj.factorize() tm.assert_numpy_array_equal(arr, np.arange(12, dtype=np.intp)) tm.assert_index_equal(res, idx) + + @pytest.mark.parametrize('arr, expected', [ + (pd.DatetimeIndex(['2017', '2017']), pd.DatetimeIndex(['2017'])), + (pd.DatetimeIndex(['2017', '2017'], tz='US/Eastern'), + pd.DatetimeIndex(['2017'], tz='US/Eastern')), + ]) + def test_unique(self, arr, expected): + result = arr.unique() + + if isinstance(expected, np.ndarray): + tm.assert_numpy_array_equal(result, expected) + if isinstance(expected, pd.Series): + tm.assert_series_equal(result, expected) + if isinstance(expected, pd.DatetimeIndex): + tm.assert_index_equal(result, expected) diff --git a/pandas/tests/indexes/test_multi.py b/pandas/tests/indexes/test_multi.py index e59456b8a2d5e..97370b279245c 100644 --- a/pandas/tests/indexes/test_multi.py +++ b/pandas/tests/indexes/test_multi.py @@ -962,6 +962,54 @@ def 
test_values_boxed(self): # Check that code branches for boxed values produce identical results tm.assert_numpy_array_equal(result.values[:4], result[:4].values) + def test_values_multiindex_datetimeindex(self): + # Test to ensure we hit the boxing / nobox part of MI.values + ints = np.arange(10**18, 10**18 + 5) + naive = pd.DatetimeIndex(ints) + aware = pd.DatetimeIndex(ints, tz='US/Central') + + idx = pd.MultiIndex.from_arrays([naive, aware]) + result = idx.values + + outer = pd.DatetimeIndex([x[0] for x in result]) + tm.assert_index_equal(outer, naive) + + inner = pd.DatetimeIndex([x[1] for x in result]) + tm.assert_index_equal(inner, aware) + + # n_lev > n_lab + result = idx[:2].values + + outer = pd.DatetimeIndex([x[0] for x in result]) + tm.assert_index_equal(outer, naive[:2]) + + inner = pd.DatetimeIndex([x[1] for x in result]) + tm.assert_index_equal(inner, aware[:2]) + + + def test_values_multiindex_periodindex(): + # Test to ensure we hit the boxing / nobox part of MI.values + ints = np.arange(2007, 2012) + pidx = pd.PeriodIndex(ints, freq='D') + + idx = pd.MultiIndex.from_arrays([ints, pidx]) + result = idx.values + + outer = pd.Int64Index([x[0] for x in result]) + tm.assert_index_equal(outer, pd.Int64Index(ints)) + + inner = pd.PeriodIndex([x[1] for x in result]) + tm.assert_index_equal(inner, pidx) + + # n_lev > n_lab + result = idx[:2].values + + outer = pd.Int64Index([x[0] for x in result]) + tm.assert_index_equal(outer, pd.Int64Index(ints[:2])) + + inner = pd.PeriodIndex([x[1] for x in result]) + tm.assert_index_equal(inner, pidx[:2]) + def test_append(self): result = self.index[:3].append(self.index[3:]) assert result.equals(self.index) diff --git a/pandas/tests/test_base.py b/pandas/tests/test_base.py index ce1e3d492741d..4b5ad336139b0 100644 --- a/pandas/tests/test_base.py +++ b/pandas/tests/test_base.py @@ -1178,39 +1178,6 @@ def test_iter_box(self): assert res == exp -@pytest.mark.parametrize('arr, expected', [ - (pd.DatetimeIndex(['2017', '2017']), pd.DatetimeIndex(['2017'])), - (pd.DatetimeIndex(['2017', '2017'], tz='US/Eastern'), - pd.DatetimeIndex(['2017'], tz='US/Eastern')), -]) -def test_unique_datetime_index(arr, expected): - result = arr.unique() - - if isinstance(expected, np.ndarray): - tm.assert_numpy_array_equal(result, expected) - if isinstance(expected, pd.Series): - tm.assert_series_equal(result, expected) - if isinstance(expected, pd.DatetimeIndex): - tm.assert_index_equal(result, expected) - - -@pytest.mark.parametrize('arr, expected', [ - (pd.Series(pd.DatetimeIndex(['2017', '2017'])), - np.array(['2017-01-01T00:00:00'], dtype='M8[ns]')), - (pd.Series(pd.DatetimeIndex(['2017', '2017'], tz='US/Eastern')), - np.array([pd.Timestamp('2017', tz="US/Eastern")], dtype=object)), -]) -def test_unique_datetime_series(arr, expected): - result = arr.unique() - - if isinstance(expected, np.ndarray): - tm.assert_numpy_array_equal(result, expected) - if isinstance(expected, pd.Series): - tm.assert_series_equal(result, expected) - if isinstance(expected, pd.DatetimeIndex): - tm.assert_index_equal(result, expected) - - @pytest.mark.parametrize('array, expected_type, dtype', [ (np.array([0, 1], dtype=np.int64), np.ndarray, 'int64'), (np.array(['a', 'b']), np.ndarray, 'object'), @@ -1260,52 +1227,3 @@ def test_ndarray_values(array, expected): r_values = pd.Index(array)._ndarray_values tm.assert_numpy_array_equal(l_values, r_values) tm.assert_numpy_array_equal(l_values, expected) - - -def test_values_multiindex_datetimeindex(): - # Test to ensure we hit the boxing / nobox 
part of MI.values - ints = np.arange(10**18, 10**18 + 5) - naive = pd.DatetimeIndex(ints) - aware = pd.DatetimeIndex(ints, tz='US/Central') - - idx = pd.MultiIndex.from_arrays([naive, aware]) - result = idx.values - - outer = pd.DatetimeIndex([x[0] for x in result]) - tm.assert_index_equal(outer, naive) - - inner = pd.DatetimeIndex([x[1] for x in result]) - tm.assert_index_equal(inner, aware) - - # n_lev > n_lab - result = idx[:2].values - - outer = pd.DatetimeIndex([x[0] for x in result]) - tm.assert_index_equal(outer, naive[:2]) - - inner = pd.DatetimeIndex([x[1] for x in result]) - tm.assert_index_equal(inner, aware[:2]) - - -def test_values_multiindex_periodindex(): - # Test to ensure we hit the boxing / nobox part of MI.values - ints = np.arange(2007, 2012) - pidx = pd.PeriodIndex(ints, freq='D') - - idx = pd.MultiIndex.from_arrays([ints, pidx]) - result = idx.values - - outer = pd.Int64Index([x[0] for x in result]) - tm.assert_index_equal(outer, pd.Int64Index(ints)) - - inner = pd.PeriodIndex([x[1] for x in result]) - tm.assert_index_equal(inner, pidx) - - # n_lev > n_lab - result = idx[:2].values - - outer = pd.Int64Index([x[0] for x in result]) - tm.assert_index_equal(outer, pd.Int64Index(ints[:2])) - - inner = pd.PeriodIndex([x[1] for x in result]) - tm.assert_index_equal(inner, pidx[:2]) From d74c5c96040882378e3598e0df27e59aff57de51 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Mon, 12 Feb 2018 06:33:05 -0600 Subject: [PATCH 29/36] Fixed test --- pandas/tests/indexes/test_multi.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/pandas/tests/indexes/test_multi.py b/pandas/tests/indexes/test_multi.py index 97370b279245c..cd6a5c761d0c2 100644 --- a/pandas/tests/indexes/test_multi.py +++ b/pandas/tests/indexes/test_multi.py @@ -986,8 +986,7 @@ def test_values_multiindex_datetimeindex(self): inner = pd.DatetimeIndex([x[1] for x in result]) tm.assert_index_equal(inner, aware[:2]) - - def test_values_multiindex_periodindex(): + def test_values_multiindex_periodindex(self): # Test to ensure we hit the boxing / nobox part of MI.values ints = np.arange(2007, 2012) pidx = pd.PeriodIndex(ints, freq='D') From 8104ee5d8a887454fec6869eb1f4e63fe74d72e6 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Mon, 12 Feb 2018 08:40:56 -0600 Subject: [PATCH 30/36] REF: Update per comments --- pandas/core/base.py | 2 +- pandas/core/dtypes/concat.py | 2 +- pandas/core/indexes/category.py | 6 +----- pandas/core/indexes/multi.py | 2 +- pandas/io/formats/format.py | 2 +- pandas/tests/indexes/datetimes/test_datetime.py | 8 +------- 6 files changed, 6 insertions(+), 16 deletions(-) diff --git a/pandas/core/base.py b/pandas/core/base.py index 0b4c03d6b4b25..8081e20faaeb3 100644 --- a/pandas/core/base.py +++ b/pandas/core/base.py @@ -745,7 +745,7 @@ def itemsize(self): @property def nbytes(self): """ return the number of bytes in the underlying data """ - return self._ndarray_values.nbytes + return self.values.nbytes @property def strides(self): diff --git a/pandas/core/dtypes/concat.py b/pandas/core/dtypes/concat.py index b36dc03bbc82b..d306d0d78f1f4 100644 --- a/pandas/core/dtypes/concat.py +++ b/pandas/core/dtypes/concat.py @@ -480,7 +480,7 @@ def _concat_datetimetz(to_concat, name=None): def _concat_index_same_dtype(indexes, klass=None): klass = klass if klass is not None else indexes[0].__class__ - return klass(np.concatenate([x._ndarray_values for x in indexes])) + return klass(np.concatenate([x._values for x in indexes])) def _concat_index_asobject(to_concat, name=None): diff 
--git a/pandas/core/indexes/category.py b/pandas/core/indexes/category.py index d71b7ea774f52..7d4a864b465e8 100644 --- a/pandas/core/indexes/category.py +++ b/pandas/core/indexes/category.py @@ -295,13 +295,9 @@ def values(self): @property def itemsize(self): + # Size of the items in categories, not codes. return self.values.itemsize - @property - def nbytes(self): - """ return the number of bytes in the underlying data """ - return self.values.nbytes - def get_values(self): """ return the underlying data as an ndarray """ return self._data.get_values() diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py index 907bbb2e8762e..94dbd8b884e47 100644 --- a/pandas/core/indexes/multi.py +++ b/pandas/core/indexes/multi.py @@ -1319,7 +1319,7 @@ def from_tuples(cls, tuples, sortorder=None, names=None): arrays = [[]] * len(names) elif isinstance(tuples, (np.ndarray, Index)): if isinstance(tuples, Index): - tuples = tuples._ndarray_values + tuples = tuples._values arrays = list(lib.tuples_to_object_array(tuples).T) elif isinstance(tuples, list): diff --git a/pandas/io/formats/format.py b/pandas/io/formats/format.py index d590499faa65e..621641747f376 100644 --- a/pandas/io/formats/format.py +++ b/pandas/io/formats/format.py @@ -1897,7 +1897,7 @@ def _format(x): vals = self.values if isinstance(vals, Index): - vals = vals._ndarray_values + vals = vals._values elif isinstance(vals, ABCSparseArray): vals = vals.values diff --git a/pandas/tests/indexes/datetimes/test_datetime.py b/pandas/tests/indexes/datetimes/test_datetime.py index e9176e749564e..05678b0c8dd45 100644 --- a/pandas/tests/indexes/datetimes/test_datetime.py +++ b/pandas/tests/indexes/datetimes/test_datetime.py @@ -477,10 +477,4 @@ def test_factorize_dst(self): ]) def test_unique(self, arr, expected): result = arr.unique() - - if isinstance(expected, np.ndarray): - tm.assert_numpy_array_equal(result, expected) - if isinstance(expected, pd.Series): - tm.assert_series_equal(result, expected) - if isinstance(expected, pd.DatetimeIndex): - tm.assert_index_equal(result, expected) + tm.assert_index_equal(result, expected) From f8e29b918f7b4cc306ff7b18efa549e17aedbbe9 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Mon, 12 Feb 2018 09:53:55 -0600 Subject: [PATCH 31/36] lint --- pandas/core/base.py | 1 - 1 file changed, 1 deletion(-) diff --git a/pandas/core/base.py b/pandas/core/base.py index 8081e20faaeb3..cf48b419b7df1 100644 --- a/pandas/core/base.py +++ b/pandas/core/base.py @@ -13,7 +13,6 @@ is_list_like, is_scalar, is_datetimelike, - is_categorical_dtype, is_extension_type, is_extension_array_dtype) From 0cd9faa5b42df01c96a8dddb7f7a73cea32d0a91 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Mon, 12 Feb 2018 13:04:50 -0600 Subject: [PATCH 32/36] REF: Use _values for size and shape --- pandas/core/base.py | 4 ++-- pandas/core/indexes/datetimes.py | 14 ++++++++++++++ 2 files changed, 16 insertions(+), 2 deletions(-) diff --git a/pandas/core/base.py b/pandas/core/base.py index cf48b419b7df1..f6f1ba982e1d9 100644 --- a/pandas/core/base.py +++ b/pandas/core/base.py @@ -711,7 +711,7 @@ def transpose(self, *args, **kwargs): @property def shape(self): """ return a tuple of the shape of the underlying data """ - return self._ndarray_values.shape + return self._values.shape @property def ndim(self): @@ -754,7 +754,7 @@ def strides(self): @property def size(self): """ return the number of elements in the underlying data """ - return self._ndarray_values.size + return self._values.size @property def flags(self): diff --git 
a/pandas/core/indexes/datetimes.py b/pandas/core/indexes/datetimes.py index 22ce690b3d420..689610af7603f 100644 --- a/pandas/core/indexes/datetimes.py +++ b/pandas/core/indexes/datetimes.py @@ -694,6 +694,20 @@ def tzinfo(self): """ return self.tz + @property + def size(self): + # TODO: Remove this when we have a DatetimeTZArray + # Necessary to avoid recursion error since DTI._values is a DTI + # for TZ-aware + return self._ndarray_values.size + + @property + def shape(self): + # TODO: Remove this when we have a DatetimeTZArray + # Necessary to avoid recursion error since DTI._values is a DTI + # for TZ-aware + return self._ndarray_values.shape + @cache_readonly def _timezone(self): """ Comparable timezone both for pytz / dateutil""" From 8fcdb7040345e1d0017367695354d9c858c71e09 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Mon, 12 Feb 2018 13:09:13 -0600 Subject: [PATCH 33/36] PERF: Implement size, shape for IntervalIndex --- pandas/core/indexes/interval.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/pandas/core/indexes/interval.py b/pandas/core/indexes/interval.py index 3bf783b5a2faa..d431ea1e51e31 100644 --- a/pandas/core/indexes/interval.py +++ b/pandas/core/indexes/interval.py @@ -680,6 +680,16 @@ def length(self): 'e.g. Intervals with string endpoints') raise TypeError(msg) + @property + def size(self): + # Avoid materializing self.values + return self.left.size + + @property + def shape(self): + # Avoid materializing self.values + return self.left.shape + def __len__(self): return len(self.left) From 34a6a22e2255eb11e5c6b6c5478350fb84ce656e Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Mon, 12 Feb 2018 13:11:00 -0600 Subject: [PATCH 34/36] PERF: Avoid materializing values for PeriodIndex shape, size --- pandas/core/indexes/period.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/pandas/core/indexes/period.py b/pandas/core/indexes/period.py index e90d3827fe84e..8f2d7d382a16e 100644 --- a/pandas/core/indexes/period.py +++ b/pandas/core/indexes/period.py @@ -477,6 +477,16 @@ def _to_embed(self, keep_tz=False, dtype=None): return self.astype(object).values + @property + def size(self): + # Avoid materializing self._values + return self._ndarray_values.size + + @property + def shape(self): + # Avoid materializing self._values + return self._ndarray_values.shape + @property def _formatter_func(self): return lambda x: "'%s'" % x From d6e8051d1ebab7cf99bd7ac23eea348d0e3a0d4c Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Mon, 12 Feb 2018 20:55:19 -0600 Subject: [PATCH 35/36] Cleanup --- pandas/core/base.py | 3 +-- pandas/core/indexes/base.py | 6 +++--- 2 files changed, 4 insertions(+), 5 deletions(-) diff --git a/pandas/core/base.py b/pandas/core/base.py index f6f1ba982e1d9..0ca029ffd4c25 100644 --- a/pandas/core/base.py +++ b/pandas/core/base.py @@ -744,7 +744,7 @@ def itemsize(self): @property def nbytes(self): """ return the number of bytes in the underlying data """ - return self.values.nbytes + return self._values.nbytes @property def strides(self): @@ -988,7 +988,6 @@ def value_counts(self, normalize=False, sort=True, ascending=False, def unique(self): values = self._values - # TODO: Make unique part of the ExtensionArray interface. 
if hasattr(values, 'unique'): result = values.unique() diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index a09a4c59a819a..be7c1624936bf 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -599,7 +599,7 @@ def values(self): @property def _values(self): # type: () -> Union[ExtensionArray, Index] - # TODO: remove index types as they become is extension arrays + # TODO(EA): remove index types as they become extension arrays """The best array representation. This is an ndarray, ExtensionArray, or Index subclass. This differs @@ -2264,7 +2264,7 @@ def union(self, other): other = other.astype('O') return this.union(other) - # TODO: setops-refactor, clean all this up + # TODO(EA): setops-refactor, clean all this up if is_period_dtype(self) or is_datetime64tz_dtype(self): lvals = self._ndarray_values else: @@ -2357,7 +2357,7 @@ def intersection(self, other): other = other.astype('O') return this.intersection(other) - # TODO: setops-refactor, clean all this up + # TODO(EA): setops-refactor, clean all this up if is_period_dtype(self): lvals = self._ndarray_values else: From 3af8a21ea0e13ba5fc73db464f6e327552c71b0e Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Tue, 13 Feb 2018 05:54:27 -0600 Subject: [PATCH 36/36] Override nbytes --- pandas/core/indexes/datetimes.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/pandas/core/indexes/datetimes.py b/pandas/core/indexes/datetimes.py index 689610af7603f..cc9ce1f3fd5eb 100644 --- a/pandas/core/indexes/datetimes.py +++ b/pandas/core/indexes/datetimes.py @@ -708,6 +708,13 @@ def shape(self): # for TZ-aware return self._ndarray_values.shape + @property + def nbytes(self): + # TODO: Remove this when we have a DatetimeTZArray + # Necessary to avoid recursion error since DTI._values is a DTI + # for TZ-aware + return self._ndarray_values.nbytes + @cache_readonly def _timezone(self): """ Comparable timezone both for pytz / dateutil"""
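
A short illustration of the convention this series settles on, for readers following along. This snippet is not part of the patches; it is a sketch against a pandas build from this era, poking at private attributes (``_values`` / ``_ndarray_values``) that are internal and subject to change.

    import pandas as pd

    cat = pd.Series(['a', 'b', 'a'], dtype='category')
    cat.values            # public values: a Categorical
    cat._values           # the same Categorical, the "best possible" array
    cat._ndarray_values   # plain ndarray of integer codes, e.g. array([0, 1, 0], dtype=int8)

    dti = pd.DatetimeIndex(['2017', '2018'], tz='US/Central')
    dti._values           # the tz-aware DatetimeIndex itself (no DatetimeTZArray yet)
    dti._ndarray_values   # plain datetime64[ns] ndarray (UTC-based)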