REF: Internal / External values (#19558)
* REF/Clean: Internal / External values

* Move to index base

* Cleanup unique handling

* Simplify object concat

* Use values for intersection

I think eventually we'll want to use ndarray_values for this, but it'll
require a bit more work to support. Currently, using ndarray_values
causes occasional failures on categorical.

* hmm

* Additional testing

* More tests

* ndarray_values

* API: Default ExtensionArray.astype

(cherry picked from commit 943a915562b72bed147c857de927afa0daf31c1a)
(cherry picked from commit fbf0a06)

* Simplify concat_as_object

* Py2 compat

(cherry picked from commit b20e12c)

* Set-ops ugliness

* better docstrings

* tolist

* linting

* Moved dtypes

(cherry picked from commit d136227)

* clean

* cleanup

* NumPy compat

* Use base _values for CategoricalIndex

* Update dev docs

* cleanup

* Linting

* Precision in tests

* Push _ndarray_values to ExtensionArray

Now IndexOpsMixin._ndarray_values will dispatch all the way down to the EA.
Subclasses like Categorical can override it as they see fit.

* Clean up tolist

* Move test locations

* Fixed test

* REF: Update per comments

* lint

* REF: Use _values for size and shape

* PERF: Implement size, shape for IntervalIndex

* PERF: Avoid materializing values for PeriodIndex shape, size

* Cleanup

* Override nbytes
TomAugspurger committed Feb 13, 2018
1 parent d9551c8 commit df38f66
Showing 25 changed files with 386 additions and 85 deletions.
19 changes: 19 additions & 0 deletions doc/source/internals.rst
@@ -89,6 +89,25 @@ not check (or care) whether the levels themselves are sorted. Fortunately, the
constructors ``from_tuples`` and ``from_arrays`` ensure that this is true, but
if you compute the levels and labels yourself, please be careful.

Values
~~~~~~

Pandas extends NumPy's type system with custom types, like ``Categorical`` or
datetimes with a timezone, so we have multiple notions of "values". For 1-D
containers (``Index`` classes and ``Series``) we have the following convention:

* ``cls._ndarray_values`` is *always* a NumPy ``ndarray``. Ideally,
``_ndarray_values`` is cheap to compute. For example, for a ``Categorical``,
this returns the codes, not the array of objects.
* ``cls._values`` is the "best possible" array. This could be an
``ndarray``, an ``ExtensionArray``, or an ``Index`` subclass (note: we're in
the process of removing the index subclasses here so that it's always an
``ndarray`` or ``ExtensionArray``).

So, for example, ``Series[category]._values`` is a ``Categorical``, while
``Series[category]._ndarray_values`` is the underlying codes.


.. _ref-subclassing-pandas:

Subclassing pandas Data Structures
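For illustration, a minimal sketch of the convention described above (``_values`` and ``_ndarray_values`` are private, so this is for orientation only, not a supported API):

    import pandas as pd

    ser = pd.Series(pd.Categorical(['a', 'b', 'a'], categories=['a', 'b']))

    # _values is the "best possible" array: the Categorical itself.
    print(type(ser._values))    # Categorical

    # _ndarray_values is always a NumPy ndarray; for categoricals these
    # are the integer codes, which are cheap to obtain.
    print(ser._ndarray_values)  # [0 1 0] (int8)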
12 changes: 12 additions & 0 deletions pandas/core/arrays/base.py
@@ -266,3 +266,15 @@ def _can_hold_na(self):
Setting this to false will optimize some operations like fillna.
"""
return True

@property
def _ndarray_values(self):
# type: () -> np.ndarray
"""Internal pandas method for lossy conversion to a NumPy ndarray.
This method is not part of the pandas interface.
The expectation is that this is cheap to compute, and is primarily
used for interacting with our indexers.
"""
return np.array(self)
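The default above goes through ``np.array(self)``, which may be slow and may coerce to object dtype, so subclasses with a cheaper representation are expected to override it. A rough sketch of the contract, using a hypothetical ``ToyArray`` stand-in (not part of pandas):

    import numpy as np

    class ToyArray(object):
        # Hypothetical stand-in for an ExtensionArray subclass.
        def __init__(self, data):
            self._data = list(data)

        def __len__(self):
            return len(self._data)

        @property
        def _ndarray_values(self):
            # Mirrors the new default: a plain (possibly lossy) NumPy
            # conversion of the array's contents.
            return np.array(self._data)

    print(ToyArray([1, 2, 3])._ndarray_values)  # array([1, 2, 3])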
4 changes: 4 additions & 0 deletions pandas/core/arrays/categorical.py
@@ -410,6 +410,10 @@ def dtype(self):
"""The :class:`~pandas.api.types.CategoricalDtype` for this instance"""
return self._dtype

@property
def _ndarray_values(self):
return self.codes

@property
def _constructor(self):
return Categorical
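The override makes ``_ndarray_values`` cheap for categoricals: it is the same integer-code ndarray exposed publicly as ``codes``. A quick check (assuming a pandas build that includes this commit):

    import pandas as pd

    cat = pd.Categorical(['a', 'b', 'a'], categories=['a', 'b'])
    # _ndarray_values returns these codes rather than the object values:
    print(cat.codes)  # array([0, 1, 0], dtype=int8)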
21 changes: 16 additions & 5 deletions pandas/core/base.py
@@ -13,7 +13,8 @@
is_list_like,
is_scalar,
is_datetimelike,
is_extension_type)
is_extension_type,
is_extension_array_dtype)

from pandas.util._validators import validate_bool_kwarg

@@ -738,7 +739,7 @@ def data(self):
@property
def itemsize(self):
""" return the size of the dtype of the item of the underlying data """
return self._values.itemsize
return self._ndarray_values.itemsize

@property
def nbytes(self):
@@ -748,7 +749,7 @@ def nbytes(self):
@property
def strides(self):
""" return the strides of the underlying data """
return self._values.strides
return self._ndarray_values.strides

@property
def size(self):
@@ -768,8 +769,17 @@ def base(self):
return self.values.base

@property
def _values(self):
""" the internal implementation """
def _ndarray_values(self):
# type: () -> np.ndarray
"""The data as an ndarray, possibly losing information.

The expectation is that this is cheap to compute, and is primarily
used for interacting with our indexers.

- categorical -> codes
"""
if is_extension_array_dtype(self):
return self.values._ndarray_values
return self.values

@property
@@ -979,6 +989,7 @@ def unique(self):
values = self._values

if hasattr(values, 'unique'):

result = values.unique()
else:
from pandas.core.algorithms import unique1d
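A note on why ``itemsize`` and ``strides`` now route through ``_ndarray_values``: for extension-backed containers, ``_values`` is not an ndarray and lacks those attributes, while the codes are a real ndarray. An illustrative sketch (private attributes, behavior as of this commit):

    import pandas as pd

    ser = pd.Series(pd.Categorical(['a', 'b', 'a']))

    # The int8 codes are a real ndarray, so ndarray-only attributes work:
    print(ser._ndarray_values.itemsize)      # 1

    # The Categorical itself has no itemsize, which is why these
    # properties can no longer go through _values:
    print(hasattr(ser._values, 'itemsize'))  # False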
2 changes: 1 addition & 1 deletion pandas/core/dtypes/cast.py
@@ -927,7 +927,7 @@ def try_timedelta(v):
# will try first with a string & object conversion
from pandas import to_timedelta
try:
return to_timedelta(v)._values.reshape(shape)
return to_timedelta(v)._ndarray_values.reshape(shape)
except Exception:
return v.reshape(shape)

2 changes: 1 addition & 1 deletion pandas/core/dtypes/common.py
@@ -1709,7 +1709,7 @@ def is_extension_array_dtype(arr_or_dtype):
from pandas.core.arrays import ExtensionArray

# we want to unpack series, anything else?
if isinstance(arr_or_dtype, ABCSeries):
if isinstance(arr_or_dtype, (ABCIndexClass, ABCSeries)):
arr_or_dtype = arr_or_dtype._values
return isinstance(arr_or_dtype, (ExtensionDtype, ExtensionArray))

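With ``ABCIndexClass`` added alongside ``ABCSeries``, extension-backed indexes are now unwrapped as well. A quick sketch (importing from the internal module this diff touches):

    import pandas as pd
    from pandas.core.dtypes.common import is_extension_array_dtype

    print(is_extension_array_dtype(pd.Series(pd.Categorical(['a']))))  # True
    # New in this change: index objects are unwrapped too.
    print(is_extension_array_dtype(pd.CategoricalIndex(['a'])))        # True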
6 changes: 4 additions & 2 deletions pandas/core/dtypes/concat.py
@@ -488,12 +488,14 @@ def _concat_index_asobject(to_concat, name=None):
concat all inputs as object. DatetimeIndex, TimedeltaIndex and
PeriodIndex are converted to object dtype before concatenation
"""
from pandas import Index
from pandas.core.arrays import ExtensionArray

klasses = ABCDatetimeIndex, ABCTimedeltaIndex, ABCPeriodIndex
klasses = (ABCDatetimeIndex, ABCTimedeltaIndex, ABCPeriodIndex,
ExtensionArray)
to_concat = [x.astype(object) if isinstance(x, klasses) else x
for x in to_concat]

from pandas import Index
self = to_concat[0]
attribs = self._get_attributes_dict()
attribs['name'] = name
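The practical effect of adding ``ExtensionArray`` to ``klasses`` is that extension-backed values, like the datetime-like indexes before them, are cast to object dtype before concatenation. A sketch of the user-visible behavior (mixed index types have long fallen back to object):

    import pandas as pd

    idx = pd.CategoricalIndex(['a', 'b']).append(pd.Index([1, 2]))
    print(idx.dtype)  # object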
108 changes: 83 additions & 25 deletions pandas/core/indexes/base.py
@@ -31,12 +31,14 @@
is_object_dtype,
is_categorical_dtype,
is_interval_dtype,
is_period_dtype,
is_bool,
is_bool_dtype,
is_signed_integer_dtype,
is_unsigned_integer_dtype,
is_integer_dtype, is_float_dtype,
is_datetime64_any_dtype,
is_datetime64tz_dtype,
is_timedelta64_dtype,
needs_i8_conversion,
is_iterator, is_list_like,
@@ -412,7 +414,7 @@ def _simple_new(cls, values, name=None, dtype=None, **kwargs):
values = np.array(values, copy=False)
if is_object_dtype(values):
values = cls(values, name=name, dtype=dtype,
**kwargs)._values
**kwargs)._ndarray_values

result = object.__new__(cls)
result._data = values
@@ -594,6 +596,40 @@ def values(self):
""" return the underlying data as an ndarray """
return self._data.view(np.ndarray)

@property
def _values(self):
# type: () -> Union[ExtensionArray, Index]
# TODO(EA): remove index types as they become extension arrays
"""The best array representation.
This is an ndarray, ExtensionArray, or Index subclass. This differs
from ``_ndarray_values``, which always returns an ndarray.
Both ``_values`` and ``_ndarray_values`` are consistent between
``Series`` and ``Index``.
It may differ from the public '.values' method.
index | values | _values | _ndarray_values |
----------------- | -------------- -| ----------- | --------------- |
CategoricalIndex | Categorical | Categorical | codes |
DatetimeIndex[tz] | ndarray[M8ns] | DTI[tz] | ndarray[M8ns] |
For the following, the ``._values`` is currently ``ndarray[object]``,
but will soon be an ``ExtensionArray``
index | values | _values | _ndarray_values |
----------------- | --------------- | ------------ | --------------- |
PeriodIndex | ndarray[object] | ndarray[obj] | ndarray[int] |
IntervalIndex | ndarray[object] | ndarray[obj] | ndarray[object] |
See Also
--------
values
_ndarray_values
"""
return self.values

def get_values(self):
""" return the underlying data as an ndarray """
return self.values
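The table in the docstring above, as a runnable sketch (private attributes, shown for orientation only):

    import pandas as pd

    ci = pd.CategoricalIndex(['a', 'b', 'a'])
    print(type(ci._values).__name__)   # Categorical
    print(ci._ndarray_values)          # the codes, e.g. [0 1 0]

    dti = pd.date_range('2018-01-01', periods=2, tz='UTC')
    print(type(dti._values).__name__)  # DatetimeIndex (an Index subclass, for now)
    print(dti._ndarray_values.dtype)   # datetime64[ns]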
@@ -664,7 +700,7 @@ def ravel(self, order='C'):
--------
numpy.ndarray.ravel
"""
return self._values.ravel(order=order)
return self._ndarray_values.ravel(order=order)

# construction helpers
@classmethod
@@ -1597,7 +1633,7 @@ def _constructor(self):
@cache_readonly
def _engine(self):
# property, for now, slow to look up
return self._engine_type(lambda: self._values, len(self))
return self._engine_type(lambda: self._ndarray_values, len(self))

def _validate_index_level(self, level):
"""
@@ -2228,27 +2264,37 @@ def union(self, other):
other = other.astype('O')
return this.union(other)

# TODO(EA): setops-refactor, clean all this up
if is_period_dtype(self) or is_datetime64tz_dtype(self):
lvals = self._ndarray_values
else:
lvals = self._values
if is_period_dtype(other) or is_datetime64tz_dtype(other):
rvals = other._ndarray_values
else:
rvals = other._values

if self.is_monotonic and other.is_monotonic:
try:
result = self._outer_indexer(self._values, other._values)[0]
result = self._outer_indexer(lvals, rvals)[0]
except TypeError:
# incomparable objects
result = list(self._values)
result = list(lvals)

# worth making this faster? a very unusual case
value_set = set(self._values)
result.extend([x for x in other._values if x not in value_set])
value_set = set(lvals)
result.extend([x for x in rvals if x not in value_set])
else:
indexer = self.get_indexer(other)
indexer, = (indexer == -1).nonzero()

if len(indexer) > 0:
other_diff = algos.take_nd(other._values, indexer,
other_diff = algos.take_nd(rvals, indexer,
allow_fill=False)
result = _concat._concat_compat((self._values, other_diff))
result = _concat._concat_compat((lvals, other_diff))

try:
self._values[0] < other_diff[0]
lvals[0] < other_diff[0]
except TypeError as e:
warnings.warn("%s, sort order is undefined for "
"incomparable objects" % e, RuntimeWarning,
@@ -2260,7 +2306,7 @@ def union(self, other):
result.sort()

else:
result = self._values
result = lvals

try:
result = np.sort(result)
Expand Down Expand Up @@ -2311,20 +2357,30 @@ def intersection(self, other):
other = other.astype('O')
return this.intersection(other)

# TODO(EA): setops-refactor, clean all this up
if is_period_dtype(self):
lvals = self._ndarray_values
else:
lvals = self._values
if is_period_dtype(other):
rvals = other._ndarray_values
else:
rvals = other._values

if self.is_monotonic and other.is_monotonic:
try:
result = self._inner_indexer(self._values, other._values)[0]
result = self._inner_indexer(lvals, rvals)[0]
return self._wrap_union_result(other, result)
except TypeError:
pass

try:
indexer = Index(other._values).get_indexer(self._values)
indexer = Index(rvals).get_indexer(lvals)
indexer = indexer.take((indexer != -1).nonzero()[0])
except Exception:
# duplicates
indexer = algos.unique1d(
Index(other._values).get_indexer_non_unique(self._values)[0])
Index(rvals).get_indexer_non_unique(lvals)[0])
indexer = indexer[indexer != -1]

taken = other.take(indexer)
@@ -2700,7 +2756,7 @@ def get_indexer(self, target, method=None, limit=None, tolerance=None):
raise ValueError('limit argument only valid if doing pad, '
'backfill or nearest reindexing')

indexer = self._engine.get_indexer(target._values)
indexer = self._engine.get_indexer(target._ndarray_values)

return _ensure_platform_int(indexer)

@@ -2716,12 +2772,13 @@ def _get_fill_indexer(self, target, method, limit=None, tolerance=None):
if self.is_monotonic_increasing and target.is_monotonic_increasing:
method = (self._engine.get_pad_indexer if method == 'pad' else
self._engine.get_backfill_indexer)
indexer = method(target._values, limit)
indexer = method(target._ndarray_values, limit)
else:
indexer = self._get_fill_indexer_searchsorted(target, method,
limit)
if tolerance is not None:
indexer = self._filter_indexer_tolerance(target._values, indexer,
indexer = self._filter_indexer_tolerance(target._ndarray_values,
indexer,
tolerance)
return indexer

@@ -2812,7 +2869,7 @@ def get_indexer_non_unique(self, target):
self = Index(self.asi8)
tgt_values = target.asi8
else:
tgt_values = target._values
tgt_values = target._ndarray_values

indexer, missing = self._engine.get_indexer_non_unique(tgt_values)
return _ensure_platform_int(indexer), missing
@@ -3247,16 +3304,17 @@ def _join_multi(self, other, how, return_indexers=True):
def _join_non_unique(self, other, how='left', return_indexers=False):
from pandas.core.reshape.merge import _get_join_indexers

left_idx, right_idx = _get_join_indexers([self._values],
[other._values], how=how,
left_idx, right_idx = _get_join_indexers([self._ndarray_values],
[other._ndarray_values],
how=how,
sort=True)

left_idx = _ensure_platform_int(left_idx)
right_idx = _ensure_platform_int(right_idx)

join_index = np.asarray(self._values.take(left_idx))
join_index = np.asarray(self._ndarray_values.take(left_idx))
mask = left_idx == -1
np.putmask(join_index, mask, other._values.take(right_idx))
np.putmask(join_index, mask, other._ndarray_values.take(right_idx))

join_index = self._wrap_joined_index(join_index, other)

@@ -3403,8 +3461,8 @@ def _join_monotonic(self, other, how='left', return_indexers=False):
else:
return ret_index

sv = self._values
ov = other._values
sv = self._ndarray_values
ov = other._ndarray_values

if self.is_unique and other.is_unique:
# We can perform much better than the general case
@@ -3756,7 +3814,7 @@ def insert(self, loc, item):
item = self._na_value

_self = np.asarray(self)
item = self._coerce_scalar_to_index(item)._values
item = self._coerce_scalar_to_index(item)._ndarray_values
idx = np.concatenate((_self[:loc], item, _self[loc:]))
return self._shallow_copy_with_infer(idx)

Expand Down