REF: Internal / External values (#19558)
* REF/Clean: Internal / External values

* Move to index base

* Cleanup unique handling

* Simplify object concat

* Use values for intersection

I think eventually we'll want to use ndarray_values for this, but it'll
require a bit more work to support. Currently, using ndarray_values
causes occasional failures on categorical.

* hmm

* Additional testing

* More tests

* ndarray_values

* API: Default ExtensionArray.astype

(cherry picked from commit 943a915562b72bed147c857de927afa0daf31c1a)
(cherry picked from commit fbf0a06)

* Simplify concat_as_object

* Py2 compat

(cherry picked from commit b20e12c)

* Set-ops ugliness

* better docstrings

* tolist

* linting

* Moved dtypes

(cherry picked from commit d136227)

* clean

* cleanup

* NumPy compat

* Use base _values for CategoricalIndex

* Update dev docs

* cleanup

* Linting

* Precision in tests

* Push _ndarray_values to ExtensionArray

Now IndexOpsMixin._ndarray_values will dispatch all the way down to the EA.
Subclasses like Categorical can override it as they see fit.

* Clean up tolist

* Move test locations

* Fixed test

* REF: Update per comments

* lint

* REF: Use _values for size and shape

* PERF: Implement size, shape for IntervalIndex

* PERF: Avoid materializing values for PeriodIndex shape, size

* Cleanup

* Override nbytes
TomAugspurger committed Feb 13, 2018
1 parent d9551c8 commit df38f66
Showing 25 changed files with 386 additions and 85 deletions.
19 changes: 19 additions & 0 deletions doc/source/internals.rst
@@ -89,6 +89,25 @@ not check (or care) whether the levels themselves are sorted. Fortunately, the
constructors ``from_tuples`` and ``from_arrays`` ensure that this is true, but
if you compute the levels and labels yourself, please be careful.

Values
~~~~~~

Pandas extends NumPy's type system with custom types, like ``Categorical`` or
datetimes with a timezone, so we have multiple notions of "values". For 1-D
containers (``Index`` classes and ``Series``) we have the following convention:

* ``cls._ndarray_values`` is *always* a NumPy ``ndarray``. Ideally,
``_ndarray_values`` is cheap to compute. For example, for a ``Categorical``,
this returns the codes, not the array of objects.
* ``cls._values`` is the "best possible" array. This could be an
``ndarray``, an ``ExtensionArray``, or an ``Index`` subclass (note: we're in
the process of removing the index subclasses here so that it's always an
``ndarray`` or ``ExtensionArray``).

So, for example, ``Series[category]._values`` is a ``Categorical``, while
``Series[category]._ndarray_values`` is the underlying codes.


.. _ref-subclassing-pandas:

Subclassing pandas Data Structures
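For illustration, a minimal sketch of the convention described above (``_values`` and ``_ndarray_values`` are private, so this is for orientation only, not a supported API):

    import pandas as pd

    ser = pd.Series(pd.Categorical(['a', 'b', 'a'], categories=['a', 'b']))

    # _values is the "best possible" array: the Categorical itself.
    print(type(ser._values))    # Categorical

    # _ndarray_values is always a NumPy ndarray; for categoricals these
    # are the integer codes, which are cheap to obtain.
    print(ser._ndarray_values)  # [0 1 0] (int8)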
12 changes: 12 additions & 0 deletions pandas/core/arrays/base.py
@@ -266,3 +266,15 @@ def _can_hold_na(self):
Setting this to false will optimize some operations like fillna.
"""
return True

@property
def _ndarray_values(self):
# type: () -> np.ndarray
"""Internal pandas method for lossy conversion to a NumPy ndarray.
This method is not part of the pandas interface.
The expectation is that this is cheap to compute, and is primarily
used for interacting with our indexers.
"""
return np.array(self)
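The default above goes through ``np.array(self)``, which may be slow and may coerce to object dtype, so subclasses with a cheaper representation are expected to override it. A rough sketch of the contract, using a hypothetical ``ToyArray`` stand-in (not part of pandas):

    import numpy as np

    class ToyArray(object):
        # Hypothetical stand-in for an ExtensionArray subclass.
        def __init__(self, data):
            self._data = list(data)

        def __len__(self):
            return len(self._data)

        @property
        def _ndarray_values(self):
            # Mirrors the new default: a plain (possibly lossy) NumPy
            # conversion of the array's contents.
            return np.array(self._data)

    print(ToyArray([1, 2, 3])._ndarray_values)  # array([1, 2, 3])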
4 changes: 4 additions & 0 deletions pandas/core/arrays/categorical.py
@@ -410,6 +410,10 @@ def dtype(self):
"""The :class:`~pandas.api.types.CategoricalDtype` for this instance"""
return self._dtype

@property
def _ndarray_values(self):
return self.codes

@property
def _constructor(self):
return Categorical
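The override makes ``_ndarray_values`` cheap for categoricals: it is the same integer-code ndarray exposed publicly as ``codes``. A quick check (assuming a pandas build that includes this commit):

    import pandas as pd

    cat = pd.Categorical(['a', 'b', 'a'], categories=['a', 'b'])
    # _ndarray_values returns these codes rather than the object values:
    print(cat.codes)  # array([0, 1, 0], dtype=int8)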
21 changes: 16 additions & 5 deletions pandas/core/base.py
@@ -13,7 +13,8 @@
is_list_like,
is_scalar,
is_datetimelike,
is_extension_type)
is_extension_type,
is_extension_array_dtype)

from pandas.util._validators import validate_bool_kwarg

@@ -738,7 +739,7 @@ def data(self):
@property
def itemsize(self):
""" return the size of the dtype of the item of the underlying data """
return self._values.itemsize
return self._ndarray_values.itemsize

@property
def nbytes(self):
@@ -748,7 +749,7 @@ def nbytes(self):
@property
def strides(self):
""" return the strides of the underlying data """
return self._values.strides
return self._ndarray_values.strides

@property
def size(self):
@@ -768,8 +769,17 @@ def base(self):
return self.values.base

@property
def _values(self):
""" the internal implementation """
def _ndarray_values(self):
# type: () -> np.ndarray
"""The data as an ndarray, possibly losing information.

The expectation is that this is cheap to compute, and is primarily
used for interacting with our indexers.

- categorical -> codes
"""
if is_extension_array_dtype(self):
return self.values._ndarray_values
return self.values

@property
@@ -979,6 +989,7 @@ def unique(self):
values = self._values

if hasattr(values, 'unique'):

result = values.unique()
else:
from pandas.core.algorithms import unique1d
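A note on why ``itemsize`` and ``strides`` now route through ``_ndarray_values``: for extension-backed containers, ``_values`` is not an ndarray and lacks those attributes, while the codes are a real ndarray. An illustrative sketch (private attributes, behavior as of this commit):

    import pandas as pd

    ser = pd.Series(pd.Categorical(['a', 'b', 'a']))

    # The int8 codes are a real ndarray, so ndarray-only attributes work:
    print(ser._ndarray_values.itemsize)      # 1

    # The Categorical itself has no itemsize, which is why these
    # properties can no longer go through _values:
    print(hasattr(ser._values, 'itemsize'))  # False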
2 changes: 1 addition & 1 deletion pandas/core/dtypes/cast.py
@@ -927,7 +927,7 @@ def try_timedelta(v):
# will try first with a string & object conversion
from pandas import to_timedelta
try:
return to_timedelta(v)._values.reshape(shape)
return to_timedelta(v)._ndarray_values.reshape(shape)
except Exception:
return v.reshape(shape)

2 changes: 1 addition & 1 deletion pandas/core/dtypes/common.py
@@ -1709,7 +1709,7 @@ def is_extension_array_dtype(arr_or_dtype):
from pandas.core.arrays import ExtensionArray

# we want to unpack series, anything else?
if isinstance(arr_or_dtype, ABCSeries):
if isinstance(arr_or_dtype, (ABCIndexClass, ABCSeries)):
arr_or_dtype = arr_or_dtype._values
return isinstance(arr_or_dtype, (ExtensionDtype, ExtensionArray))

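With ``ABCIndexClass`` added alongside ``ABCSeries``, extension-backed indexes are now unwrapped as well. A quick sketch (importing from the internal module this diff touches):

    import pandas as pd
    from pandas.core.dtypes.common import is_extension_array_dtype

    print(is_extension_array_dtype(pd.Series(pd.Categorical(['a']))))  # True
    # New in this change: index objects are unwrapped too.
    print(is_extension_array_dtype(pd.CategoricalIndex(['a'])))        # True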
6 changes: 4 additions & 2 deletions pandas/core/dtypes/concat.py
@@ -488,12 +488,14 @@ def _concat_index_asobject(to_concat, name=None):
concat all inputs as object. DatetimeIndex, TimedeltaIndex and
PeriodIndex are converted to object dtype before concatenation
"""
from pandas import Index
from pandas.core.arrays import ExtensionArray

klasses = ABCDatetimeIndex, ABCTimedeltaIndex, ABCPeriodIndex
klasses = (ABCDatetimeIndex, ABCTimedeltaIndex, ABCPeriodIndex,
ExtensionArray)
to_concat = [x.astype(object) if isinstance(x, klasses) else x
for x in to_concat]

from pandas import Index
self = to_concat[0]
attribs = self._get_attributes_dict()
attribs['name'] = name
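The practical effect of adding ``ExtensionArray`` to ``klasses`` is that extension-backed values, like the datetime-like indexes before them, are cast to object dtype before concatenation. A sketch of the user-visible behavior (mixed index types have long fallen back to object):

    import pandas as pd

    idx = pd.CategoricalIndex(['a', 'b']).append(pd.Index([1, 2]))
    print(idx.dtype)  # object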
108 changes: 83 additions & 25 deletions pandas/core/indexes/base.py
@@ -31,12 +31,14 @@
is_object_dtype,
is_categorical_dtype,
is_interval_dtype,
is_period_dtype,
is_bool,
is_bool_dtype,
is_signed_integer_dtype,
is_unsigned_integer_dtype,
is_integer_dtype, is_float_dtype,
is_datetime64_any_dtype,
is_datetime64tz_dtype,
is_timedelta64_dtype,
needs_i8_conversion,
is_iterator, is_list_like,
@@ -412,7 +414,7 @@ def _simple_new(cls, values, name=None, dtype=None, **kwargs):
values = np.array(values, copy=False)
if is_object_dtype(values):
values = cls(values, name=name, dtype=dtype,
**kwargs)._values
**kwargs)._ndarray_values

result = object.__new__(cls)
result._data = values
@@ -594,6 +596,40 @@ def values(self):
""" return the underlying data as an ndarray """
return self._data.view(np.ndarray)

@property
def _values(self):
# type: () -> Union[ExtensionArray, Index]
# TODO(EA): remove index types as they become extension arrays
"""The best array representation.
This is an ndarray, ExtensionArray, or Index subclass. This differs
from ``_ndarray_values``, which always returns an ndarray.
Both ``_values`` and ``_ndarray_values`` are consistent between
``Series`` and ``Index``.
It may differ from the public '.values' method.
index | values | _values | _ndarray_values |
----------------- | -------------- -| ----------- | --------------- |
CategoricalIndex | Categorical | Categorical | codes |
DatetimeIndex[tz] | ndarray[M8ns] | DTI[tz] | ndarray[M8ns] |
For the following, the ``._values`` is currently ``ndarray[object]``,
but will soon be an ``ExtensionArray``
index | values | _values | _ndarray_values |
----------------- | --------------- | ------------ | --------------- |
PeriodIndex | ndarray[object] | ndarray[obj] | ndarray[int] |
IntervalIndex | ndarray[object] | ndarray[obj] | ndarray[object] |
See Also
--------
values
_ndarray_values
"""
return self.values

def get_values(self):
""" return the underlying data as an ndarray """
return self.values
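The table in the docstring above, as a runnable sketch (private attributes, shown for orientation only):

    import pandas as pd

    ci = pd.CategoricalIndex(['a', 'b', 'a'])
    print(type(ci._values).__name__)   # Categorical
    print(ci._ndarray_values)          # the codes, e.g. [0 1 0]

    dti = pd.date_range('2018-01-01', periods=2, tz='UTC')
    print(type(dti._values).__name__)  # DatetimeIndex (an Index subclass, for now)
    print(dti._ndarray_values.dtype)   # datetime64[ns]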
@@ -664,7 +700,7 @@ def ravel(self, order='C'):
--------
numpy.ndarray.ravel
"""
return self._values.ravel(order=order)
return self._ndarray_values.ravel(order=order)

# construction helpers
@classmethod
@@ -1597,7 +1633,7 @@ def _constructor(self):
@cache_readonly
def _engine(self):
# property, for now, slow to look up
return self._engine_type(lambda: self._values, len(self))
return self._engine_type(lambda: self._ndarray_values, len(self))

def _validate_index_level(self, level):
"""
@@ -2228,27 +2264,37 @@ def union(self, other):
other = other.astype('O')
return this.union(other)

# TODO(EA): setops-refactor, clean all this up
if is_period_dtype(self) or is_datetime64tz_dtype(self):
lvals = self._ndarray_values
else:
lvals = self._values
if is_period_dtype(other) or is_datetime64tz_dtype(other):
rvals = other._ndarray_values
else:
rvals = other._values

if self.is_monotonic and other.is_monotonic:
try:
result = self._outer_indexer(self._values, other._values)[0]
result = self._outer_indexer(lvals, rvals)[0]
except TypeError:
# incomparable objects
result = list(self._values)
result = list(lvals)

# worth making this faster? a very unusual case
value_set = set(self._values)
result.extend([x for x in other._values if x not in value_set])
value_set = set(lvals)
result.extend([x for x in rvals if x not in value_set])
else:
indexer = self.get_indexer(other)
indexer, = (indexer == -1).nonzero()

if len(indexer) > 0:
other_diff = algos.take_nd(other._values, indexer,
other_diff = algos.take_nd(rvals, indexer,
allow_fill=False)
result = _concat._concat_compat((self._values, other_diff))
result = _concat._concat_compat((lvals, other_diff))

try:
self._values[0] < other_diff[0]
lvals[0] < other_diff[0]
except TypeError as e:
warnings.warn("%s, sort order is undefined for "
"incomparable objects" % e, RuntimeWarning,
@@ -2260,7 +2306,7 @@ def union(self, other):
result.sort()

else:
result = self._values
result = lvals

try:
result = np.sort(result)
Expand Down Expand Up @@ -2311,20 +2357,30 @@ def intersection(self, other):
other = other.astype('O')
return this.intersection(other)

# TODO(EA): setops-refactor, clean all this up
if is_period_dtype(self):
lvals = self._ndarray_values
else:
lvals = self._values
if is_period_dtype(other):
rvals = other._ndarray_values
else:
rvals = other._values

if self.is_monotonic and other.is_monotonic:
try:
result = self._inner_indexer(self._values, other._values)[0]
result = self._inner_indexer(lvals, rvals)[0]
return self._wrap_union_result(other, result)
except TypeError:
pass

try:
indexer = Index(other._values).get_indexer(self._values)
indexer = Index(rvals).get_indexer(lvals)
indexer = indexer.take((indexer != -1).nonzero()[0])
except Exception:
# duplicates
indexer = algos.unique1d(
Index(other._values).get_indexer_non_unique(self._values)[0])
Index(rvals).get_indexer_non_unique(lvals)[0])
indexer = indexer[indexer != -1]

taken = other.take(indexer)
@@ -2700,7 +2756,7 @@ def get_indexer(self, target, method=None, limit=None, tolerance=None):
raise ValueError('limit argument only valid if doing pad, '
'backfill or nearest reindexing')

indexer = self._engine.get_indexer(target._values)
indexer = self._engine.get_indexer(target._ndarray_values)

return _ensure_platform_int(indexer)

@@ -2716,12 +2772,13 @@ def _get_fill_indexer(self, target, method, limit=None, tolerance=None):
if self.is_monotonic_increasing and target.is_monotonic_increasing:
method = (self._engine.get_pad_indexer if method == 'pad' else
self._engine.get_backfill_indexer)
indexer = method(target._values, limit)
indexer = method(target._ndarray_values, limit)
else:
indexer = self._get_fill_indexer_searchsorted(target, method,
limit)
if tolerance is not None:
indexer = self._filter_indexer_tolerance(target._values, indexer,
indexer = self._filter_indexer_tolerance(target._ndarray_values,
indexer,
tolerance)
return indexer

@@ -2812,7 +2869,7 @@ def get_indexer_non_unique(self, target):
self = Index(self.asi8)
tgt_values = target.asi8
else:
tgt_values = target._values
tgt_values = target._ndarray_values

indexer, missing = self._engine.get_indexer_non_unique(tgt_values)
return _ensure_platform_int(indexer), missing
@@ -3247,16 +3304,17 @@ def _join_multi(self, other, how, return_indexers=True):
def _join_non_unique(self, other, how='left', return_indexers=False):
from pandas.core.reshape.merge import _get_join_indexers

left_idx, right_idx = _get_join_indexers([self._values],
[other._values], how=how,
left_idx, right_idx = _get_join_indexers([self._ndarray_values],
[other._ndarray_values],
how=how,
sort=True)

left_idx = _ensure_platform_int(left_idx)
right_idx = _ensure_platform_int(right_idx)

join_index = np.asarray(self._values.take(left_idx))
join_index = np.asarray(self._ndarray_values.take(left_idx))
mask = left_idx == -1
np.putmask(join_index, mask, other._values.take(right_idx))
np.putmask(join_index, mask, other._ndarray_values.take(right_idx))

join_index = self._wrap_joined_index(join_index, other)

@@ -3403,8 +3461,8 @@ def _join_monotonic(self, other, how='left', return_indexers=False):
else:
return ret_index

sv = self._values
ov = other._values
sv = self._ndarray_values
ov = other._ndarray_values

if self.is_unique and other.is_unique:
# We can perform much better than the general case
@@ -3756,7 +3814,7 @@ def insert(self, loc, item):
item = self._na_value

_self = np.asarray(self)
item = self._coerce_scalar_to_index(item)._values
item = self._coerce_scalar_to_index(item)._ndarray_values
idx = np.concatenate((_self[:loc], item, _self[loc:]))
return self._shallow_copy_with_infer(idx)

Expand Down