REF: Internal / External values #19558

Merged Feb 13, 2018 (42 commits)

Changes from 1 commit

Commits (42)
41f09d8  REF/Clean: Internal / External values (TomAugspurger, Feb 3, 2018)
29cfd7c  Move to index base (TomAugspurger, Feb 6, 2018)
3185f4e  Cleanup unique handling (TomAugspurger, Feb 7, 2018)
5a59591  Merge remote-tracking branch 'upstream/master' into index-values (TomAugspurger, Feb 7, 2018)
476f75d  Simplify object concat (TomAugspurger, Feb 7, 2018)
b15ee5a  Use values for intersection (TomAugspurger, Feb 7, 2018)
659073f  hmm (TomAugspurger, Feb 7, 2018)
7accb67  Merge remote-tracking branch 'upstream/master' into index-values (TomAugspurger, Feb 8, 2018)
9b8d2a5  Additional testing (TomAugspurger, Feb 8, 2018)
9fbac29  More tests (TomAugspurger, Feb 8, 2018)
55305dc  ndarray_values (TomAugspurger, Feb 8, 2018)
0e63708  API: Default ExtensionArray.astype (TomAugspurger, Feb 8, 2018)
fbbbc8a  Simplify concat_as_object (TomAugspurger, Feb 8, 2018)
46a0a49  Py2 compat (TomAugspurger, Feb 8, 2018)
2c4445a  Set-ops ugliness (TomAugspurger, Feb 8, 2018)
5612cda  better docstrings (TomAugspurger, Feb 8, 2018)
b012c19  tolist (TomAugspurger, Feb 8, 2018)
d49e6aa  linting (TomAugspurger, Feb 8, 2018)
d7d31ee  Moved dtypes (TomAugspurger, Feb 9, 2018)
7b89f1b  clean (TomAugspurger, Feb 9, 2018)
b0dbffd  cleanup (TomAugspurger, Feb 9, 2018)
66b936f  NumPy compat (TomAugspurger, Feb 9, 2018)
32ee0ef  Use base _values for CategoricalIndex (TomAugspurger, Feb 9, 2018)
a9882e2  Update dev docs (TomAugspurger, Feb 9, 2018)
f53652a  Merge remote-tracking branch 'upstream/master' into index-values (TomAugspurger, Feb 9, 2018)
2425621  cleanup (TomAugspurger, Feb 9, 2018)
512fb89  Merge remote-tracking branch 'upstream/master' into index-values (TomAugspurger, Feb 9, 2018)
170d0c7  Linting (TomAugspurger, Feb 9, 2018)
402620f  Precision in tests (TomAugspurger, Feb 9, 2018)
d9e8dd6  Merge remote-tracking branch 'upstream/master' into index-values (TomAugspurger, Feb 9, 2018)
815d202  Push _ndarray_values to ExtensionArray (TomAugspurger, Feb 11, 2018)
a727b21  Clean up tolist (TomAugspurger, Feb 11, 2018)
f368c29  Move test locations (TomAugspurger, Feb 11, 2018)
d74c5c9  Fixed test (TomAugspurger, Feb 12, 2018)
8104ee5  REF: Update per comments (TomAugspurger, Feb 12, 2018)
f8e29b9  lint (TomAugspurger, Feb 12, 2018)
0cd9faa  REF: Use _values for size and shape (TomAugspurger, Feb 12, 2018)
8fcdb70  PERF: Implement size, shape for IntervalIndex (TomAugspurger, Feb 12, 2018)
34a6a22  PERF: Avoid materializing values for PeriodIndex shape, size (TomAugspurger, Feb 12, 2018)
c233c28  Merge remote-tracking branch 'upstream/master' into index-values (TomAugspurger, Feb 13, 2018)
d6e8051  Cleanup (TomAugspurger, Feb 13, 2018)
3af8a21  Override nbytes (TomAugspurger, Feb 13, 2018)
doc/source/internals.rst (15 additions, 0 deletions)
@@ -89,6 +89,21 @@ not check (or care) whether the levels themselves are sorted. Fortunately, the
constructors ``from_tuples`` and ``from_arrays`` ensure that this is true, but
if you compute the levels and labels yourself, please be careful.

Values
~~~~~~

> Contributor: you could add section tags

Pandas extends NumPy's type system in a few places, so we have multiple notions of "values" floating around.

> Contributor: The first sentence is totally not clear to a new reader

For 1-D containers (``Index`` classes and ``Series``) we have the following convention:

* ``cls._ndarray_values`` is *always* an ``ndarray``
* ``cls._values`` is the "best possible" array. This could be an ``ndarray``, ``ExtensionArray``, or
  an ``Index`` subclass (note: we're in the process of removing the index subclasses here so that it's
  always an ``ndarray`` or ``ExtensionArray``).

> Contributor: you are using internal (pun intended) jargon here

So, for example, ``Series[category]._values`` is a ``Categorical``, while ``Series[category]._ndarray_values`` is
the underlying ndarray.

> Contributor (Author): Not sure what Series[categorical]._ndarray_values should be, categories or codes? PeriodIndex._ndarray_values is the ordinals, so maybe codes?

> Contributor: yes
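
For concreteness, a rough sketch of the convention described above (assuming a pandas build from this era; _values and _ndarray_values are private attributes and changed in later releases):

```python
import pandas as pd

s = pd.Series(["a", "b", "a"], dtype="category")

s._values          # the Categorical itself
s._ndarray_values  # an ndarray; per the thread above, the integer codes
```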



.. _ref-subclassing-pandas:

Subclassing pandas Data Structures
pandas/core/base.py (39 additions, 9 deletions)
@@ -7,7 +7,8 @@
import numpy as np

from pandas.core.dtypes.missing import isna
from pandas.core.dtypes.generic import ABCDataFrame, ABCSeries, ABCIndexClass
from pandas.core.dtypes.generic import (
ABCDataFrame, ABCSeries, ABCIndexClass, ABCDatetimeIndex)
from pandas.core.dtypes.common import (
is_object_dtype,
is_list_like,
@@ -706,7 +707,7 @@ def transpose(self, *args, **kwargs):
@property
def shape(self):
""" return a tuple of the shape of the underlying data """
return self._values.shape
return self._ndarray_values.shape

@property
def ndim(self):
@@ -734,22 +735,22 @@ def data(self):
@property
def itemsize(self):
""" return the size of the dtype of the item of the underlying data """
return self._values.itemsize
return self._ndarray_values.itemsize

@property
def nbytes(self):
""" return the number of bytes in the underlying data """
return self._values.nbytes
return self._ndarray_values.nbytes

> Member: Should nbytes be called on self._values instead? (as we require a nbytes implementation on the ExtensionArray, and numpy arrays already have it)

> Contributor (Author): I think this caused issues for CI, but re-running tests now with this change.

@property
def strides(self):
""" return the strides of the underlying data """
return self._values.strides
return self._ndarray_values.strides

@property
def size(self):
""" return the number of elements in the underlying data """
return self._values.size
return self._ndarray_values.size

@property
def flags(self):
@@ -763,9 +764,34 @@ def base(self):
"""
return self.values.base

@property
def _ndarray_values(self):
"""The data as an ndarray. See '_values' for more."""
# type: () -> np.ndarray
return self.values

@property
def _values(self):
""" the internal implementation """
# type: () -> Union[ExtensionArray, Index]
# TODO: remove index types as they become extension arrays
""" The best array representation.

This is an ndarray, ExtensionArray, or Index subclass. This differs
from '._ndarray_values', which always returns an ndarray. It may differ
from the public '.values'.

index             | values          | _values
------------------|-----------------|------------
CategoricalIndex  | Categorical     | Categorical
DatetimeIndex[tz] | ndarray[M8ns]   | DTI[tz]
PeriodIndex       | ndarray[Period] | ndarray[Pd] (soon PeriodArray)
IntervalIndex     | ndarray[IV]     | ndarray[IV] (soon IntervalArray)

See Also
--------
values
_ndarray_values
"""
return self.values
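
Illustrating the table above with the tz-aware case (a sketch under the same era assumptions; these private attributes changed later):

```python
import pandas as pd

dti = pd.date_range("2018-01-01", periods=2, tz="US/Eastern")

dti.values           # ndarray[datetime64[ns]], converted to UTC (tz dropped)
dti._values          # the DatetimeIndex[tz] itself, per the table
dti._ndarray_values  # always a plain ndarray
```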

@property
@@ -816,7 +842,7 @@ def tolist(self):
if is_datetimelike(self):
return [com._maybe_box_datetimelike(x) for x in self._values]
else:
return self._values.tolist()
return self._ndarray_values.tolist()

> Contributor: this should be overridden in EA, rather than specific dispatching via if/else here, IOW it should be a part of the interface, or be defined as list(.values)

> Member: I am not sure it is needed to add a tolist to the interface, I think in general it can rely on __iter__/__getitem__ (so list(self._values)). The only problem in that case is that we still need to distinguish between normal numeric types (where the above would return numpy scalars and not python scalars) and other types where this already gives the correct result.
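
The scalar-boxing distinction behind that branch, in miniature (plain NumPy, illustrative only):

```python
import numpy as np

arr = np.array([1, 2, 3])
type(list(arr)[0])     # numpy.int64 -- iterating yields NumPy scalars
type(arr.tolist()[0])  # int -- ndarray.tolist() boxes to Python scalars
```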

def __iter__(self):
"""
@@ -973,8 +999,12 @@ def value_counts(self, normalize=False, sort=True, ascending=False,
@Appender(_shared_docs['unique'] % _indexops_doc_kwargs)
def unique(self):
values = self._values

if isinstance(values, ABCDatetimeIndex):
values = values._ndarray_values
# TODO: Make unique part of the ExtensionArray interface.
# else, this could be surprising.
if hasattr(values, 'unique'):
result = values.unique()
else:
from pandas.core.algorithms import unique1d

> Contributor: why are you special casing?
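
The duck-typed dispatch above, in isolation (a sketch; np.unique stands in for pandas' unique1d here, though unlike it np.unique also sorts):

```python
import numpy as np

def duck_unique(values):
    # Prefer a type-specific .unique() when the values define one
    # (e.g. Categorical's, which preserves the dtype)...
    if hasattr(values, 'unique'):
        return values.unique()
    # ...otherwise fall back to a generic ndarray routine.
    return np.unique(values)
```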
pandas/core/dtypes/concat.py (11 additions, 4 deletions)
@@ -480,7 +480,7 @@ def _concat_datetimetz(to_concat, name=None):

def _concat_index_same_dtype(indexes, klass=None):
klass = klass if klass is not None else indexes[0].__class__
return klass(np.concatenate([x._values for x in indexes]))
return klass(np.concatenate([x._ndarray_values for x in indexes]))

> Member: This one is only used for numeric indices, so _values or _ndarray_values should not matter. I was mainly thinking that for those cases where _ndarray_values actually differs from _values, like a categorical, the above code will not work anyway, as klass(codes) is not enough to reconstruct the categorical. So _values seems safer to me.
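
Why klass(codes) cannot round-trip a categorical, as the comment notes (a hypothetical: this helper is only ever called with numeric indexes):

```python
import pandas as pd

ci = pd.CategoricalIndex(["b", "a", "b"])
codes = ci._ndarray_values  # the codes, e.g. array([1, 0, 1], dtype=int8)
pd.CategoricalIndex(codes)  # categories become {0, 1}; the labels are lost
```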



def _concat_index_asobject(to_concat, name=None):
@@ -498,9 +498,16 @@
attribs = self._get_attributes_dict()
attribs['name'] = name

to_concat = [x._values if isinstance(x, Index) else x
for x in to_concat]
return self._shallow_copy_with_infer(np.concatenate(to_concat), **attribs)
arrays = []
for x in to_concat:
if is_categorical_dtype(x):
arrays.append(np.asarray(x, dtype=object))
elif isinstance(x, Index):
arrays.append(x._values)
else:
arrays.append(x)

return self._shallow_copy_with_infer(np.concatenate(arrays), **attribs)

> Contributor: why is this a special case?

> Member: @TomAugspurger don't we want np.asarray(x, dtype=object) for all extension dtypes in this case? So for Period and Interval, _values (used some lines below for Index types) currently still returns an object array, but I think the purpose is that those will also return their own Array class, so using np.asarray(x, dtype=object) will do the same now and is more future proof?
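
What the categorical branch appends (assumed behavior; np.asarray goes through the Categorical's array conversion):

```python
import numpy as np
import pandas as pd

cat = pd.Categorical(["a", "b", "a"])
np.asarray(cat, dtype=object)  # array(['a', 'b', 'a'], dtype=object)
# i.e. the materialized values as objects, not the integer codes
```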


def _concat_sparse(to_concat, axis=0, typs=None):
pandas/core/indexes/base.py (40 additions, 25 deletions)
@@ -392,7 +392,7 @@ def _simple_new(cls, values, name=None, dtype=None, **kwargs):
values = np.array(values, copy=False)
if is_object_dtype(values):
values = cls(values, name=name, dtype=dtype,
**kwargs)._values
**kwargs)._ndarray_values

result = object.__new__(cls)
result._data = values
@@ -644,7 +644,7 @@ def ravel(self, order='C'):
--------
numpy.ndarray.ravel
"""
return self._values.ravel(order=order)
return self._ndarray_values.ravel(order=order)

# construction helpers
@classmethod
@@ -1577,7 +1577,7 @@ def _constructor(self):
@cache_readonly
def _engine(self):
# property, for now, slow to look up
return self._engine_type(lambda: self._values, len(self))
return self._engine_type(lambda: self._ndarray_values, len(self))

def _validate_index_level(self, level):
"""
@@ -2208,27 +2208,37 @@ def union(self, other):
other = other.astype('O')
return this.union(other)

if is_categorical_dtype(self):
lvals = self.values
else:
lvals = self._ndarray_values

if is_categorical_dtype(other):
rvals = other.values
else:
rvals = other._ndarray_values

> Contributor: why are these special cases?

if self.is_monotonic and other.is_monotonic:
try:
result = self._outer_indexer(self._values, other._values)[0]
result = self._outer_indexer(lvals, rvals)[0]
except TypeError:
# incomparable objects
result = list(self._values)
result = list(lvals)

# worth making this faster? a very unusual case
value_set = set(self._values)
result.extend([x for x in other._values if x not in value_set])
value_set = set(lvals)
result.extend([x for x in rvals if x not in value_set])
else:
indexer = self.get_indexer(other)
indexer, = (indexer == -1).nonzero()

if len(indexer) > 0:
other_diff = algos.take_nd(other._values, indexer,
other_diff = algos.take_nd(rvals, indexer,
allow_fill=False)
result = _concat._concat_compat((self._values, other_diff))
result = _concat._concat_compat((lvals, other_diff))

try:
self._values[0] < other_diff[0]
lvals[0] < other_diff[0]
except TypeError as e:
warnings.warn("%s, sort order is undefined for "
"incomparable objects" % e, RuntimeWarning,
Expand All @@ -2240,7 +2250,7 @@ def union(self, other):
result.sort()

else:
result = self._values
result = lvals

try:
result = np.sort(result)
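
The set-based fallback for incomparable values, reduced to its core (a sketch, not the actual pandas code path):

```python
lvals = ["b", "a", "c"]
rvals = ["c", "d"]

value_set = set(lvals)
result = list(lvals) + [x for x in rvals if x not in value_set]
# ['b', 'a', 'c', 'd'] -- keep all of lvals, append unseen items of rvals
```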
@@ -2293,18 +2303,21 @@ def intersection(self, other):

if self.is_monotonic and other.is_monotonic:
try:
result = self._inner_indexer(self._values, other._values)[0]
result = self._inner_indexer(self._ndarray_values,
other._ndarray_values)[0]
return self._wrap_union_result(other, result)
except TypeError:
pass

try:
indexer = Index(other._values).get_indexer(self._values)
indexer = Index(other._ndarray_values).get_indexer(
self._ndarray_values)
indexer = indexer.take((indexer != -1).nonzero()[0])
except Exception:
# duplicates
indexer = algos.unique1d(
Index(other._values).get_indexer_non_unique(self._values)[0])
Index(other._ndarray_values).get_indexer_non_unique(
self._ndarray_values)[0])
indexer = indexer[indexer != -1]

taken = other.take(indexer)
@@ -2680,7 +2693,7 @@ def get_indexer(self, target, method=None, limit=None, tolerance=None):
raise ValueError('limit argument only valid if doing pad, '
'backfill or nearest reindexing')

indexer = self._engine.get_indexer(target._values)
indexer = self._engine.get_indexer(target._ndarray_values)

return _ensure_platform_int(indexer)

@@ -2696,12 +2709,13 @@ def _get_fill_indexer(self, target, method, limit=None, tolerance=None):
if self.is_monotonic_increasing and target.is_monotonic_increasing:
method = (self._engine.get_pad_indexer if method == 'pad' else
self._engine.get_backfill_indexer)
indexer = method(target._values, limit)
indexer = method(target._ndarray_values, limit)
else:
indexer = self._get_fill_indexer_searchsorted(target, method,
limit)
if tolerance is not None:
indexer = self._filter_indexer_tolerance(target._values, indexer,
indexer = self._filter_indexer_tolerance(target._ndarray_values,
indexer,
tolerance)
return indexer

@@ -2792,7 +2806,7 @@ def get_indexer_non_unique(self, target):
self = Index(self.asi8)
tgt_values = target.asi8
else:
tgt_values = target._values
tgt_values = target._ndarray_values

indexer, missing = self._engine.get_indexer_non_unique(tgt_values)
return _ensure_platform_int(indexer), missing
@@ -3227,16 +3241,17 @@ def _join_multi(self, other, how, return_indexers=True):
def _join_non_unique(self, other, how='left', return_indexers=False):
from pandas.core.reshape.merge import _get_join_indexers

left_idx, right_idx = _get_join_indexers([self._values],
[other._values], how=how,
left_idx, right_idx = _get_join_indexers([self._ndarray_values],
[other._ndarray_values],
how=how,
sort=True)

left_idx = _ensure_platform_int(left_idx)
right_idx = _ensure_platform_int(right_idx)

join_index = np.asarray(self._values.take(left_idx))
join_index = np.asarray(self._ndarray_values.take(left_idx))
mask = left_idx == -1
np.putmask(join_index, mask, other._values.take(right_idx))
np.putmask(join_index, mask, other._ndarray_values.take(right_idx))

join_index = self._wrap_joined_index(join_index, other)

@@ -3383,8 +3398,8 @@ def _join_monotonic(self, other, how='left', return_indexers=False):
else:
return ret_index

sv = self._values
ov = other._values
sv = self._ndarray_values
ov = other._ndarray_values

if self.is_unique and other.is_unique:
# We can perform much better than the general case
@@ -3736,7 +3751,7 @@ def insert(self, loc, item):
item = self._na_value

_self = np.asarray(self)
item = self._coerce_scalar_to_index(item)._values
item = self._coerce_scalar_to_index(item)._ndarray_values
idx = np.concatenate((_self[:loc], item, _self[loc:]))
return self._shallow_copy_with_infer(idx)
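
The splice pattern used by insert(), in plain NumPy (sketch):

```python
import numpy as np

arr = np.array([1, 2, 4])
loc, item = 2, np.array([3])
np.concatenate((arr[:loc], item, arr[loc:]))  # array([1, 2, 3, 4])
```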

pandas/core/indexes/category.py (21 additions, 4 deletions)
@@ -227,7 +227,7 @@ def _is_dtype_compat(self, other):
"""
if is_categorical_dtype(other):
if isinstance(other, CategoricalIndex):
other = other._values
other = other.values
if not other.is_dtype_equal(self):
raise TypeError("categories must match existing categories "
"when appending")
@@ -293,6 +293,23 @@ def values(self):
""" return the underlying data, which is a Categorical """
return self._data

@property
def _values(self):
return self._data

> Member: Is there a difference between ._data and .values? As below you use .values for itemsize and nbytes, but both seem to point to the underlying Categorical.

> Contributor (Author): Probably just me being inconsistent.

@property
def _ndarray_values(self):
return self._data.codes

@property
def itemsize(self):
return self.values.itemsize

> Member: let's add a comment why you are overwriting this here (default is to use _ndarray_values, which are codes, but Categorical defines itemsize as the itemsize of the categories). Also, would it be more consistent to do self._values.itemsize? (not that it would differ in this case)

> Contributor (Author): More consistent with the parent's implementation, yes, but I'd prefer to use the public API where possible.

> Member (quoting the above): But the public API is not meant to be consistent, but to maintain backwards compatibility :-) (at least for now)

@property
def nbytes(self):
""" return the number of bytes in the underlying data """
return self.values.nbytes
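
A sketch of these overrides together (assumed era behavior, consistent with the review comments above; the exact byte accounting is an assumption):

```python
import pandas as pd

ci = pd.CategoricalIndex(["a", "b", "a"])

ci._values          # the underlying Categorical
ci._ndarray_values  # the codes, e.g. array([0, 1, 0], dtype=int8)
ci.itemsize         # itemsize of the categories, not of the int8 codes
ci.nbytes           # Categorical.nbytes, not just the codes' bytes
```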

def get_values(self):
""" return the underlying data as an ndarray """
return self._data.get_values()
@@ -386,8 +403,8 @@ def is_monotonic_decreasing(self):
def unique(self, level=None):
if level is not None:
self._validate_index_level(level)
result = base.IndexOpsMixin.unique(self)
# CategoricalIndex._shallow_copy uses keeps original categories
result = self.values.unique()
# CategoricalIndex._shallow_copy keeps original categories
# and ordered if not otherwise specified
return self._shallow_copy(result, categories=result.categories,
ordered=result.ordered)
@@ -762,7 +779,7 @@ def _evaluate_compare(self, other):

def _delegate_method(self, name, *args, **kwargs):
""" method delegation to the ._values """
method = getattr(self._values, name)
method = getattr(self.values, name)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why change this?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Seems to be unnecessary. Reverting.

if 'inplace' in kwargs:
raise ValueError("cannot use inplace with CategoricalIndex")
res = method(*args, **kwargs)
pandas/core/indexes/datetimelike.py (1 addition, 1 deletion)
@@ -389,7 +389,7 @@ def sort_values(self, return_indexer=False, ascending=True):
sorted_index = self.take(_as)
return sorted_index, _as
else:
sorted_values = np.sort(self._values)
sorted_values = np.sort(self._ndarray_values)
attribs = self._get_attributes_dict()
freq = attribs['freq']
