REF: Internal / External values #19558

Merged: 42 commits, Feb 13, 2018

Commits (42 total; the file diffs below show changes from 18 of them)
All commits are by TomAugspurger.

41f09d8  REF/Clean: Internal / External values (Feb 3, 2018)
29cfd7c  Move to index base (Feb 6, 2018)
3185f4e  Cleanup unique handling (Feb 7, 2018)
5a59591  Merge remote-tracking branch 'upstream/master' into index-values (Feb 7, 2018)
476f75d  Simplify object concat (Feb 7, 2018)
b15ee5a  Use values for intersection (Feb 7, 2018)
659073f  hmm (Feb 7, 2018)
7accb67  Merge remote-tracking branch 'upstream/master' into index-values (Feb 8, 2018)
9b8d2a5  Additional testing (Feb 8, 2018)
9fbac29  More tests (Feb 8, 2018)
55305dc  ndarray_values (Feb 8, 2018)
0e63708  API: Default ExtensionArray.astype (Feb 8, 2018)
fbbbc8a  Simplify concat_as_object (Feb 8, 2018)
46a0a49  Py2 compat (Feb 8, 2018)
2c4445a  Set-ops ugliness (Feb 8, 2018)
5612cda  better docstrings (Feb 8, 2018)
b012c19  tolist (Feb 8, 2018)
d49e6aa  linting (Feb 8, 2018)
d7d31ee  Moved dtypes (Feb 9, 2018)
7b89f1b  clean (Feb 9, 2018)
b0dbffd  cleanup (Feb 9, 2018)
66b936f  NumPy compat (Feb 9, 2018)
32ee0ef  Use base _values for CategoricalIndex (Feb 9, 2018)
a9882e2  Update dev docs (Feb 9, 2018)
f53652a  Merge remote-tracking branch 'upstream/master' into index-values (Feb 9, 2018)
2425621  cleanup (Feb 9, 2018)
512fb89  Merge remote-tracking branch 'upstream/master' into index-values (Feb 9, 2018)
170d0c7  Linting (Feb 9, 2018)
402620f  Precision in tests (Feb 9, 2018)
d9e8dd6  Merge remote-tracking branch 'upstream/master' into index-values (Feb 9, 2018)
815d202  Push _ndarray_values to ExtensionArray (Feb 11, 2018)
a727b21  Clean up tolist (Feb 11, 2018)
f368c29  Move test locations (Feb 11, 2018)
d74c5c9  Fixed test (Feb 12, 2018)
8104ee5  REF: Update per comments (Feb 12, 2018)
f8e29b9  lint (Feb 12, 2018)
0cd9faa  REF: Use _values for size and shape (Feb 12, 2018)
8fcdb70  PERF: Implement size, shape for IntervalIndex (Feb 12, 2018)
34a6a22  PERF: Avoid materializing values for PeriodIndex shape, size (Feb 12, 2018)
c233c28  Merge remote-tracking branch 'upstream/master' into index-values (Feb 13, 2018)
d6e8051  Cleanup (Feb 13, 2018)
3af8a21  Override nbytes (Feb 13, 2018)
15 changes: 15 additions & 0 deletions doc/source/internals.rst
@@ -89,6 +89,21 @@ not check (or care) whether the levels themselves are sorted. Fortunately, the
constructors ``from_tuples`` and ``from_arrays`` ensure that this is true, but
if you compute the levels and labels yourself, please be careful.

Values
~~~~~~

Contributor: you could add section tags

Pandas extends NumPy's type system in a few places, so we have multiple notions of "values" floating around.
Contributor: The first sentence is totally not clear to a new reader

For 1-D containers (``Index`` classes and ``Series``) we have the following convention:

* ``cls._ndarray_values`` is *always* an ``ndarray``
* ``cls._values`` is the "best possible" array. This could be an ``ndarray``, an ``ExtensionArray``, or
  an ``Index`` subclass (note: we're in the process of removing the index subclasses here so that it's
  always an ``ndarray`` or ``ExtensionArray``).

Contributor: you are using internal (pun intended) jargon here

So, for example, ``Series[category]._values`` is a ``Categorical``, while ``Series[category]._ndarray_values`` is
the underlying ndarray.
Contributor (author): Not sure what Series[categorical]._ndarray_values should be, categories or codes? PeriodIndex._ndarray_values is the ordinals, so maybe codes?

Contributor: yes
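As an illustration of the convention described above, here is a small hedged sketch. It assumes the private _values/_ndarray_values attributes introduced in this PR, with the categorical case resolving to the codes per the exchange above.

```python
import pandas as pd

s = pd.Series(['a', 'b', 'a'], dtype='category')

s._values           # the "best possible" array: a Categorical
s._ndarray_values   # always an ndarray; for category dtype, the integer codes,
                    # e.g. array([0, 1, 0], dtype=int8)
```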



.. _ref-subclassing-pandas:

Subclassing pandas Data Structures
30 changes: 30 additions & 0 deletions pandas/core/arrays/base.py
@@ -1,4 +1,6 @@
"""An interface for extending pandas with custom arrays."""
import numpy as np

from pandas.errors import AbstractMethodError

_not_implemented_message = "{} does not implement {}."
@@ -138,6 +140,34 @@ def nbytes(self):
# ------------------------------------------------------------------------
# Additional Methods
# ------------------------------------------------------------------------
def astype(self, dtype, copy=True):
"""Cast to a NumPy array with 'dtype'.

The default implementation only allows casting to 'object' dtype.

Parameters
----------
dtype : str or dtype
Typecode or data-type to which the array is cast.
copy : bool, default True
Whether to copy the data, even if not necessary. If False,
a copy is made only if the old dtype does not match the
new dtype.

Returns
-------
array : ndarray
NumPy ndarray with 'dtype' for its dtype.
"""
np_dtype = np.dtype(dtype)

if np_dtype != 'object':
msg = ("{} can only be coerced to 'object' dtype, "
"not '{}'.").format(type(self).__name__, dtype)
raise ValueError(msg)

return np.array(self, dtype=np_dtype, copy=copy)

def isna(self):
# type: () -> np.ndarray
"""Boolean NumPy array indicating if each value is missing.
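As a usage sketch of the default astype above, here is a hypothetical standalone container that mirrors the same logic (it is not a real pandas class; the ToyArray name and its methods are invented for illustration):

```python
import numpy as np


class ToyArray(object):
    """Hypothetical container mirroring the default ExtensionArray.astype logic."""

    def __init__(self, data):
        self._data = list(data)

    def __len__(self):
        return len(self._data)

    def __getitem__(self, item):
        return self._data[item]

    def astype(self, dtype, copy=True):
        np_dtype = np.dtype(dtype)
        if np_dtype != 'object':
            msg = ("{} can only be coerced to 'object' dtype, "
                   "not '{}'.").format(type(self).__name__, dtype)
            raise ValueError(msg)
        # np.array uses the sequence protocol (__len__/__getitem__) here.
        return np.array(self, dtype=np_dtype, copy=copy)


arr = ToyArray([1, 2, 3])
arr.astype(object)       # array([1, 2, 3], dtype=object)
# arr.astype('int64')    # raises ValueError, as in the default implementation
```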
37 changes: 28 additions & 9 deletions pandas/core/base.py
@@ -7,12 +7,14 @@
import numpy as np

from pandas.core.dtypes.missing import isna
from pandas.core.dtypes.generic import ABCDataFrame, ABCSeries, ABCIndexClass
from pandas.core.dtypes.generic import (
ABCDataFrame, ABCSeries, ABCIndexClass, ABCDatetimeIndex)
from pandas.core.dtypes.common import (
is_object_dtype,
is_list_like,
is_scalar,
is_datetimelike,
is_categorical_dtype,
is_extension_type)

from pandas.util._validators import validate_bool_kwarg
@@ -710,7 +712,7 @@ def transpose(self, *args, **kwargs):
@property
def shape(self):
""" return a tuple of the shape of the underlying data """
return self._values.shape
return self._ndarray_values.shape

@property
def ndim(self):
@@ -738,22 +740,22 @@ def data(self):
@property
def itemsize(self):
""" return the size of the dtype of the item of the underlying data """
return self._values.itemsize
return self._ndarray_values.itemsize

@property
def nbytes(self):
""" return the number of bytes in the underlying data """
return self._values.nbytes
return self._ndarray_values.nbytes
Member: Should nbytes be called on self._values instead? (We require an nbytes implementation on the ExtensionArray, and NumPy arrays already have it.)

Contributor (author): I think this caused issues for CI, but re-running tests now with this change.
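For a rough sense of why the choice of array matters here, a hedged sketch (it assumes the _values/_ndarray_values attributes from this PR; for category dtype the two differ because the Categorical also stores its categories):

```python
import pandas as pd

s = pd.Series(['a', 'b', 'a'] * 1000, dtype='category')

s._ndarray_values.nbytes   # bytes of the int8 codes only (3000 here)
s._values.nbytes           # Categorical.nbytes: codes plus categories, slightly larger
```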


@property
def strides(self):
""" return the strides of the underlying data """
return self._values.strides
return self._ndarray_values.strides

@property
def size(self):
""" return the number of elements in the underlying data """
return self._values.size
return self._ndarray_values.size

@property
def flags(self):
Expand All @@ -768,8 +770,21 @@ def base(self):
return self.values.base

@property
def _values(self):
""" the internal implementation """
def _ndarray_values(self):
"""The data as an ndarray, possibly losing information.

The expectation is that this is cheap to compute.

- categorical -> codes

See '_values' for more.
"""
# type: () -> np.ndarray
from pandas.core.dtypes.common import is_categorical_dtype
Contributor: this does NOT belong here. you already have a sub-class EA for Categorical that can simply override this.

Member: This code is to return _ndarray_values for both Series and Index, and for Series there is in general no subclass that can override it. Of course we could say to put this logic in the Blocks, but that is then also not for Index, so I am not sure it is better.

This raises the question for me, though, of what this will return for external extension types. Since it is .values, I suppose this is a materialized array?

Contributor (author): I've moved it all the way to ExtensionArray, with the default being just returning np.array(self), same as before. It's not part of the interface though. I think this is what @jorisvandenbossche suggested here.


if is_categorical_dtype(self):
Contributor: why do we have this at all? e.g. np.array(ea) should just do the right thing, no? why are we adding something else? Note that I actually don't mind that we have an additional property that we use consistently; the question is more why __array__ does not just return _ndarray_values.

Member: Because __array__ returns the materialized array (so e.g. an array of strings if you have string categories), not the codes.

But I agree it points to something we should think about how to organize, as e.g. for periods there will also be a special case here in the future. So maybe we need a property on our own extension arrays that gives back this ndarray? (Which is not necessarily part of the external interface for extension arrays.)

return self._values.codes

return self.values

@property
Expand Down Expand Up @@ -819,8 +834,10 @@ def tolist(self):

if is_datetimelike(self):
Contributor: this should be overridden in EA, rather than specific dispatching via if/else here; IOW it should be part of the interface, or be defined as list(.values).

Member: I am not sure it is needed to add a tolist to the interface; I think in general it can rely on __iter__/__getitem__ (so list(self._values)). The only problem in that case is that we still need to distinguish between normal numeric types (where the above would return numpy scalars, not python scalars) and other types where this already gives the correct result.

return [com._maybe_box_datetimelike(x) for x in self._values]
elif is_categorical_dtype(self):
return self.values.tolist()
else:
return self._values.tolist()
return self._ndarray_values.tolist()

def __iter__(self):
"""
@@ -978,7 +995,9 @@ def value_counts(self, normalize=False, sort=True, ascending=False,
def unique(self):
values = self._values

# TODO: Make unique part of the ExtensionArray interface.
if hasattr(values, 'unique'):

result = values.unique()
else:
from pandas.core.algorithms import unique1d
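To make the dispatch in unique above concrete, a short illustrative walkthrough (it assumes a category-dtype Series; Categorical defines its own unique, while plain NumPy-backed values would fall through to unique1d):

```python
import pandas as pd
from pandas.core.algorithms import unique1d

s = pd.Series(['a', 'b', 'a'], dtype='category')
values = s._values                 # a Categorical

if hasattr(values, 'unique'):      # Categorical defines unique(), so this branch runs
    result = values.unique()       # Categorical(['a', 'b'])
else:                              # ndarray path for plain NumPy-backed data
    result = unique1d(values)
```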
8 changes: 5 additions & 3 deletions pandas/core/dtypes/concat.py
@@ -480,20 +480,22 @@ def _concat_datetimetz(to_concat, name=None):

def _concat_index_same_dtype(indexes, klass=None):
klass = klass if klass is not None else indexes[0].__class__
return klass(np.concatenate([x._values for x in indexes]))
return klass(np.concatenate([x._ndarray_values for x in indexes]))
Member: This one is only used for numeric indices, so _values or _ndarray_values should not matter. I was mainly thinking that for those cases where _ndarray_values actually differs from _values, like a categorical, the above code will not work anyway, as klass(codes) is not enough to reconstruct the categorical. So _values seems safer to me.
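A short sketch of the point above (illustrative only; it assumes CategoricalIndex._ndarray_values resolves to the codes under this PR):

```python
import pandas as pd

ci = pd.CategoricalIndex(['a', 'b', 'a'])

codes = ci._ndarray_values         # integer codes, e.g. array([0, 1, 0], dtype=int8)

# Rebuilding from the codes alone loses the original categories:
pd.CategoricalIndex(codes)         # categories become [0, 1], not ['a', 'b']

# Rebuilding from _values (the Categorical itself) round-trips correctly:
pd.CategoricalIndex(ci._values)    # CategoricalIndex(['a', 'b', 'a'], categories=['a', 'b'], ...)
```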



def _concat_index_asobject(to_concat, name=None):
"""
concat all inputs as object. DatetimeIndex, TimedeltaIndex and
PeriodIndex are converted to object dtype before concatenation
"""
from pandas import Index
from pandas.core.arrays import ExtensionArray

klasses = ABCDatetimeIndex, ABCTimedeltaIndex, ABCPeriodIndex
klasses = (ABCDatetimeIndex, ABCTimedeltaIndex, ABCPeriodIndex,
ExtensionArray)
to_concat = [x.astype(object) if isinstance(x, klasses) else x
for x in to_concat]

from pandas import Index
self = to_concat[0]
attribs = self._get_attributes_dict()
attribs['name'] = name