Updates

1. Reversed order of take keywords 2. Added to extensions API 3. Removed default implementation
TomAugspurger · Apr 25, 2018 · 69e7fe7 · 69e7fe7
1 parent 05d8844
commit 69e7fe7
Show file tree

Hide file tree

Showing 12 changed files with 147 additions and 104 deletions.
diff --git a/pandas/api/extensions/__init__.py b/pandas/api/extensions/__init__.py
@@ -2,5 +2,6 @@
 from pandas.core.accessor import (register_dataframe_accessor,  # noqa
                                   register_index_accessor,
                                   register_series_accessor)
+from pandas.core.algorithms import take  # noqa
 from pandas.core.arrays.base import ExtensionArray  # noqa
 from pandas.core.dtypes.dtypes import ExtensionDtype  # noqa
diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py
@@ -1448,24 +1448,67 @@ def func(arr, indexer, out, fill_value=np.nan):
     return func
 
 
-def take(arr, indexer, fill_value=None, allow_fill=None):
+def take(arr, indexer, allow_fill=False, fill_value=None):
+    """Take elements from an array.
+
+    Parameters
+    ----------
+    arr : ndarray or ExtensionArray
+    indexer : sequence of integers
+        Indices to be taken. See `allow_fill` for how negative indices
+        are handled.
+    allow_fill : bool, default False
+        How to handle negative values in `indexer`.
+
+        For False values (the default), negative values in `indexer`
+        indicate positions counted from the right.
+
+        For True values, indices where `indexer` is ``-1`` indicate
+        missing values. These values are set to `fill_value`. Any other
+        negative value raises a ``ValueError``.
+    fill_value : any, optional
+        Fill value to use for NA-indices when `allow_fill` is True.
+        This may be ``None``, in which case the default NA value for
+        the type, ``self.dtype.na_value``, is used.
+
+    Returns
+    -------
+    ndarray or ExtensionArray
+        Same type as the input.
+
+    Raises
+    ------
+    IndexError
+        When the indexer is out of bounds for the array.
+    ValueError
+        When the indexer contains negative values other than ``-1``
+        and `allow_fill` is True.
+
+    See Also
+    --------
+    numpy.take
+    """
     indexer = np.asarray(indexer)
 
-    if allow_fill is None:
-        # NumPy style
-        result = arr.take(indexer)
-    else:
+    if allow_fill:
+        # Pandas style, -1 means NA
         # bounds checking
         if (indexer < -1).any():
             raise ValueError("Invalid value in 'indexer'. All values "
                              "must be non-negative or -1. When "
                              "'fill_value' is specified.")
+        if (indexer >= len(arr)).any():
+            # TODO: reuse with logic elsewhere.
+            raise IndexError("out-of-bounds indices in 'indexer'")
 
         # # take on empty array not handled as desired by numpy
         # # in case of -1 (all missing take)
         # if not len(arr) and mask.all():
         #     return arr._from_sequence([fill_value] * len(indexer))
         result = take_1d(arr, indexer, fill_value=fill_value)
+    else:
+        # NumPy style
+        result = arr.take(indexer)
     return result
 
 

diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py
@@ -53,7 +53,6 @@ class ExtensionArray(object):
     * unique
     * factorize / _values_for_factorize
     * argsort / _values_for_argsort
-    * take / _values_for_take
 
     This class does not inherit from 'abc.ABCMeta' for performance reasons.
     Methods and properties required by the interface raise
@@ -277,22 +276,23 @@ def isna(self):
         """
         raise AbstractMethodError(self)
 
-    def _values_for_argsort(self):
-        # type: () -> ndarray
-        """Return values for sorting.
+    def _values_for_factorize(self):
+        # type: () -> Tuple[ndarray, Any]
+        """Return an array and missing value suitable for factorization.
 
         Returns
         -------
-        ndarray
-            The transformed values should maintain the ordering between values
-            within the array.
-
-        See Also
-        --------
-        ExtensionArray.argsort
+        values : ndarray
+            An array suitable for factorization. This should maintain order
+            and be a supported dtype (Float64, Int64, UInt64, String, Object).
+            By default, the extension array is cast to object dtype.
+        na_value : object
+            The value in `values` to consider missing. This will be treated
+            as NA in the factorization routines, so it will be coded as
+            `na_sentinel` and not included in `uniques`. By default,
+            ``np.nan`` is used.
         """
-        # Note: this is used in `ExtensionArray.argsort`.
-        return np.array(self)
+        return self.astype(object), np.nan
 
     def argsort(self, ascending=True, kind='quicksort', *args, **kwargs):
         """
@@ -393,24 +393,6 @@ def unique(self):
         uniques = unique(self.astype(object))
         return self._from_sequence(uniques)
 
-    def _values_for_factorize(self):
-        # type: () -> Tuple[ndarray, Any]
-        """Return an array and missing value suitable for factorization.
-
-        Returns
-        -------
-        values : ndarray
-            An array suitable for factoraization. This should maintain order
-            and be a supported dtype (Float64, Int64, UInt64, String, Object).
-            By default, the extension array is cast to object dtype.
-        na_value : object
-            The value in `values` to consider missing. This will be treated
-            as NA in the factorization routines, so it will be coded as
-            `na_sentinal` and not included in `uniques`. By default,
-            ``np.nan`` is used.
-        """
-        return self.astype(object), np.nan
-
     def factorize(self, na_sentinel=-1):
         # type: (int) -> Tuple[ndarray, ExtensionArray]
         """Encode the extension array as an enumerated type.
@@ -463,40 +445,45 @@ def factorize(self, na_sentinel=-1):
     # ------------------------------------------------------------------------
     # Indexing methods
     # ------------------------------------------------------------------------
-    def _values_for_take(self):
-        """Values to use for `take`.
-
-        Coerces to object dtype by default.
+    def _values_for_argsort(self):
+        # type: () -> ndarray
+        """Return values for sorting.
 
         Returns
         -------
-        array-like
-            Must satisify NumPy's `take` semantics.
+        ndarray
+            The transformed values should maintain the ordering between values
+            within the array.
+
+        See Also
+        --------
+        ExtensionArray.argsort
         """
-        return self.astype(object)
+        # Note: this is used in `ExtensionArray.argsort`.
+        return np.array(self)
 
-    def take(self, indexer, fill_value=None, allow_fill=None):
-        # type: (Sequence[int], Optional[Any], bool) -> ExtensionArray
+    def take(self, indexer, allow_fill=False, fill_value=None):
+        # type: (Sequence[int], bool, Optional[Any]) -> ExtensionArray
         """Take elements from an array.
 
         Parameters
         ----------
         indexer : sequence of integers
             Indices to be taken. See Notes for how negative indicies
             are handled.
+        allow_fill : bool, default False
+            How to handle negative values in `indexer`.
+
+            For False values (the default), negative values in `indexer`
+            indicate positions counted from the right.
+
+            For True values, indices where `indexer` is ``-1`` indicate
+            missing values. These values are set to `fill_value`. Any
+            other negative value should raise a ``ValueError``.
         fill_value : any, optional
             Fill value to use for NA-indicies when `allow_fill` is True.
             This may be ``None``, in which case the default NA value for
             the type, ``self.dtype.na_value``, is used.
-        allow_fill : bool, optional
-            How to handle negative values in `indexer`.
-
-            For False values (the default), NumPy's behavior is used. Negative
-            values in `indexer` mean slices from the right.
-
-            For True values, Pandas behavior is used. Indicies where `indexer`
-            is ``-1`` are set to `fill_value`. Any other negative value should
-            raise a ``ValueError``.
 
         Returns
         -------
@@ -514,21 +501,34 @@ def take(self, indexer, fill_value=None, allow_fill=None):
         -----
         ExtensionArray.take is called by ``Series.__getitem__``, ``.loc``,
         ``iloc``, when the indexer is a sequence of values. Additionally,
-        it's called by :meth:`Series.reindex` with a `fill_value`.
+        it's called by :meth:`Series.reindex`, or any other method
+        that causes realignment, with a `fill_value`.
 
         See Also
         --------
         numpy.take
-        """
-        from pandas.core.algorithms import take
+        pandas.api.extensions.take
+
+        Examples
+        --------
+        Here's an example implementation, which relies on casting the
+        extension array to object dtype. This uses the helper method
+        :func:`pandas.api.extensions.take`.
 
-        data = self._values_for_take()
-        if allow_fill and fill_value is None:
-            fill_value = self.dtype.na_value
+        .. code-block:: python
 
-        result = take(data, indexer, fill_value=fill_value,
-                      allow_fill=allow_fill)
-        return self._from_sequence(result)
+           def take(self, indexer, allow_fill=False, fill_value=None):
+               from pandas.api.extensions import take
+
+               data = self.astype(object)
+               if allow_fill and fill_value is None:
+                   fill_value = self.dtype.na_value
+
+               result = take(data, indexer, fill_value=fill_value,
+                             allow_fill=allow_fill)
+               return self._from_sequence(result)
+        """
+        raise AbstractMethodError(self)
 
     def copy(self, deep=False):
         # type: (bool) -> ExtensionArray

diff --git a/pandas/core/dtypes/base.py b/pandas/core/dtypes/base.py
@@ -18,7 +18,7 @@ class _DtypeOpsMixin(object):
 
     # na_value is the default NA value to use for this type. This is used in
     # e.g. ExtensionArray.take.
-    na_value = np.nan  # TODO: change to _na_value
+    na_value = np.nan
 
     def __eq__(self, other):
         """Check whether 'other' is equal to self.
@@ -105,6 +105,9 @@ class ExtensionDtype(_DtypeOpsMixin):
     * name
     * construct_from_string
 
+    The `na_value` class attribute can be used to set the default NA value
+    for this type. :attr:`numpy.nan` is used by default.
+
     This class does not inherit from 'abc.ABCMeta' for performance reasons.
     Methods and properties required by the interface raise
     ``pandas.errors.AbstractMethodError`` and no ``register`` method is

diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py
@@ -256,11 +256,7 @@ def changeit():
 
 def maybe_promote(dtype, fill_value=np.nan):
     # if we passed an array here, determine the fill value by dtype
-    if is_extension_array_dtype(dtype):
-        # XXX: verify this change
-        fill_value = dtype.na_value
-
-    elif isinstance(fill_value, np.ndarray):
+    if isinstance(fill_value, np.ndarray):
         if issubclass(fill_value.dtype.type, (np.datetime64, np.timedelta64)):
             fill_value = iNaT
         else:
@@ -297,6 +293,8 @@ def maybe_promote(dtype, fill_value=np.nan):
     elif is_datetimetz(dtype):
         if isna(fill_value):
             fill_value = iNaT
+    elif is_extension_array_dtype(dtype) and isna(fill_value):
+        fill_value = dtype.na_value
     elif is_float(fill_value):
         if issubclass(dtype.type, np.bool_):
             dtype = np.object_

diff --git a/pandas/core/internals.py b/pandas/core/internals.py
@@ -5445,7 +5445,7 @@ def is_uniform_join_units(join_units):
 
 def is_uniform_reindex(join_units):
     return (
-        # TODO: should this be ju.block.can_hold_na?
+        # TODO: should this be ju.block._can_hold_na?
         all(ju.block and ju.block.is_extension for ju in join_units) and
         len(set(ju.block.dtype.name for ju in join_units)) == 1
     )

diff --git a/pandas/tests/extension/base/getitem.py b/pandas/tests/extension/base/getitem.py
@@ -149,8 +149,11 @@ def test_take_negative(self, data):
 
     def test_take_non_na_fill_value(self, data_missing):
         fill_value = data_missing[1]  # valid
-        result = data_missing.take([-1, 1], fill_value=fill_value)
-        expected = data_missing.take([1, 1])
+        na = data_missing[0]
+
+        array = data_missing._from_sequence([na, fill_value, na])
+        result = array.take([-1, 1], fill_value=fill_value, allow_fill=True)
+        expected = array.take([1, 1])
         self.assert_extension_array_equal(result, expected)
 
     def test_take_pandas_style_negative_raises(self, data, na_value):

diff --git a/pandas/tests/extension/base/reshaping.py b/pandas/tests/extension/base/reshaping.py
@@ -124,11 +124,11 @@ def test_merge(self, data, na_value):
                             'key': [0, 1, 2]})
         df2 = pd.DataFrame({'int2': [1, 2, 3, 4], 'key': [0, 0, 1, 3]})
 
-        # res = pd.merge(df1, df2)
-        # exp = pd.DataFrame(
-        #     {'int1': [1, 1, 2], 'int2': [1, 2, 3], 'key': [0, 0, 1],
-        #      'ext': data._from_sequence([data[0], data[0], data[1]])})
-        # self.assert_frame_equal(res, exp[['ext', 'int1', 'key', 'int2']])
+        res = pd.merge(df1, df2)
+        exp = pd.DataFrame(
+            {'int1': [1, 1, 2], 'int2': [1, 2, 3], 'key': [0, 0, 1],
+             'ext': data._from_sequence([data[0], data[0], data[1]])})
+        self.assert_frame_equal(res, exp[['ext', 'int1', 'key', 'int2']])
 
         res = pd.merge(df1, df2, how='outer')
         exp = pd.DataFrame(

diff --git a/pandas/tests/extension/decimal/array.py b/pandas/tests/extension/decimal/array.py
@@ -53,6 +53,17 @@ def __getitem__(self, item):
         else:
             return type(self)(self._data[item])
 
+    def take(self, indexer, allow_fill=False, fill_value=None):
+        from pandas.api.extensions import take
+
+        data = self._data
+        if allow_fill and fill_value is None:
+            fill_value = self.dtype.na_value
+
+        result = take(data, indexer, fill_value=fill_value,
+                      allow_fill=allow_fill)
+        return self._from_sequence(result)
+
     def copy(self, deep=False):
         if deep:
             return type(self)(self._data.copy())
@@ -81,9 +92,6 @@ def nbytes(self):
     def isna(self):
         return np.array([x.is_nan() for x in self._data])
 
-    def _values_for_take(self):
-        return self.data
-
     @property
     def _na_value(self):
         return decimal.Decimal('NaN')

diff --git a/pandas/tests/extension/decimal/test_decimal.py b/pandas/tests/extension/decimal/test_decimal.py
@@ -108,26 +108,7 @@ class TestReshaping(BaseDecimal, base.BaseReshapingTests):
 
 
 class TestGetitem(BaseDecimal, base.BaseGetitemTests):
-
-    def test_take_basic(self):
-        ea = DecimalArray([decimal.Decimal('1'),
-                           decimal.Decimal('2'),
-                           decimal.Decimal('3')])
-        result = ea.take([1, 2, -1])
-        expected = DecimalArray([decimal.Decimal('2'),
-                                 decimal.Decimal('3'),
-                                 decimal.Decimal('3')])
-        self.assert_extension_array_equal(result, expected)
-
-        result = ea.take([1, 2, -1], fill_value=ea.dtype.na_value,
-                         allow_fill=True)
-        expected = DecimalArray([decimal.Decimal('2'),
-                                 decimal.Decimal('3'),
-                                 decimal.Decimal('NaN')])
-        self.assert_extension_array_equal(result, expected)
-
-        result = pd.Series(ea).reindex([1, 2, -1]).values
-        self.assert_extension_array_equal(result, expected)
+    pass
 
 
 class TestMissing(BaseDecimal, base.BaseMissingTests):