Skip to content

Commit

Permalink
Updates
Browse files Browse the repository at this point in the history
1. Reversed order of take keywords
2. Added to extensions API
3. Removed default implementation
  • Loading branch information
TomAugspurger committed Apr 25, 2018
1 parent 05d8844 commit 69e7fe7
Show file tree
Hide file tree
Showing 12 changed files with 147 additions and 104 deletions.
1 change: 1 addition & 0 deletions pandas/api/extensions/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,5 +2,6 @@
from pandas.core.accessor import (register_dataframe_accessor, # noqa
register_index_accessor,
register_series_accessor)
from pandas.core.algorithms import take # noqa
from pandas.core.arrays.base import ExtensionArray # noqa
from pandas.core.dtypes.dtypes import ExtensionDtype # noqa
53 changes: 48 additions & 5 deletions pandas/core/algorithms.py
Original file line number Diff line number Diff line change
Expand Up @@ -1448,24 +1448,67 @@ def func(arr, indexer, out, fill_value=np.nan):
return func


def take(arr, indexer, fill_value=None, allow_fill=None):
def take(arr, indexer, allow_fill=False, fill_value=None):
"""Take elements from an array.
Parameters
----------
arr : ndarray or ExtensionArray
indexer : sequence of integers
Indices to be taken. See Notes for how negative indices
are handled.
allow_fill : bool, default False
How to handle negative values in `indexer`.
For False values (the default), negative values in `indexer`
indicate positions counted from the right (NumPy-style).
For True values, indices where `indexer` is ``-1`` indicate
missing values. These values are set to `fill_value`. Any
other negative value should raise a ``ValueError``.
fill_value : any, optional
Fill value to use for NA-indices when `allow_fill` is True.
This may be ``None``, in which case the default NA value for
the type, ``self.dtype.na_value``, is used.
Returns
-------
ndarray or ExtensionArray
Same type as the input.
Raises
------
IndexError
When the indexer is out of bounds for the array.
ValueError
When the indexer contains negative values other than ``-1``
and `allow_fill` is True.
See Also
--------
numpy.take
"""
indexer = np.asarray(indexer)

if allow_fill is None:
# NumPy style
result = arr.take(indexer)
else:
if allow_fill:
# Pandas style, -1 means NA
# bounds checking
if (indexer < -1).any():
raise ValueError("Invalid value in 'indexer'. All values "
"must be non-negative or -1. When "
"'fill_value' is specified.")
if (indexer > len(arr)).any():
# TODO: reuse with logic elsewhere.
raise IndexError

# # take on empty array not handled as desired by numpy
# # in case of -1 (all missing take)
# if not len(arr) and mask.all():
# return arr._from_sequence([fill_value] * len(indexer))
result = take_1d(arr, indexer, fill_value=fill_value)
else:
# NumPy style
result = arr.take(indexer)
return result


Expand Down
116 changes: 58 additions & 58 deletions pandas/core/arrays/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -53,7 +53,6 @@ class ExtensionArray(object):
* unique
* factorize / _values_for_factorize
* argsort / _values_for_argsort
* take / _values_for_take
This class does not inherit from 'abc.ABCMeta' for performance reasons.
Methods and properties required by the interface raise
Expand Down Expand Up @@ -277,22 +276,23 @@ def isna(self):
"""
raise AbstractMethodError(self)

def _values_for_argsort(self):
# type: () -> ndarray
"""Return values for sorting.
def _values_for_factorize(self):
# type: () -> Tuple[ndarray, Any]
"""Return an array and missing value suitable for factorization.
Returns
-------
ndarray
The transformed values should maintain the ordering between values
within the array.
See Also
--------
ExtensionArray.argsort
values : ndarray
An array suitable for factorization. This should maintain order
and be a supported dtype (Float64, Int64, UInt64, String, Object).
By default, the extension array is cast to object dtype.
na_value : object
The value in `values` to consider missing. This will be treated
as NA in the factorization routines, so it will be coded as
`na_sentinel` and not included in `uniques`. By default,
``np.nan`` is used.
"""
# Note: this is used in `ExtensionArray.argsort`.
return np.array(self)
return self.astype(object), np.nan

def argsort(self, ascending=True, kind='quicksort', *args, **kwargs):
"""
Expand Down Expand Up @@ -393,24 +393,6 @@ def unique(self):
uniques = unique(self.astype(object))
return self._from_sequence(uniques)

def _values_for_factorize(self):
# type: () -> Tuple[ndarray, Any]
"""Return an array and missing value suitable for factorization.
Returns
-------
values : ndarray
An array suitable for factorization. This should maintain order
and be a supported dtype (Float64, Int64, UInt64, String, Object).
By default, the extension array is cast to object dtype.
na_value : object
The value in `values` to consider missing. This will be treated
as NA in the factorization routines, so it will be coded as
`na_sentinel` and not included in `uniques`. By default,
``np.nan`` is used.
"""
return self.astype(object), np.nan

def factorize(self, na_sentinel=-1):
# type: (int) -> Tuple[ndarray, ExtensionArray]
"""Encode the extension array as an enumerated type.
Expand Down Expand Up @@ -463,40 +445,45 @@ def factorize(self, na_sentinel=-1):
# ------------------------------------------------------------------------
# Indexing methods
# ------------------------------------------------------------------------
def _values_for_take(self):
"""Values to use for `take`.
Coerces to object dtype by default.
def _values_for_argsort(self):
# type: () -> ndarray
"""Return values for sorting.
Returns
-------
array-like
Must satisfy NumPy's `take` semantics.
ndarray
The transformed values should maintain the ordering between values
within the array.
See Also
--------
ExtensionArray.argsort
"""
return self.astype(object)
# Note: this is used in `ExtensionArray.argsort`.
return np.array(self)

def take(self, indexer, fill_value=None, allow_fill=None):
# type: (Sequence[int], Optional[Any], bool) -> ExtensionArray
def take(self, indexer, allow_fill=False, fill_value=None):
# type: (Sequence[int], bool, Optional[Any]) -> ExtensionArray
"""Take elements from an array.
Parameters
----------
indexer : sequence of integers
Indices to be taken. See Notes for how negative indices
are handled.
allow_fill : bool, default False
How to handle negative values in `indexer`.
For False values (the default), negative values in `indexer`
indicate positions counted from the right (NumPy-style).
For True values, indices where `indexer` is ``-1`` indicate
missing values. These values are set to `fill_value`. Any
other negative value should raise a ``ValueError``.
fill_value : any, optional
Fill value to use for NA-indices when `allow_fill` is True.
This may be ``None``, in which case the default NA value for
the type, ``self.dtype.na_value``, is used.
allow_fill : bool, optional
How to handle negative values in `indexer`.
For False values (the default), NumPy's behavior is used. Negative
values in `indexer` mean positions counted from the right.
For True values, pandas behavior is used. Indices where `indexer`
is ``-1`` are set to `fill_value`. Any other negative value should
raise a ``ValueError``.
Returns
-------
Expand All @@ -514,21 +501,34 @@ def take(self, indexer, fill_value=None, allow_fill=None):
-----
ExtensionArray.take is called by ``Series.__getitem__``, ``.loc``,
``iloc``, when the indexer is a sequence of values. Additionally,
it's called by :meth:`Series.reindex` with a `fill_value`.
it's called by :meth:`Series.reindex`, or any other method
that causes realignment, with a `fill_value`.
See Also
--------
numpy.take
"""
from pandas.core.algorithms import take
pandas.api.extensions.take
Examples
--------
Here's an example implementation, which relies on casting the
extension array to object dtype. This uses the helper method
:func:`pandas.api.extensions.take`.
data = self._values_for_take()
if allow_fill and fill_value is None:
fill_value = self.dtype.na_value
.. code-block:: python
result = take(data, indexer, fill_value=fill_value,
allow_fill=allow_fill)
return self._from_sequence(result)
def take(self, indexer, allow_fill=False, fill_value=None):
from pandas.core.algorithms import take
data = self.astype(object)
if allow_fill and fill_value is None:
fill_value = self.dtype.na_value
result = take(data, indexer, fill_value=fill_value,
allow_fill=allow_fill)
return self._from_sequence(result)
"""
raise AbstractMethodError(self)

def copy(self, deep=False):
# type: (bool) -> ExtensionArray
Expand Down
5 changes: 4 additions & 1 deletion pandas/core/dtypes/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@ class _DtypeOpsMixin(object):

# na_value is the default NA value to use for this type. This is used in
# e.g. ExtensionArray.take.
na_value = np.nan # TODO: change to _na_value
na_value = np.nan

def __eq__(self, other):
"""Check whether 'other' is equal to self.
Expand Down Expand Up @@ -105,6 +105,9 @@ class ExtensionDtype(_DtypeOpsMixin):
* name
* construct_from_string
The `na_value` class attribute can be used to set the default NA value
for this type. :attr:`numpy.nan` is used by default.
This class does not inherit from 'abc.ABCMeta' for performance reasons.
Methods and properties required by the interface raise
``pandas.errors.AbstractMethodError`` and no ``register`` method is
Expand Down
8 changes: 3 additions & 5 deletions pandas/core/dtypes/cast.py
Original file line number Diff line number Diff line change
Expand Up @@ -256,11 +256,7 @@ def changeit():

def maybe_promote(dtype, fill_value=np.nan):
# if we passed an array here, determine the fill value by dtype
if is_extension_array_dtype(dtype):
# XXX: verify this change
fill_value = dtype.na_value

elif isinstance(fill_value, np.ndarray):
if isinstance(fill_value, np.ndarray):
if issubclass(fill_value.dtype.type, (np.datetime64, np.timedelta64)):
fill_value = iNaT
else:
Expand Down Expand Up @@ -297,6 +293,8 @@ def maybe_promote(dtype, fill_value=np.nan):
elif is_datetimetz(dtype):
if isna(fill_value):
fill_value = iNaT
elif is_extension_array_dtype(dtype) and isna(fill_value):
fill_value = dtype.na_value
elif is_float(fill_value):
if issubclass(dtype.type, np.bool_):
dtype = np.object_
Expand Down
2 changes: 1 addition & 1 deletion pandas/core/internals.py
Original file line number Diff line number Diff line change
Expand Up @@ -5445,7 +5445,7 @@ def is_uniform_join_units(join_units):

def is_uniform_reindex(join_units):
return (
# TODO: should this be ju.block.can_hold_na?
# TODO: should this be ju.block._can_hold_na?
all(ju.block and ju.block.is_extension for ju in join_units) and
len(set(ju.block.dtype.name for ju in join_units)) == 1
)
Expand Down
7 changes: 5 additions & 2 deletions pandas/tests/extension/base/getitem.py
Original file line number Diff line number Diff line change
Expand Up @@ -149,8 +149,11 @@ def test_take_negative(self, data):

def test_take_non_na_fill_value(self, data_missing):
fill_value = data_missing[1] # valid
result = data_missing.take([-1, 1], fill_value=fill_value)
expected = data_missing.take([1, 1])
na = data_missing[0]

array = data_missing._from_sequence([na, fill_value, na])
result = array.take([-1, 1], fill_value=fill_value, allow_fill=True)
expected = array.take([1, 1])
self.assert_extension_array_equal(result, expected)

def test_take_pandas_style_negative_raises(self, data, na_value):
Expand Down
10 changes: 5 additions & 5 deletions pandas/tests/extension/base/reshaping.py
Original file line number Diff line number Diff line change
Expand Up @@ -124,11 +124,11 @@ def test_merge(self, data, na_value):
'key': [0, 1, 2]})
df2 = pd.DataFrame({'int2': [1, 2, 3, 4], 'key': [0, 0, 1, 3]})

# res = pd.merge(df1, df2)
# exp = pd.DataFrame(
# {'int1': [1, 1, 2], 'int2': [1, 2, 3], 'key': [0, 0, 1],
# 'ext': data._from_sequence([data[0], data[0], data[1]])})
# self.assert_frame_equal(res, exp[['ext', 'int1', 'key', 'int2']])
res = pd.merge(df1, df2)
exp = pd.DataFrame(
{'int1': [1, 1, 2], 'int2': [1, 2, 3], 'key': [0, 0, 1],
'ext': data._from_sequence([data[0], data[0], data[1]])})
self.assert_frame_equal(res, exp[['ext', 'int1', 'key', 'int2']])

res = pd.merge(df1, df2, how='outer')
exp = pd.DataFrame(
Expand Down
14 changes: 11 additions & 3 deletions pandas/tests/extension/decimal/array.py
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,17 @@ def __getitem__(self, item):
else:
return type(self)(self._data[item])

def take(self, indexer, allow_fill=False, fill_value=None):
from pandas.api.extensions import take

data = self._data
if allow_fill and fill_value is None:
fill_value = self.dtype.na_value

result = take(data, indexer, fill_value=fill_value,
allow_fill=allow_fill)
return self._from_sequence(result)

def copy(self, deep=False):
if deep:
return type(self)(self._data.copy())
Expand Down Expand Up @@ -81,9 +92,6 @@ def nbytes(self):
def isna(self):
return np.array([x.is_nan() for x in self._data])

def _values_for_take(self):
return self.data

@property
def _na_value(self):
return decimal.Decimal('NaN')
Expand Down
21 changes: 1 addition & 20 deletions pandas/tests/extension/decimal/test_decimal.py
Original file line number Diff line number Diff line change
Expand Up @@ -108,26 +108,7 @@ class TestReshaping(BaseDecimal, base.BaseReshapingTests):


class TestGetitem(BaseDecimal, base.BaseGetitemTests):

def test_take_basic(self):
ea = DecimalArray([decimal.Decimal('1'),
decimal.Decimal('2'),
decimal.Decimal('3')])
result = ea.take([1, 2, -1])
expected = DecimalArray([decimal.Decimal('2'),
decimal.Decimal('3'),
decimal.Decimal('3')])
self.assert_extension_array_equal(result, expected)

result = ea.take([1, 2, -1], fill_value=ea.dtype.na_value,
allow_fill=True)
expected = DecimalArray([decimal.Decimal('2'),
decimal.Decimal('3'),
decimal.Decimal('NaN')])
self.assert_extension_array_equal(result, expected)

result = pd.Series(ea).reindex([1, 2, -1]).values
self.assert_extension_array_equal(result, expected)
pass


class TestMissing(BaseDecimal, base.BaseMissingTests):
Expand Down
Loading

0 comments on commit 69e7fe7

Please sign in to comment.