diff --git a/pandas/api/extensions/__init__.py b/pandas/api/extensions/__init__.py index cc09204d992d8..3e6e192a3502c 100644 --- a/pandas/api/extensions/__init__.py +++ b/pandas/api/extensions/__init__.py @@ -2,5 +2,6 @@ from pandas.core.accessor import (register_dataframe_accessor, # noqa register_index_accessor, register_series_accessor) +from pandas.core.algorithms import take # noqa from pandas.core.arrays.base import ExtensionArray # noqa from pandas.core.dtypes.dtypes import ExtensionDtype # noqa diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index cbc412d74d51d..a03d892432b51 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -1448,6 +1448,94 @@ def func(arr, indexer, out, fill_value=np.nan): return func +def take(arr, indices, allow_fill=False, fill_value=None): + """ + Take elements from an array. + + .. versionadded:: 0.23.0 + + Parameters + ---------- + arr : sequence + Non array-likes (sequences without a dtype) are coerced + to an ndarray. + indices : sequence of integers + Indices to be taken. + allow_fill : bool, default False + How to handle negative values in `indices`. + + * False: negative values in `indices` indicate positional indices + from the right (the default). This is similar to :func:`numpy.take`. + + * True: ``-1`` in `indices` indicates + missing values. These values are set to `fill_value`. Any other + negative values raise a ``ValueError``. + + fill_value : any, optional + Fill value to use for NA-indices when `allow_fill` is True. + This may be ``None``, in which case the default NA value for + the type (``self.dtype.na_value``) is used. + + Returns + ------- + ndarray or ExtensionArray + Same type as the input. + + Raises + ------ + IndexError + When `indices` is out of bounds for the array. + ValueError + When the indexer contains negative values other than ``-1`` + and `allow_fill` is True. 
+ + Notes + ----- + When `allow_fill` is False, `indices` may be whatever dimensionality + is accepted by NumPy for `arr`. + + When `allow_fill` is True, `indices` should be 1-D. + + See Also + -------- + numpy.take + + Examples + -------- + >>> from pandas.api.extensions import take + + With the default ``allow_fill=False``, negative numbers indicate + positional indices from the right. + + >>> take(np.array([10, 20, 30]), [0, 0, -1]) + array([10, 10, 30]) + + Setting ``allow_fill=True`` will place `fill_value` in those positions. + + >>> take(np.array([10, 20, 30]), [0, 0, -1], allow_fill=True) + array([10., 10., nan]) + + >>> take(np.array([10, 20, 30]), [0, 0, -1], allow_fill=True, + ... fill_value=-10) + array([ 10, 10, -10]) + """ + from pandas.core.indexing import validate_indices + + if not is_array_like(arr): + arr = np.asarray(arr) + + indices = np.asarray(indices, dtype=np.intp) + + if allow_fill: + # Pandas style, -1 means NA + validate_indices(indices, len(arr)) + result = take_1d(arr, indices, allow_fill=True, fill_value=fill_value) + else: + # NumPy style + result = arr.take(indices) + return result + + def take_nd(arr, indexer, axis=0, out=None, fill_value=np.nan, mask_info=None, allow_fill=True): """ @@ -1462,7 +1550,7 @@ def take_nd(arr, indexer, axis=0, out=None, fill_value=np.nan, mask_info=None, Input array. 
indexer : ndarray 1-D array of indices to take, subarrays corresponding to -1 value - indicies are filed with fill_value + indices are filled with fill_value axis : int, default 0 Axis to take from out : ndarray or None, default None diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py index f1a81b5eefddd..1922801c30719 100644 --- a/pandas/core/arrays/base.py +++ b/pandas/core/arrays/base.py @@ -462,22 +462,36 @@ def factorize(self, na_sentinel=-1): # ------------------------------------------------------------------------ # Indexing methods # ------------------------------------------------------------------------ - def take(self, indexer, allow_fill=True, fill_value=None): + + def take(self, indices, allow_fill=False, fill_value=None): # type: (Sequence[int], bool, Optional[Any]) -> ExtensionArray """Take elements from an array. Parameters ---------- - indexer : sequence of integers - indices to be taken. -1 is used to indicate values - that are missing. - allow_fill : bool, default True - If False, indexer is assumed to contain no -1 values so no filling - will be done. This short-circuits computation of a mask. Result is - undefined if allow_fill == False and -1 is present in indexer. - fill_value : any, default None - Fill value to replace -1 values with. If applicable, this should - use the sentinel missing value for this type. + indices : sequence of integers + Indices to be taken. + allow_fill : bool, default False + How to handle negative values in `indices`. + + * False: negative values in `indices` indicate positional indices + from the right (the default). This is similar to + :func:`numpy.take`. + + * True: ``-1`` in `indices` indicates + missing values. These values are set to `fill_value`. Any other + negative values raise a ``ValueError``. + + fill_value : any, optional + Fill value to use for NA-indices when `allow_fill` is True. 
+ This may be ``None``, in which case the default NA value for + the type, ``self.dtype.na_value``, is used. + + For many ExtensionArrays, there will be two representations of + `fill_value`: a user-facing "boxed" scalar, and a low-level + physical NA value. `fill_value` should be the user-facing version, + and the implementation should handle translating that to the + physical version for processing the take if necessary. Returns ------- @@ -486,44 +500,56 @@ def take(self, indexer, allow_fill=True, fill_value=None): Raises ------ IndexError - When the indexer is out of bounds for the array. + When the indices are out of bounds for the array. + ValueError + When `indices` contains negative values other than ``-1`` + and `allow_fill` is True. Notes ----- - This should follow pandas' semantics where -1 indicates missing values. - Positions where indexer is ``-1`` should be filled with the missing - value for this type. - This gives rise to the special case of a take on an empty - ExtensionArray that does not raises an IndexError straight away - when the `indexer` is all ``-1``. + ExtensionArray.take is called by ``Series.__getitem__``, ``.loc``, + ``iloc``, when `indices` is a sequence of values. Additionally, + it's called by :meth:`Series.reindex`, or any other method + that causes realignment, with a `fill_value`. - This is called by ``Series.__getitem__``, ``.loc``, ``iloc``, when the - indexer is a sequence of values. + See Also + -------- + numpy.take + pandas.api.extensions.take Examples -------- - Suppose the extension array is backed by a NumPy array stored as - ``self.data``. Then ``take`` may be written as + Here's an example implementation, which relies on casting the + extension array to object dtype. This uses the helper method + :func:`pandas.api.extensions.take`. .. 
code-block:: python - def take(self, indexer, allow_fill=True, fill_value=None): - indexer = np.asarray(indexer) - mask = indexer == -1 + def take(self, indices, allow_fill=False, fill_value=None): + from pandas.core.algorithms import take - # take on empty array not handled as desired by numpy - # in case of -1 (all missing take) - if not len(self) and mask.all(): - return type(self)([np.nan] * len(indexer)) + # If the ExtensionArray is backed by an ndarray, then + # just pass that here instead of coercing to object. + data = self.astype(object) - result = self.data.take(indexer) - result[mask] = np.nan # NA for this type - return type(self)(result) + if allow_fill and fill_value is None: + fill_value = self.dtype.na_value - See Also - -------- - numpy.take + # fill value should always be translated from the scalar + # type for the array, to the physical storage type for + # the data, before passing to take. + + result = take(data, indices, fill_value=fill_value, + allow_fill=allow_fill) + return self._from_sequence(result) """ + # Implementer note: The `fill_value` parameter should be a user-facing + # value, an instance of self.dtype.type. When passed `fill_value=None`, + # the default of `self.dtype.na_value` should be used. + # This may differ from the physical storage type your ExtensionArray + # uses. In this case, your implementation is responsible for casting + # the user-facing type to the storage type, before using + # pandas.api.extensions.take raise AbstractMethodError(self) def copy(self, deep=False): diff --git a/pandas/core/dtypes/base.py b/pandas/core/dtypes/base.py index 6dbed5f138d5d..49e98c16c716e 100644 --- a/pandas/core/dtypes/base.py +++ b/pandas/core/dtypes/base.py @@ -16,6 +16,12 @@ class _DtypeOpsMixin(object): # classes will inherit from this Mixin. Once everything is compatible, this # class's methods can be moved to ExtensionDtype and removed. + # na_value is the default NA value to use for this type. This is used in + # e.g. 
ExtensionArray.take. This should be the user-facing "boxed" version + # of the NA value, not the physical NA value for storage. + # e.g. for JSONArray, this is an empty dictionary. + na_value = np.nan + def __eq__(self, other): """Check whether 'other' is equal to self. @@ -92,6 +98,8 @@ def is_dtype(cls, dtype): class ExtensionDtype(_DtypeOpsMixin): """A custom data type, to be paired with an ExtensionArray. + .. versionadded:: 0.23.0 + Notes ----- The interface includes the following abstract methods that must @@ -101,6 +109,9 @@ class ExtensionDtype(_DtypeOpsMixin): * name * construct_from_string + The `na_value` class attribute can be used to set the default NA value + for this type. :attr:`numpy.nan` is used by default. + This class does not inherit from 'abc.ABCMeta' for performance reasons. Methods and properties required by the interface raise ``pandas.errors.AbstractMethodError`` and no ``register`` method is diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py index 75434fcc2b40d..e4ed6d544d42e 100644 --- a/pandas/core/dtypes/cast.py +++ b/pandas/core/dtypes/cast.py @@ -255,7 +255,6 @@ def changeit(): def maybe_promote(dtype, fill_value=np.nan): - # if we passed an array here, determine the fill value by dtype if isinstance(fill_value, np.ndarray): if issubclass(fill_value.dtype.type, (np.datetime64, np.timedelta64)): @@ -294,6 +293,8 @@ def maybe_promote(dtype, fill_value=np.nan): elif is_datetimetz(dtype): if isna(fill_value): fill_value = iNaT + elif is_extension_array_dtype(dtype) and isna(fill_value): + fill_value = dtype.na_value elif is_float(fill_value): if issubclass(dtype.type, np.bool_): dtype = np.object_ diff --git a/pandas/core/dtypes/missing.py b/pandas/core/dtypes/missing.py index 2c98cedd7d715..3b2336bf19547 100644 --- a/pandas/core/dtypes/missing.py +++ b/pandas/core/dtypes/missing.py @@ -502,6 +502,8 @@ def na_value_for_dtype(dtype, compat=True): """ dtype = pandas_dtype(dtype) + if is_extension_array_dtype(dtype): + 
return dtype.na_value if (is_datetime64_dtype(dtype) or is_datetime64tz_dtype(dtype) or is_timedelta64_dtype(dtype) or is_period_dtype(dtype)): return NaT diff --git a/pandas/core/frame.py b/pandas/core/frame.py index de6985ef3b4ea..82d5a0286b117 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -3476,7 +3476,7 @@ def _reindex_index(self, new_index, method, copy, level, fill_value=np.nan, allow_dups=False) def _reindex_columns(self, new_columns, method, copy, level, - fill_value=np.nan, limit=None, tolerance=None): + fill_value=None, limit=None, tolerance=None): new_columns, indexer = self.columns.reindex(new_columns, method=method, level=level, limit=limit, tolerance=tolerance) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 86342b6996abf..6d55f92167d3b 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -3660,7 +3660,7 @@ def reindex(self, *args, **kwargs): copy = kwargs.pop('copy', True) limit = kwargs.pop('limit', None) tolerance = kwargs.pop('tolerance', None) - fill_value = kwargs.pop('fill_value', np.nan) + fill_value = kwargs.pop('fill_value', None) # Series.reindex doesn't use / need the axis kwarg # We pop and ignore it here, to make writing Series/Frame generic code @@ -3776,7 +3776,7 @@ def _reindex_multi(self, axes, copy, fill_value): @Appender(_shared_docs['reindex_axis'] % _shared_doc_kwargs) def reindex_axis(self, labels, axis=0, method=None, level=None, copy=True, - limit=None, fill_value=np.nan): + limit=None, fill_value=None): msg = ("'.reindex_axis' is deprecated and will be removed in a future " "version. 
Use '.reindex' instead.") self._consolidate_inplace() @@ -3790,7 +3790,7 @@ def reindex_axis(self, labels, axis=0, method=None, level=None, copy=True, return self._reindex_with_indexers({axis: [new_index, indexer]}, fill_value=fill_value, copy=copy) - def _reindex_with_indexers(self, reindexers, fill_value=np.nan, copy=False, + def _reindex_with_indexers(self, reindexers, fill_value=None, copy=False, allow_dups=False): """allow_dups indicates an internal call here """ @@ -7252,7 +7252,7 @@ def align(self, other, join='outer', axis=None, level=None, copy=True, raise TypeError('unsupported type: %s' % type(other)) def _align_frame(self, other, join='outer', axis=None, level=None, - copy=True, fill_value=np.nan, method=None, limit=None, + copy=True, fill_value=None, method=None, limit=None, fill_axis=0): # defaults join_index, join_columns = None, None diff --git a/pandas/core/indexing.py b/pandas/core/indexing.py index 2eb52ecc6bcc7..fe6d6775c4e0b 100755 --- a/pandas/core/indexing.py +++ b/pandas/core/indexing.py @@ -2417,12 +2417,53 @@ def maybe_convert_indices(indices, n): mask = indices < 0 if mask.any(): indices[mask] += n + mask = (indices >= n) | (indices < 0) if mask.any(): raise IndexError("indices are out-of-bounds") return indices +def validate_indices(indices, n): + """Perform bounds-checking for an indexer. + + -1 is allowed for indicating missing values. 
+ + Parameters + ---------- + indices : ndarray + n : int + length of the array being indexed + + Raises + ------ + ValueError + + Examples + -------- + >>> validate_indices([1, 2], 3) + # OK + >>> validate_indices([1, -2], 3) + ValueError + >>> validate_indices([1, 2, 3], 3) + IndexError + >>> validate_indices([-1, -1], 0) + # OK + >>> validate_indices([0, 1], 0) + IndexError + """ + if len(indices): + min_idx = indices.min() + if min_idx < -1: + msg = ("'indices' contains values less than allowed ({} < {})" + .format(min_idx, -1)) + raise ValueError(msg) + + max_idx = indices.max() + if max_idx >= n: + raise IndexError("indices are out-of-bounds") + + def maybe_convert_ix(*args): """ We likely want to take the cross-product diff --git a/pandas/core/internals.py b/pandas/core/internals.py index a266ea620bd9f..474894aba65df 100644 --- a/pandas/core/internals.py +++ b/pandas/core/internals.py @@ -1888,6 +1888,11 @@ def _holder(self): # For extension blocks, the holder is values-dependent. return type(self.values) + @property + def fill_value(self): + # Used in reindex_indexer + return self.values.dtype.na_value + @property def _can_hold_na(self): # The default ExtensionArray._can_hold_na is True @@ -1951,7 +1956,8 @@ def take_nd(self, indexer, axis=0, new_mgr_locs=None, fill_tuple=None): # axis doesn't matter; we are really a single-dim object # but are passed the axis depending on the calling routing # if its REALLY axis 0, then this will be a reindex and not a take - new_values = self.values.take(indexer, fill_value=fill_value) + new_values = self.values.take(indexer, fill_value=fill_value, + allow_fill=True) # if we are a 1-dim object, then always place at 0 if self.ndim == 1: @@ -5440,6 +5446,14 @@ def is_uniform_join_units(join_units): len(join_units) > 1) +def is_uniform_reindex(join_units): + return ( + # TODO: should this be ju.block._can_hold_na? 
+ all(ju.block and ju.block.is_extension for ju in join_units) and + len(set(ju.block.dtype.name for ju in join_units)) == 1 + ) + + def get_empty_dtype_and_na(join_units): """ Return dtype and N/A values to use when concatenating specified units. @@ -5457,6 +5471,12 @@ def get_empty_dtype_and_na(join_units): if blk is None: return np.float64, np.nan + if is_uniform_reindex(join_units): + # XXX: integrate property + empty_dtype = join_units[0].block.dtype + upcasted_na = join_units[0].block.fill_value + return empty_dtype, upcasted_na + has_none_blocks = False dtypes = [None] * len(join_units) for i, unit in enumerate(join_units): diff --git a/pandas/core/series.py b/pandas/core/series.py index f2ee225f50514..7abd95c68ea2b 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -3216,7 +3216,8 @@ def _reindex_indexer(self, new_index, indexer, copy): return self.copy() return self - new_values = algorithms.take_1d(self._values, indexer) + new_values = algorithms.take_1d(self._values, indexer, + allow_fill=True, fill_value=None) return self._constructor(new_values, index=new_index) def _needs_reindex_multi(self, axes, method, level): diff --git a/pandas/tests/extension/base/getitem.py b/pandas/tests/extension/base/getitem.py index ac156900671a6..5c9ede1079079 100644 --- a/pandas/tests/extension/base/getitem.py +++ b/pandas/tests/extension/base/getitem.py @@ -127,20 +127,53 @@ def test_take(self, data, na_value, na_cmp): result = data.take([0, -1]) assert result.dtype == data.dtype assert result[0] == data[0] - na_cmp(result[1], na_value) + assert result[1] == data[-1] + + result = data.take([0, -1], allow_fill=True, fill_value=na_value) + assert result[0] == data[0] + assert na_cmp(result[1], na_value) with tm.assert_raises_regex(IndexError, "out of bounds"): data.take([len(data) + 1]) def test_take_empty(self, data, na_value, na_cmp): empty = data[:0] - result = empty.take([-1]) - na_cmp(result[0], na_value) + + result = empty.take([-1], allow_fill=True) + 
assert na_cmp(result[0], na_value) + + with pytest.raises(IndexError): + empty.take([-1]) with tm.assert_raises_regex(IndexError, "cannot do a non-empty take"): empty.take([0, 1]) - @pytest.mark.xfail(reason="Series.take with extension array buggy for -1") + def test_take_negative(self, data): + # https://github.com/pandas-dev/pandas/issues/20640 + n = len(data) + result = data.take([0, -n, n - 1, -1]) + expected = data.take([0, 0, n - 1, n - 1]) + self.assert_extension_array_equal(result, expected) + + def test_take_non_na_fill_value(self, data_missing): + fill_value = data_missing[1] # valid + na = data_missing[0] + + array = data_missing._from_sequence([na, fill_value, na]) + result = array.take([-1, 1], fill_value=fill_value, allow_fill=True) + expected = array.take([1, 1]) + self.assert_extension_array_equal(result, expected) + + def test_take_pandas_style_negative_raises(self, data, na_value): + with pytest.raises(ValueError): + data.take([0, -2], fill_value=na_value, allow_fill=True) + + @pytest.mark.parametrize('allow_fill', [True, False]) + def test_take_out_of_bounds_raises(self, data, allow_fill): + arr = data[:3] + with pytest.raises(IndexError): + arr.take(np.asarray([0, 3]), allow_fill=allow_fill) + def test_take_series(self, data): s = pd.Series(data) result = s.take([0, -1]) @@ -166,3 +199,14 @@ def test_reindex(self, data, na_value): expected = pd.Series(data._from_sequence([na_value, na_value]), index=[n, n + 1]) self.assert_series_equal(result, expected) + + def test_reindex_non_na_fill_value(self, data_missing): + valid = data_missing[1] + na = data_missing[0] + + array = data_missing._from_sequence([na, valid]) + ser = pd.Series(array) + result = ser.reindex([0, 1, 2], fill_value=valid) + expected = pd.Series(data_missing._from_sequence([na, valid, valid])) + + self.assert_series_equal(result, expected) diff --git a/pandas/tests/extension/category/test_categorical.py b/pandas/tests/extension/category/test_categorical.py index 
6ebe700f13be0..c34339c99322d 100644 --- a/pandas/tests/extension/category/test_categorical.py +++ b/pandas/tests/extension/category/test_categorical.py @@ -81,6 +81,8 @@ def test_merge(self, data, na_value): class TestGetitem(base.BaseGetitemTests): + skip_take = pytest.mark.skip(reason="GH-20664.") + @pytest.mark.skip(reason="Backwards compatibility") def test_getitem_scalar(self): # CategoricalDtype.type isn't "correct" since it should @@ -88,11 +90,35 @@ def test_getitem_scalar(self): # to break things by changing. pass - @pytest.mark.xfail(reason="Categorical.take buggy") + @skip_take def test_take(self): # TODO remove this once Categorical.take is fixed pass + @skip_take + def test_take_negative(self): + pass + + @skip_take + def test_take_pandas_style_negative_raises(self): + pass + + @skip_take + def test_take_non_na_fill_value(self): + pass + + @skip_take + def test_take_out_of_bounds_raises(self): + pass + + @skip_take + def test_take_series(self): + pass + + @skip_take + def test_reindex_non_na_fill_value(self): + pass + @pytest.mark.xfail(reason="Categorical.take buggy") def test_take_empty(self): pass diff --git a/pandas/tests/extension/decimal/array.py b/pandas/tests/extension/decimal/array.py index 5d749126e0cec..e9431bd0c233c 100644 --- a/pandas/tests/extension/decimal/array.py +++ b/pandas/tests/extension/decimal/array.py @@ -8,12 +8,12 @@ import pandas as pd from pandas.core.arrays import ExtensionArray from pandas.core.dtypes.base import ExtensionDtype -from pandas.core.dtypes.common import _ensure_platform_int class DecimalDtype(ExtensionDtype): type = decimal.Decimal name = 'decimal' + na_value = decimal.Decimal('NaN') @classmethod def construct_from_string(cls, string): @@ -28,6 +28,7 @@ class DecimalArray(ExtensionArray): dtype = DecimalDtype() def __init__(self, values): + assert all(isinstance(v, decimal.Decimal) for v in values) values = np.asarray(values, dtype=object) self._data = values @@ -52,6 +53,17 @@ def __getitem__(self, item): 
else: return type(self)(self._data[item]) + def take(self, indexer, allow_fill=False, fill_value=None): + from pandas.api.extensions import take + + data = self._data + if allow_fill and fill_value is None: + fill_value = self.dtype.na_value + + result = take(data, indexer, fill_value=fill_value, + allow_fill=allow_fill) + return self._from_sequence(result) + def copy(self, deep=False): if deep: return type(self)(self._data.copy()) @@ -80,20 +92,6 @@ def nbytes(self): def isna(self): return np.array([x.is_nan() for x in self._data]) - def take(self, indexer, allow_fill=True, fill_value=None): - indexer = np.asarray(indexer) - mask = indexer == -1 - - # take on empty array not handled as desired by numpy in case of -1 - if not len(self) and mask.all(): - return type(self)([self._na_value] * len(indexer)) - - indexer = _ensure_platform_int(indexer) - out = self._data.take(indexer) - out[mask] = self._na_value - - return type(self)(out) - @property def _na_value(self): return decimal.Decimal('NaN') diff --git a/pandas/tests/extension/decimal/test_decimal.py b/pandas/tests/extension/decimal/test_decimal.py index 53d74cd6d38cb..1f8cf0264f62f 100644 --- a/pandas/tests/extension/decimal/test_decimal.py +++ b/pandas/tests/extension/decimal/test_decimal.py @@ -108,7 +108,15 @@ class TestReshaping(BaseDecimal, base.BaseReshapingTests): class TestGetitem(BaseDecimal, base.BaseGetitemTests): - pass + + def test_take_na_value_other_decimal(self): + arr = DecimalArray([decimal.Decimal('1.0'), + decimal.Decimal('2.0')]) + result = arr.take([0, -1], allow_fill=True, + fill_value=decimal.Decimal('-1.0')) + expected = DecimalArray([decimal.Decimal('1.0'), + decimal.Decimal('-1.0')]) + self.assert_extension_array_equal(result, expected) class TestMissing(BaseDecimal, base.BaseMissingTests): diff --git a/pandas/tests/extension/json/array.py b/pandas/tests/extension/json/array.py index 2e75bb3b8c326..88bb66f38b35c 100644 --- a/pandas/tests/extension/json/array.py +++ 
b/pandas/tests/extension/json/array.py @@ -1,3 +1,15 @@ +"""Test extension array for storing nested data in a pandas container. + +The JSONArray stores lists of dictionaries. The storage mechanism is a list, +not an ndarray. + +Note: + +We currently store lists of UserDicts (Py3 only). Pandas has a few places +internally that specifically check for dicts, and does non-scalar things +in that case. We *want* the dictionaries to be treated as scalars, so we +hack around pandas by using UserDicts. +""" import collections import itertools import numbers @@ -14,6 +26,11 @@ class JSONDtype(ExtensionDtype): type = collections.Mapping name = 'json' + try: + na_value = collections.UserDict() + except AttributeError: + # source compatibility with Py2. + na_value = {} @classmethod def construct_from_string(cls, string): @@ -91,15 +108,33 @@ def nbytes(self): return sys.getsizeof(self.data) def isna(self): - return np.array([x == self._na_value for x in self.data]) - - def take(self, indexer, allow_fill=True, fill_value=None): - try: - output = [self.data[loc] if loc != -1 else self._na_value - for loc in indexer] - except IndexError: - raise IndexError("Index is out of bounds or cannot do a " - "non-empty take from an empty array.") + return np.array([x == self.dtype.na_value for x in self.data]) + + def take(self, indexer, allow_fill=False, fill_value=None): + # re-implement here, since NumPy has trouble setting + # sized objects like UserDicts into scalar slots of + # an ndarray. 
+ indexer = np.asarray(indexer) + msg = ("Index is out of bounds or cannot do a " + "non-empty take from an empty array.") + + if allow_fill: + if fill_value is None: + fill_value = self.dtype.na_value + # bounds check + if (indexer < -1).any(): + raise ValueError + try: + output = [self.data[loc] if loc != -1 else fill_value + for loc in indexer] + except IndexError: + raise IndexError(msg) + else: + try: + output = [self.data[loc] for loc in indexer] + except IndexError: + raise IndexError(msg) + return self._from_sequence(output) def copy(self, deep=False): @@ -118,10 +153,6 @@ def unique(self): dict(x) for x in list(set(tuple(d.items()) for d in self.data)) ]) - @property - def _na_value(self): - return {} - @classmethod def _concat_same_type(cls, to_concat): data = list(itertools.chain.from_iterable([x.data for x in to_concat])) diff --git a/pandas/tests/extension/json/test_json.py b/pandas/tests/extension/json/test_json.py index 0ef34c3b0f679..1fdb7298eefc4 100644 --- a/pandas/tests/extension/json/test_json.py +++ b/pandas/tests/extension/json/test_json.py @@ -21,6 +21,17 @@ def dtype(): @pytest.fixture def data(): """Length-100 PeriodArray for semantics test.""" + data = make_data() + + # Why the while loop? NumPy is unable to construct an ndarray from + # equal-length ndarrays. Many of our operations involve coercing the + # EA to an ndarray of objects. To avoid random test failures, we ensure + # that our data is coercible to an ndarray. Several tests deal with only + # the first two elements, so that's what we'll check. 
+ + while len(data[0]) == len(data[1]): + data = make_data() + return JSONArray(make_data()) @@ -41,8 +52,8 @@ def data_missing_for_sorting(): @pytest.fixture -def na_value(): - return {} +def na_value(dtype): + return dtype.na_value @pytest.fixture diff --git a/pandas/tests/indexing/test_indexing.py b/pandas/tests/indexing/test_indexing.py index c66310d10ebdc..04225454f61f9 100644 --- a/pandas/tests/indexing/test_indexing.py +++ b/pandas/tests/indexing/test_indexing.py @@ -16,7 +16,8 @@ import numpy as np import pandas as pd -from pandas.core.indexing import _non_reducing_slice, _maybe_numeric_slice +from pandas.core.indexing import (_non_reducing_slice, _maybe_numeric_slice, + validate_indices) from pandas import NaT, DataFrame, Index, Series, MultiIndex import pandas.util.testing as tm @@ -994,3 +995,27 @@ def test_none_coercion_mixed_dtypes(self): datetime(2000, 1, 3)], 'd': [None, 'b', 'c']}) tm.assert_frame_equal(start_dataframe, exp) + + +def test_validate_indices_ok(): + indices = np.asarray([0, 1]) + validate_indices(indices, 2) + validate_indices(indices[:0], 0) + validate_indices(np.array([-1, -1]), 0) + + +def test_validate_indices_low(): + indices = np.asarray([0, -2]) + with tm.assert_raises_regex(ValueError, "'indices' contains"): + validate_indices(indices, 2) + + +def test_validate_indices_high(): + indices = np.asarray([0, 1, 2]) + with tm.assert_raises_regex(IndexError, "indices are out"): + validate_indices(indices, 2) + + +def test_validate_indices_empty(): + with tm.assert_raises_regex(IndexError, "indices are out"): + validate_indices(np.array([0, 1]), 0) diff --git a/pandas/tests/test_take.py b/pandas/tests/test_take.py index 7b97b0e975df3..2b78c91f9dac5 100644 --- a/pandas/tests/test_take.py +++ b/pandas/tests/test_take.py @@ -3,6 +3,7 @@ from datetime import datetime import numpy as np +import pytest from pandas.compat import long import pandas.core.algorithms as algos import pandas.util.testing as tm @@ -445,3 +446,47 @@ def 
test_2d_datetime64(self): expected = arr.take(indexer, axis=1) expected[:, [2, 4]] = datetime(2007, 1, 1) tm.assert_almost_equal(result, expected) + + +class TestExtensionTake(object): + # The take method found in pd.api.extensions + + def test_bounds_check_large(self): + arr = np.array([1, 2]) + with pytest.raises(IndexError): + algos.take(arr, [2, 3], allow_fill=True) + + with pytest.raises(IndexError): + algos.take(arr, [2, 3], allow_fill=False) + + def test_bounds_check_small(self): + arr = np.array([1, 2, 3], dtype=np.int64) + indexer = [0, -1, -2] + with pytest.raises(ValueError): + algos.take(arr, indexer, allow_fill=True) + + result = algos.take(arr, indexer) + expected = np.array([1, 3, 2], dtype=np.int64) + tm.assert_numpy_array_equal(result, expected) + + @pytest.mark.parametrize('allow_fill', [True, False]) + def test_take_empty(self, allow_fill): + arr = np.array([], dtype=np.int64) + # empty take is ok + result = algos.take(arr, [], allow_fill=allow_fill) + tm.assert_numpy_array_equal(arr, result) + + with pytest.raises(IndexError): + algos.take(arr, [0], allow_fill=allow_fill) + + def test_take_na_empty(self): + result = algos.take(np.array([]), [-1, -1], allow_fill=True, + fill_value=0.0) + expected = np.array([0., 0.]) + tm.assert_numpy_array_equal(result, expected) + + def test_take_coerces_list(self): + arr = [1, 2, 3] + result = algos.take(arr, [0, 0]) + expected = np.array([1, 1]) + tm.assert_numpy_array_equal(result, expected)