From fb3c2343c88be05ff1db5ca0ddef274eb0fda6b3 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Tue, 24 Apr 2018 13:59:51 -0500 Subject: [PATCH 01/29] [WIP]: ExtensionArray.take default implementation Implements a take interface that's compatible with NumPy and optionally pandas' NA semantics. ```python In [1]: import pandas as pd In [2]: from pandas.tests.extension.decimal.array import * In [3]: arr = DecimalArray(['1.1', '1.2', '1.3']) In [4]: arr.take([0, 1, -1]) Out[4]: DecimalArray(array(['1.1', '1.2', '1.3'], dtype=object)) In [5]: arr.take([0, 1, -1], fill_value=float('nan')) Out[5]: DecimalArray(array(['1.1', '1.2', Decimal('NaN')], dtype=object)) ``` Closes https://github.com/pandas-dev/pandas/issues/20640 --- pandas/compat/__init__.py | 2 + pandas/core/algorithms.py | 78 +++++++++++++++- pandas/core/arrays/base.py | 91 +++++++++++-------- pandas/core/dtypes/base.py | 4 + pandas/core/dtypes/missing.py | 2 + pandas/tests/extension/base/getitem.py | 21 ++++- .../extension/category/test_categorical.py | 16 +++- pandas/tests/extension/decimal/array.py | 17 +--- 8 files changed, 174 insertions(+), 57 deletions(-) diff --git a/pandas/compat/__init__.py b/pandas/compat/__init__.py index 12517372fedd1..754af97baa847 100644 --- a/pandas/compat/__init__.py +++ b/pandas/compat/__init__.py @@ -451,3 +451,5 @@ def is_platform_mac(): def is_platform_32bit(): return struct.calcsize("P") * 8 < 64 + +_default_fill_value = object() diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index 065a5782aced1..e487fe3018cbb 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -30,6 +30,7 @@ _ensure_platform_int, _ensure_object, _ensure_float64, _ensure_uint64, _ensure_int64) +from pandas.compat import _default_fill_value from pandas.compat.numpy import _np_version_under1p10 from pandas.core.dtypes.missing import isna, na_value_for_dtype @@ -1482,7 +1483,7 @@ def take_nd(arr, indexer, axis=0, out=None, fill_value=np.nan, mask_info=None, # TODO(EA): Remove these if / elifs as datetimeTZ, interval, become EAs # dispatch to internal type takes if is_extension_array_dtype(arr): - return arr.take(indexer, fill_value=fill_value, allow_fill=allow_fill) + return arr.take(indexer, fill_value=fill_value) elif is_datetimetz(arr): return arr.take(indexer, fill_value=fill_value, allow_fill=allow_fill) elif is_interval_dtype(arr): @@ -1558,6 +1559,81 @@ def take_nd(arr, indexer, axis=0, out=None, fill_value=np.nan, mask_info=None, take_1d = take_nd +def take_ea(arr, indexer, fill_value=_default_fill_value): + """Extension-array compatible take. + + Parameters + ---------- + arr : array-like + Must satisify NumPy's take semantics. + indexer : sequence of integers + Indices to be taken. See Notes for how negative indicies + are handled. + fill_value : any, optional + Fill value to use for NA-indicies. This has a few behaviors. + + * fill_value is not specified : triggers NumPy's semantics + where negative values in `indexer` mean slices from the end. + * fill_value is NA : Fill positions where `indexer` is ``-1`` + with ``self.dtype.na_value``. Anything considered NA by + :func:`pandas.isna` will result in ``self.dtype.na_value`` + being used to fill. + * fill_value is not NA : Fill positions where `indexer` is ``-1`` + with `fill_value`. + + Returns + ------- + ExtensionArray + + Raises + ------ + IndexError + When the indexer is out of bounds for the array. + ValueError + When the indexer contains negative values other than ``-1`` + and `fill_value` is specified. + + Notes + ----- + The meaning of negative values in `indexer` depends on the + `fill_value` argument. By default, we follow the behavior + :meth:`numpy.take` of where negative indices indicate slices + from the end. + + When `fill_value` is specified, we follow pandas semantics of ``-1`` + indicating a missing value. In this case, positions where `indexer` + is ``-1`` will be filled with `fill_value` or the default NA value + for this type. + + ExtensionArray.take is called by ``Series.__getitem__``, ``.loc``, + ``iloc``, when the indexer is a sequence of values. Additionally, + it's called by :meth:`Series.reindex` with a `fill_value`. + + See Also + -------- + numpy.take + """ + indexer = np.asarray(indexer) + if fill_value is _default_fill_value: + # NumPy style + result = arr.take(indexer) + else: + mask = indexer == -1 + if (indexer < -1).any(): + raise ValueError("Invalid value in 'indexer'. All values " + "must be non-negative or -1. When " + "'fill_value' is specified.") + + # take on empty array not handled as desired by numpy + # in case of -1 (all missing take) + if not len(arr) and mask.all(): + return arr._from_sequence([fill_value] * len(indexer)) + + result = arr.take(indexer) + result[mask] = fill_value + return result + + def take_2d_multi(arr, indexer, out=None, fill_value=np.nan, mask_info=None, allow_fill=True): """ diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py index 9958be47267ee..300edfe5de99d 100644 --- a/pandas/core/arrays/base.py +++ b/pandas/core/arrays/base.py @@ -8,6 +8,7 @@ import numpy as np from pandas.errors import AbstractMethodError +from pandas.compat import _default_fill_value from pandas.compat.numpy import function as nv _not_implemented_message = "{} does not implement {}." @@ -53,6 +54,7 @@ class ExtensionArray(object): * unique * factorize / _values_for_factorize * argsort / _values_for_argsort + * take / _values_for_take This class does not inherit from 'abc.ABCMeta' for performance reasons. Methods and properties required by the interface raise @@ -462,22 +464,38 @@ def factorize(self, na_sentinel=-1): # ------------------------------------------------------------------------ # Indexing methods # ------------------------------------------------------------------------ - def take(self, indexer, allow_fill=True, fill_value=None): - # type: (Sequence[int], bool, Optional[Any]) -> ExtensionArray + def _values_for_take(self): + """Values to use for `take`. + + Coerces to object dtype by default. + + Returns + ------- + array-like + Must satisify NumPy's `take` semantics. + """ + return self.astype(object) + + def take(self, indexer, fill_value=_default_fill_value): + # type: (Sequence[int], Optional[Any]) -> ExtensionArray """Take elements from an array. Parameters ---------- indexer : sequence of integers - indices to be taken. -1 is used to indicate values - that are missing. - allow_fill : bool, default True - If False, indexer is assumed to contain no -1 values so no filling - will be done. This short-circuits computation of a mask. Result is - undefined if allow_fill == False and -1 is present in indexer. - fill_value : any, default None - Fill value to replace -1 values with. If applicable, this should - use the sentinel missing value for this type. + Indices to be taken. See Notes for how negative indicies + are handled. + fill_value : any, optional + Fill value to use for NA-indicies. This has a few behaviors. + + * fill_value is not specified : triggers NumPy's semantics + where negative values in `indexer` mean slices from the end. + * fill_value is NA : Fill positions where `indexer` is ``-1`` + with ``self.dtype.na_value``. Anything considered NA by + :func:`pandas.isna` will result in ``self.dtype.na_value`` + being used to fill. + * fill_value is not NA : Fill positions where `indexer` is ``-1`` + with `fill_value`. Returns ------- @@ -487,44 +505,39 @@ def take(self, indexer, allow_fill=True, fill_value=None): ------ IndexError When the indexer is out of bounds for the array. + ValueError + When the indexer contains negative values other than ``-1`` + and `fill_value` is specified. Notes ----- - This should follow pandas' semantics where -1 indicates missing values. - Positions where indexer is ``-1`` should be filled with the missing - value for this type. - This gives rise to the special case of a take on an empty - ExtensionArray that does not raises an IndexError straight away - when the `indexer` is all ``-1``. - - This is called by ``Series.__getitem__``, ``.loc``, ``iloc``, when the - indexer is a sequence of values. + The meaning of negative values in `indexer` depends on the + `fill_value` argument. By default, we follow the behavior + :meth:`numpy.take` of where negative indices indicate slices + from the end. - Examples - -------- - Suppose the extension array is backed by a NumPy array stored as - ``self.data``. Then ``take`` may be written as - - .. code-block:: python - - def take(self, indexer, allow_fill=True, fill_value=None): - indexer = np.asarray(indexer) - mask = indexer == -1 - - # take on empty array not handled as desired by numpy - # in case of -1 (all missing take) - if not len(self) and mask.all(): - return type(self)([np.nan] * len(indexer)) + When `fill_value` is specified, we follow pandas semantics of ``-1`` + indicating a missing value. In this case, positions where `indexer` + is ``-1`` will be filled with `fill_value` or the default NA value + for this type. - result = self.data.take(indexer) - result[mask] = np.nan # NA for this type - return type(self)(result) + ExtensionArray.take is called by ``Series.__getitem__``, ``.loc``, + ``iloc``, when the indexer is a sequence of values. Additionally, + it's called by :meth:`Series.reindex` with a `fill_value`. See Also -------- numpy.take """ - raise AbstractMethodError(self) + from pandas.core.algorithms import take_ea + from pandas.core.missing import isna + + if isna(fill_value): + fill_value = self.dtype.na_value + + data = self._values_for_take() + result = take_ea(data, indexer, fill_value=fill_value) + return self._from_sequence(result) def copy(self, deep=False): # type: (bool) -> ExtensionArray diff --git a/pandas/core/dtypes/base.py b/pandas/core/dtypes/base.py index 6dbed5f138d5d..2560a6a99ec6d 100644 --- a/pandas/core/dtypes/base.py +++ b/pandas/core/dtypes/base.py @@ -16,6 +16,10 @@ class _DtypeOpsMixin(object): # classes will inherit from this Mixin. Once everything is compatible, this # class's methods can be moved to ExtensionDtype and removed. + # na_value is the default NA value to use for this type. This is used in + # e.g. ExtensionArray.take. + na_value = np.nan + def __eq__(self, other): """Check whether 'other' is equal to self. diff --git a/pandas/core/dtypes/missing.py b/pandas/core/dtypes/missing.py index 2c98cedd7d715..3b2336bf19547 100644 --- a/pandas/core/dtypes/missing.py +++ b/pandas/core/dtypes/missing.py @@ -502,6 +502,8 @@ def na_value_for_dtype(dtype, compat=True): """ dtype = pandas_dtype(dtype) + if is_extension_array_dtype(dtype): + return dtype.na_value if (is_datetime64_dtype(dtype) or is_datetime64tz_dtype(dtype) or is_timedelta64_dtype(dtype) or is_period_dtype(dtype)): return NaT diff --git a/pandas/tests/extension/base/getitem.py b/pandas/tests/extension/base/getitem.py index ac156900671a6..796879d658be8 100644 --- a/pandas/tests/extension/base/getitem.py +++ b/pandas/tests/extension/base/getitem.py @@ -134,12 +134,29 @@ def test_take(self, data, na_value, na_cmp): def test_take_empty(self, data, na_value, na_cmp): empty = data[:0] - result = empty.take([-1]) - na_cmp(result[0], na_value) + # result = empty.take([-1]) + # na_cmp(result[0], na_value) with tm.assert_raises_regex(IndexError, "cannot do a non-empty take"): empty.take([0, 1]) + def test_take_negative(self, data): + # https://github.com/pandas-dev/pandas/issues/20640 + n = len(data) + result = data.take([0, -n, n - 1, -1]) + expected = data.take([0, 0, n - 1, n - 1]) + self.assert_extension_array_equal(result, expected) + + def test_take_non_na_fill_value(self, data_missing): + fill_value = data_missing[1] # valid + result = data_missing.take([-1, 1], fill_value=fill_value) + expected = data_missing.take([1, 1]) + self.assert_extension_array_equal(result, expected) + + def test_take_pandas_style_negative_raises(self, data, na_value): + with pytest.raises(ValueError): + data.take([0, -2], fill_value=na_value) + @pytest.mark.xfail(reason="Series.take with extension array buggy for -1") def test_take_series(self, data): s = pd.Series(data) diff --git a/pandas/tests/extension/category/test_categorical.py b/pandas/tests/extension/category/test_categorical.py index 6ebe700f13be0..b624dda382dae 100644 --- a/pandas/tests/extension/category/test_categorical.py +++ b/pandas/tests/extension/category/test_categorical.py @@ -81,6 +81,8 @@ def test_merge(self, data, na_value): class TestGetitem(base.BaseGetitemTests): + skip_take = pytest.mark.skip(reason="GH-20664.") + @pytest.mark.skip(reason="Backwards compatibility") def test_getitem_scalar(self): # CategoricalDtype.type isn't "correct" since it should @@ -88,11 +90,23 @@ def test_getitem_scalar(self): # to break things by changing. pass - @pytest.mark.xfail(reason="Categorical.take buggy") + @skip_take def test_take(self): # TODO remove this once Categorical.take is fixed pass + @skip_take + def test_take_negative(self): + pass + + @skip_take + def test_take_pandas_style_negative_raises(self): + pass + + @skip_take + def test_take_non_na_fill_value(self): + pass + @pytest.mark.xfail(reason="Categorical.take buggy") def test_take_empty(self): pass diff --git a/pandas/tests/extension/decimal/array.py b/pandas/tests/extension/decimal/array.py index 5d749126e0cec..8aedf7d763833 100644 --- a/pandas/tests/extension/decimal/array.py +++ b/pandas/tests/extension/decimal/array.py @@ -8,12 +8,12 @@ import pandas as pd from pandas.core.arrays import ExtensionArray from pandas.core.dtypes.base import ExtensionDtype -from pandas.core.dtypes.common import _ensure_platform_int class DecimalDtype(ExtensionDtype): type = decimal.Decimal name = 'decimal' + na_value = decimal.Decimal('NaN') @classmethod def construct_from_string(cls, string): @@ -80,19 +80,8 @@ def nbytes(self): def isna(self): return np.array([x.is_nan() for x in self._data]) - def take(self, indexer, allow_fill=True, fill_value=None): - indexer = np.asarray(indexer) - mask = indexer == -1 - - # take on empty array not handled as desired by numpy in case of -1 - if not len(self) and mask.all(): - return type(self)([self._na_value] * len(indexer)) - - indexer = _ensure_platform_int(indexer) - out = self._data.take(indexer) - out[mask] = self._na_value - - return type(self)(out) + def _values_for_take(self): + return self.data @property def _na_value(self): From dacd98e5f8ed0947bb2646499ac2e3cfd94e6177 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Tue, 24 Apr 2018 14:45:36 -0500 Subject: [PATCH 02/29] Moves --- pandas/compat/__init__.py | 2 - pandas/core/algorithms.py | 76 -------------------- pandas/core/arrays/base.py | 141 +++++++++++++++++++++++-------------- 3 files changed, 88 insertions(+), 131 deletions(-) diff --git a/pandas/compat/__init__.py b/pandas/compat/__init__.py index 754af97baa847..12517372fedd1 100644 --- a/pandas/compat/__init__.py +++ b/pandas/compat/__init__.py @@ -451,5 +451,3 @@ def is_platform_mac(): def is_platform_32bit(): return struct.calcsize("P") * 8 < 64 - -_default_fill_value = object() diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index e487fe3018cbb..1df2ff4b021d7 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -30,7 +30,6 @@ _ensure_platform_int, _ensure_object, _ensure_float64, _ensure_uint64, _ensure_int64) -from pandas.compat import _default_fill_value from pandas.compat.numpy import _np_version_under1p10 from pandas.core.dtypes.missing import isna, na_value_for_dtype @@ -1559,81 +1558,6 @@ def take_nd(arr, indexer, axis=0, out=None, fill_value=np.nan, mask_info=None, take_1d = take_nd -def take_ea(arr, indexer, fill_value=_default_fill_value): - """Extension-array compatible take. - - Parameters - ---------- - arr : array-like - Must satisify NumPy's take semantics. - indexer : sequence of integers - Indices to be taken. See Notes for how negative indicies - are handled. - fill_value : any, optional - Fill value to use for NA-indicies. This has a few behaviors. - - * fill_value is not specified : triggers NumPy's semantics - where negative values in `indexer` mean slices from the end. - * fill_value is NA : Fill positions where `indexer` is ``-1`` - with ``self.dtype.na_value``. Anything considered NA by - :func:`pandas.isna` will result in ``self.dtype.na_value`` - being used to fill. - * fill_value is not NA : Fill positions where `indexer` is ``-1`` - with `fill_value`. - - Returns - ------- - ExtensionArray - - Raises - ------ - IndexError - When the indexer is out of bounds for the array. - ValueError - When the indexer contains negative values other than ``-1`` - and `fill_value` is specified. - - Notes - ----- - The meaning of negative values in `indexer` depends on the - `fill_value` argument. By default, we follow the behavior - :meth:`numpy.take` of where negative indices indicate slices - from the end. - - When `fill_value` is specified, we follow pandas semantics of ``-1`` - indicating a missing value. In this case, positions where `indexer` - is ``-1`` will be filled with `fill_value` or the default NA value - for this type. - - ExtensionArray.take is called by ``Series.__getitem__``, ``.loc``, - ``iloc``, when the indexer is a sequence of values. Additionally, - it's called by :meth:`Series.reindex` with a `fill_value`. - - See Also - -------- - numpy.take - """ - indexer = np.asarray(indexer) - if fill_value is _default_fill_value: - # NumPy style - result = arr.take(indexer) - else: - mask = indexer == -1 - if (indexer < -1).any(): - raise ValueError("Invalid value in 'indexer'. All values " - "must be non-negative or -1. When " - "'fill_value' is specified.") - - # take on empty array not handled as desired by numpy - # in case of -1 (all missing take) - if not len(arr) and mask.all(): - return arr._from_sequence([fill_value] * len(indexer)) - - result = arr.take(indexer) - result[mask] = fill_value - return result - - def take_2d_multi(arr, indexer, out=None, fill_value=np.nan, mask_info=None, allow_fill=True): """ diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py index 300edfe5de99d..79750f5595b74 100644 --- a/pandas/core/arrays/base.py +++ b/pandas/core/arrays/base.py @@ -5,13 +5,70 @@ This is an experimental API and subject to breaking changes without warning. """ +import textwrap + import numpy as np from pandas.errors import AbstractMethodError -from pandas.compat import _default_fill_value from pandas.compat.numpy import function as nv +from pandas.util._decorators import Appender, Substitution _not_implemented_message = "{} does not implement {}." +_default_fill_value = object() + + +_take_docstring = textwrap.dedent("""\ +Take elements from an array. + +Parameters +---------- +%(arr)s\ +indexer : sequence of integers + Indices to be taken. See Notes for how negative indicies + are handled. +fill_value : any, optional + Fill value to use for NA-indicies. This has a few behaviors. + + * fill_value is not specified : triggers NumPy's semantics + where negative values in `indexer` mean slices from the end. + * fill_value is NA : Fill positions where `indexer` is ``-1`` + with ``self.dtype.na_value``. Anything considered NA by + :func:`pandas.isna` will result in ``self.dtype.na_value`` + being used to fill. + * fill_value is not NA : Fill positions where `indexer` is ``-1`` + with `fill_value`. + +Returns +------- +ExtensionArray + +Raises +------ +IndexError + When the indexer is out of bounds for the array. +ValueError + When the indexer contains negative values other than ``-1`` + and `fill_value` is specified. + +Notes +----- +The meaning of negative values in `indexer` depends on the +`fill_value` argument. By default, we follow the behavior +:meth:`numpy.take` of where negative indices indicate slices +from the end. + +When `fill_value` is specified, we follow pandas semantics of ``-1`` +indicating a missing value. In this case, positions where `indexer` +is ``-1`` will be filled with `fill_value` or the default NA value +for this type. + +ExtensionArray.take is called by ``Series.__getitem__``, ``.loc``, +``iloc``, when the indexer is a sequence of values. Additionally, +it's called by :meth:`Series.reindex` with a `fill_value`. + +See Also +-------- +numpy.take""") class ExtensionArray(object): @@ -476,60 +533,10 @@ def _values_for_take(self): """ return self.astype(object) + @Substitution(arr='') + @Appender(_take_docstring) def take(self, indexer, fill_value=_default_fill_value): # type: (Sequence[int], Optional[Any]) -> ExtensionArray - """Take elements from an array. - - Parameters - ---------- - indexer : sequence of integers - Indices to be taken. See Notes for how negative indicies - are handled. - fill_value : any, optional - Fill value to use for NA-indicies. This has a few behaviors. - - * fill_value is not specified : triggers NumPy's semantics - where negative values in `indexer` mean slices from the end. - * fill_value is NA : Fill positions where `indexer` is ``-1`` - with ``self.dtype.na_value``. Anything considered NA by - :func:`pandas.isna` will result in ``self.dtype.na_value`` - being used to fill. - * fill_value is not NA : Fill positions where `indexer` is ``-1`` - with `fill_value`. - - Returns - ------- - ExtensionArray - - Raises - ------ - IndexError - When the indexer is out of bounds for the array. - ValueError - When the indexer contains negative values other than ``-1`` - and `fill_value` is specified. - - Notes - ----- - The meaning of negative values in `indexer` depends on the - `fill_value` argument. By default, we follow the behavior - :meth:`numpy.take` of where negative indices indicate slices - from the end. - - When `fill_value` is specified, we follow pandas semantics of ``-1`` - indicating a missing value. In this case, positions where `indexer` - is ``-1`` will be filled with `fill_value` or the default NA value - for this type. - - ExtensionArray.take is called by ``Series.__getitem__``, ``.loc``, - ``iloc``, when the indexer is a sequence of values. Additionally, - it's called by :meth:`Series.reindex` with a `fill_value`. - - See Also - -------- - numpy.take - """ - from pandas.core.algorithms import take_ea from pandas.core.missing import isna if isna(fill_value): @@ -601,3 +608,31 @@ def _ndarray_values(self): used for interacting with our indexers. """ return np.array(self) + + +@Substitution(arr=textwrap.dedent("""\ +arr : array-like + Must satisfy NumPy's indexing sematnics, including `take` + and boolean masking. +""")) +@Appender(_take_docstring) +def take_ea(arr, indexer, fill_value=_default_fill_value): + indexer = np.asarray(indexer) + if fill_value is _default_fill_value: + # NumPy style + result = arr.take(indexer) + else: + mask = indexer == -1 + if (indexer < -1).any(): + raise ValueError("Invalid value in 'indexer'. All values " + "must be non-negative or -1. When " + "'fill_value' is specified.") + + # take on empty array not handled as desired by numpy + # in case of -1 (all missing take) + if not len(arr) and mask.all(): + return arr._from_sequence([fill_value] * len(indexer)) + + result = arr.take(indexer) + result[mask] = fill_value + return result From 0be9ec6593f38b698af6475c668ecf4c709bbe20 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Tue, 24 Apr 2018 18:02:13 -0500 Subject: [PATCH 03/29] non-internals changes --- pandas/compat/__init__.py | 3 ++ pandas/core/algorithms.py | 22 +++++++++++ pandas/core/arrays/base.py | 38 +++---------------- pandas/core/frame.py | 4 +- pandas/core/generic.py | 14 +++++-- pandas/core/series.py | 9 +++-- .../tests/extension/decimal/test_decimal.py | 20 +++++++++- 7 files changed, 67 insertions(+), 43 deletions(-) diff --git a/pandas/compat/__init__.py b/pandas/compat/__init__.py index 12517372fedd1..ffac335dea87a 100644 --- a/pandas/compat/__init__.py +++ b/pandas/compat/__init__.py @@ -451,3 +451,6 @@ def is_platform_mac(): def is_platform_32bit(): return struct.calcsize("P") * 8 < 64 + + +_default_fill_value = object() diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index 1df2ff4b021d7..4f67d1c429bd4 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -30,6 +30,7 @@ _ensure_platform_int, _ensure_object, _ensure_float64, _ensure_uint64, _ensure_int64) +from pandas.compat import _default_fill_value from pandas.compat.numpy import _np_version_under1p10 from pandas.core.dtypes.missing import isna, na_value_for_dtype @@ -1441,6 +1442,27 @@ def func(arr, indexer, out, fill_value=np.nan): return func +def take(arr, indexer, fill_value=_default_fill_value): + indexer = np.asarray(indexer) + + if fill_value is _default_fill_value: + # NumPy style + result = arr.take(indexer) + else: + # bounds checking + if (indexer < -1).any(): + raise ValueError("Invalid value in 'indexer'. All values " + "must be non-negative or -1. When " + "'fill_value' is specified.") + + # # take on empty array not handled as desired by numpy + # # in case of -1 (all missing take) + # if not len(arr) and mask.all(): + # return arr._from_sequence([fill_value] * len(indexer)) + result = take_1d(arr, indexer, fill_value=fill_value) + return result + + def take_nd(arr, indexer, axis=0, out=None, fill_value=np.nan, mask_info=None, allow_fill=True): """ diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py index 79750f5595b74..94fded34cf0b0 100644 --- a/pandas/core/arrays/base.py +++ b/pandas/core/arrays/base.py @@ -10,11 +10,11 @@ import numpy as np from pandas.errors import AbstractMethodError +from pandas.compat import _default_fill_value from pandas.compat.numpy import function as nv from pandas.util._decorators import Appender, Substitution _not_implemented_message = "{} does not implement {}." -_default_fill_value = object() _take_docstring = textwrap.dedent("""\ @@ -537,13 +537,12 @@ def _values_for_take(self): @Appender(_take_docstring) def take(self, indexer, fill_value=_default_fill_value): # type: (Sequence[int], Optional[Any]) -> ExtensionArray - from pandas.core.missing import isna - - if isna(fill_value): - fill_value = self.dtype.na_value + if fill_value is np.nan: + import pdb; pdb.set_trace() + from pandas.core.algorithms import take data = self._values_for_take() - result = take_ea(data, indexer, fill_value=fill_value) + result = take(data, indexer, fill_value=fill_value) return self._from_sequence(result) def copy(self, deep=False): @@ -609,30 +608,3 @@ def _ndarray_values(self): """ return np.array(self) - -@Substitution(arr=textwrap.dedent("""\ -arr : array-like - Must satisfy NumPy's indexing sematnics, including `take` - and boolean masking. -""")) -@Appender(_take_docstring) -def take_ea(arr, indexer, fill_value=_default_fill_value): - indexer = np.asarray(indexer) - if fill_value is _default_fill_value: - # NumPy style - result = arr.take(indexer) - else: - mask = indexer == -1 - if (indexer < -1).any(): - raise ValueError("Invalid value in 'indexer'. All values " - "must be non-negative or -1. When " - "'fill_value' is specified.") - - # take on empty array not handled as desired by numpy - # in case of -1 (all missing take) - if not len(arr) and mask.all(): - return arr._from_sequence([fill_value] * len(indexer)) - - result = arr.take(indexer) - result[mask] = fill_value - return result diff --git a/pandas/core/frame.py b/pandas/core/frame.py index de6985ef3b4ea..f5920417a3899 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -77,7 +77,7 @@ from pandas.core.arrays import Categorical, ExtensionArray import pandas.core.algorithms as algorithms from pandas.compat import (range, map, zip, lrange, lmap, lzip, StringIO, u, - OrderedDict, raise_with_traceback) + OrderedDict, raise_with_traceback, _default_fill_value) from pandas import compat from pandas.compat import PY36 from pandas.compat.numpy import function as nv @@ -3504,7 +3504,7 @@ def _reindex_multi(self, axes, copy, fill_value): @Appender(_shared_docs['align'] % _shared_doc_kwargs) def align(self, other, join='outer', axis=None, level=None, copy=True, - fill_value=None, method=None, limit=None, fill_axis=0, + fill_value=_default_fill_value, method=None, limit=None, fill_axis=0, broadcast_axis=None): return super(DataFrame, self).align(other, join=join, axis=axis, level=level, copy=copy, diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 86342b6996abf..ba6cfddbc452b 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -50,7 +50,8 @@ from pandas import compat from pandas.compat.numpy import function as nv from pandas.compat import (map, zip, lzip, lrange, string_types, to_str, - isidentifier, set_function_name, cPickle as pkl) + isidentifier, set_function_name, cPickle as pkl, + _default_fill_value) from pandas.core.ops import _align_method_FRAME import pandas.core.nanops as nanops from pandas.util._decorators import (Appender, Substitution, @@ -3660,7 +3661,7 @@ def reindex(self, *args, **kwargs): copy = kwargs.pop('copy', True) limit = kwargs.pop('limit', None) tolerance = kwargs.pop('tolerance', None) - fill_value = kwargs.pop('fill_value', np.nan) + fill_value = kwargs.pop('fill_value', _default_fill_value) # Series.reindex doesn't use / need the axis kwarg # We pop and ignore it here, to make writing Series/Frame generic code @@ -3790,7 +3791,9 @@ def reindex_axis(self, labels, axis=0, method=None, level=None, copy=True, return self._reindex_with_indexers({axis: [new_index, indexer]}, fill_value=fill_value, copy=copy) - def _reindex_with_indexers(self, reindexers, fill_value=np.nan, copy=False, + def _reindex_with_indexers(self, reindexers, + fill_value=_default_fill_value, + copy=False, allow_dups=False): """allow_dups indicates an internal call here """ @@ -7209,7 +7212,7 @@ def ranker(data): @Appender(_shared_docs['align'] % _shared_doc_kwargs) def align(self, other, join='outer', axis=None, level=None, copy=True, - fill_value=None, method=None, limit=None, fill_axis=0, + fill_value=_default_fill_value, method=None, limit=None, fill_axis=0, broadcast_axis=None): from pandas import DataFrame, Series method = missing.clean_fill_method(method) @@ -7359,6 +7362,9 @@ def _align_series(self, other, join='outer', axis=None, level=None, right = other.reindex(join_index, level=level) # fill + if fill_value is _default_fill_value: + fill_value = None + fill_na = notna(fill_value) or (method is not None) if fill_na: left = left.fillna(fill_value, method=method, limit=limit, diff --git a/pandas/core/series.py b/pandas/core/series.py index aa4cb510feb62..886567500ce71 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -56,7 +56,7 @@ from pandas import compat from pandas.io.formats.terminal import get_terminal_size from pandas.compat import ( - zip, u, OrderedDict, StringIO, range, get_range_parameters, PY36) + zip, u, OrderedDict, StringIO, range, get_range_parameters, PY36, _default_fill_value) from pandas.compat.numpy import function as nv import pandas.core.ops as ops @@ -3216,7 +3216,10 @@ def _reindex_indexer(self, new_index, indexer, copy): return self.copy() return self - new_values = algorithms.take_1d(self._values, indexer) + from pandas.core.dtypes.missing import na_value_for_dtype + fill_value = na_value_for_dtype(self.dtype) + new_values = algorithms.take(self._values, indexer, + fill_value=fill_value) return self._constructor(new_values, index=new_index) def _needs_reindex_multi(self, axes, method, level): @@ -3227,7 +3230,7 @@ def _needs_reindex_multi(self, axes, method, level): @Appender(generic._shared_docs['align'] % _shared_doc_kwargs) def align(self, other, join='outer', axis=None, level=None, copy=True, - fill_value=None, method=None, limit=None, fill_axis=0, + fill_value=_default_fill_value, method=None, limit=None, fill_axis=0, broadcast_axis=None): return super(Series, self).align(other, join=join, axis=axis, level=level, copy=copy, diff --git a/pandas/tests/extension/decimal/test_decimal.py b/pandas/tests/extension/decimal/test_decimal.py index 53d74cd6d38cb..7ca92dd057a61 100644 --- a/pandas/tests/extension/decimal/test_decimal.py +++ b/pandas/tests/extension/decimal/test_decimal.py @@ -108,7 +108,25 @@ class TestReshaping(BaseDecimal, base.BaseReshapingTests): class TestGetitem(BaseDecimal, base.BaseGetitemTests): - pass + + def test_take_basic(self): + ea = DecimalArray([decimal.Decimal('1'), + decimal.Decimal('2'), + decimal.Decimal('3')]) + result = ea.take([1, 2, -1]) + expected = DecimalArray([decimal.Decimal('2'), + decimal.Decimal('3'), + decimal.Decimal('3')]) + self.assert_extension_array_equal(result, expected) + + result = ea.take([1, 2, -1], fill_value=ea.dtype.na_value) + expected = DecimalArray([decimal.Decimal('2'), + decimal.Decimal('3'), + decimal.Decimal('NaN')]) + self.assert_extension_array_equal(result, expected) + + result = pd.Series(ea).reindex([1, 2, -1]).values + self.assert_extension_array_equal(result, expected) class TestMissing(BaseDecimal, base.BaseMissingTests): From 08f24790daa8bbade46dca86dd2fb9ff0353118c Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Wed, 25 Apr 2018 05:59:17 -0500 Subject: [PATCH 04/29] Fixup JSON take --- pandas/tests/extension/json/array.py | 38 ++++++++++++++++++---------- 1 file changed, 25 insertions(+), 13 deletions(-) diff --git a/pandas/tests/extension/json/array.py b/pandas/tests/extension/json/array.py index 95f868e89ac39..be0c904891a2c 100644 --- a/pandas/tests/extension/json/array.py +++ b/pandas/tests/extension/json/array.py @@ -14,6 +14,7 @@ class JSONDtype(ExtensionDtype): type = collections.Mapping name = 'json' + na_value = {} @classmethod def construct_from_string(cls, string): @@ -91,15 +92,30 @@ def nbytes(self): return sys.getsizeof(self.data) def isna(self): - return np.array([x == self._na_value for x in self.data]) - - def take(self, indexer, allow_fill=True, fill_value=None): - try: - output = [self.data[loc] if loc != -1 else self._na_value - for loc in indexer] - except IndexError: - raise IndexError("Index is out of bounds or cannot do a " - "non-empty take from an empty array.") + return np.array([x == self.dtype.na_value for x in self.data]) + + def take(self, indexer, fill_value=None): + # re-implement here, since NumPy has trouble setting + # sized objects like UserDicts into scalar slots of + # an ndarary. + indexer = np.asarray(indexer) + msg = ("Index is out of bounds or cannot do a " + "non-empty take from an empty array.") + + if fill_value is None: + try: + output = [self.data[loc] for loc in indexer] + except IndexError: + raise IndexError(msg) + else: + # bounds check + if (indexer < -1).any(): + raise ValueError + try: + output = [self.data[loc] if loc != -1 else fill_value + for loc in indexer] + except IndexError: + raise msg return self._from_sequence(output) def copy(self, deep=False): @@ -112,10 +128,6 @@ def unique(self): dict(x) for x in list(set(tuple(d.items()) for d in self.data)) ]) - @property - def _na_value(self): - return {} - @classmethod def _concat_same_type(cls, to_concat): data = list(itertools.chain.from_iterable([x.data for x in to_concat])) From eba137f8c7ae45288200566aebbf32229c69720c Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Wed, 25 Apr 2018 05:59:58 -0500 Subject: [PATCH 05/29] More internals hacking --- pandas/core/arrays/base.py | 3 +-- pandas/core/dtypes/base.py | 2 +- pandas/core/internals.py | 51 +++++++++++++++++++++++++++++++++++--- 3 files changed, 50 insertions(+), 6 deletions(-) diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py index 94fded34cf0b0..9364ba7a228d8 100644 --- a/pandas/core/arrays/base.py +++ b/pandas/core/arrays/base.py @@ -537,8 +537,7 @@ def _values_for_take(self): @Appender(_take_docstring) def take(self, indexer, fill_value=_default_fill_value): # type: (Sequence[int], Optional[Any]) -> ExtensionArray - if fill_value is np.nan: - import pdb; pdb.set_trace() + # assert fill_value is not np.nan from pandas.core.algorithms import take data = self._values_for_take() diff --git a/pandas/core/dtypes/base.py b/pandas/core/dtypes/base.py index 2560a6a99ec6d..a615bbff00194 100644 --- a/pandas/core/dtypes/base.py +++ b/pandas/core/dtypes/base.py @@ -18,7 +18,7 @@ class _DtypeOpsMixin(object): # na_value is the default NA value to use for this type. This is used in # e.g. ExtensionArray.take. - na_value = np.nan + na_value = np.nan # TODO: change to _na_value def __eq__(self, other): """Check whether 'other' is equal to self. diff --git a/pandas/core/internals.py b/pandas/core/internals.py index e98899b2f5c1a..59813c2a24bfd 100644 --- a/pandas/core/internals.py +++ b/pandas/core/internals.py @@ -1,4 +1,5 @@ import warnings + import copy from warnings import catch_warnings import inspect @@ -82,7 +83,7 @@ from pandas.util._decorators import cache_readonly from pandas.util._validators import validate_bool_kwarg from pandas import compat -from pandas.compat import range, map, zip, u +from pandas.compat import range, map, zip, u, _default_fill_value class Block(PandasObject): @@ -1888,6 +1889,10 @@ def _holder(self): # For extension blocks, the holder is values-dependent. return type(self.values) + @property + def fill_value(self): + return self.values.dtype.na_value # TODO: change to _na_value + @property def _can_hold_na(self): # The default ExtensionArray._can_hold_na is True @@ -4386,6 +4391,8 @@ def reindex_indexer(self, new_axis, indexer, axis, fill_value=None, pandas-indexer with -1's only. """ + # TODO: see if we can make fill_value be {col -> fill_value} + # maybe earlier... if indexer is None: if new_axis is self.axes[axis] and not copy: return self @@ -4408,8 +4415,10 @@ def reindex_indexer(self, new_axis, indexer, axis, fill_value=None, new_blocks = self._slice_take_blocks_ax0(indexer, fill_tuple=(fill_value,)) else: + if fill_value is None: + fill_value = _default_fill_value new_blocks = [blk.take_nd(indexer, axis=axis, fill_tuple=( - fill_value if fill_value is not None else blk.fill_value,)) + fill_value if fill_value is not _default_fill_value else blk.fill_value,)) for blk in self.blocks] new_axes = list(self.axes) @@ -4436,6 +4445,9 @@ def _slice_take_blocks_ax0(self, slice_or_indexer, fill_tuple=None): if self._is_single_block: blk = self.blocks[0] + if allow_fill and fill_tuple[0] is _default_fill_value: + fill_tuple = (blk.fill_value,) + if sl_type in ('slice', 'mask'): return [blk.getitem_block(slobj, new_mgr_locs=slice(0, sllen))] elif not allow_fill or self.ndim == 1: @@ -5404,6 +5416,25 @@ def concatenate_block_managers(mgrs_indexers, axes, concat_axis, copy): elif is_uniform_join_units(join_units): b = join_units[0].block.concat_same_type( [ju.block for ju in join_units], placement=placement) + elif is_uniform_reindexer(join_units): + old_block = join_units[0].block + + new_values = concatenate_join_units(join_units, concat_axis, + copy=copy) + if new_values.ndim == 2: + # XXX: categorical returns a categorical here + # EA returns a 2d ndarray + # need to harmoinze these to always be EAs? + assert new_values.shape[0] == 1 + new_values = new_values[0] + + assert isinstance(old_block._holder, ABCExtensionArray) + + b = old_block.make_block_same_class( + old_block._holder._from_sequence(new_values), + placement=placement + ) + else: b = make_block( concatenate_join_units(join_units, concat_axis, copy=copy), @@ -5434,6 +5465,13 @@ def is_uniform_join_units(join_units): len(join_units) > 1) +def is_uniform_reindexer(join_units): + # For when we know we can reindex without changing type + return ( + all(ju.block and ju.block.is_extension for ju in join_units) + ) + + def get_empty_dtype_and_na(join_units): """ Return dtype and N/A values to use when concatenating specified units. @@ -5461,12 +5499,15 @@ def get_empty_dtype_and_na(join_units): upcast_classes = defaultdict(list) null_upcast_classes = defaultdict(list) + for dtype, unit in zip(dtypes, join_units): if dtype is None: continue if is_categorical_dtype(dtype): upcast_cls = 'category' + elif is_extension_array_dtype(dtype): + upcast_cls = 'extension' elif is_datetimetz(dtype): upcast_cls = 'datetimetz' elif issubclass(dtype.type, np.bool_): @@ -5496,6 +5537,8 @@ def get_empty_dtype_and_na(join_units): # create the result if 'object' in upcast_classes: return np.dtype(np.object_), np.nan + elif 'extension' in upcast_classes: + return np.dtype(np.object_), None elif 'bool' in upcast_classes: if has_none_blocks: return np.dtype(np.object_), np.nan @@ -5755,7 +5798,9 @@ def dtype(self): if self.block is None: raise AssertionError("Block is None, no dtype") - if not self.needs_filling: + if not self.needs_filling or self.block.is_extension: + # ExtensionDtypes by definition can hold their + # NA value. return self.block.dtype else: return _get_dtype(maybe_promote(self.block.dtype, From 67ba9ddb0c8dec7f88a7ce14387898b552afe355 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Wed, 25 Apr 2018 07:42:54 -0500 Subject: [PATCH 06/29] more with default fill value --- pandas/core/dtypes/cast.py | 4 +++- pandas/core/generic.py | 3 +++ pandas/core/internals.py | 12 +++++++++--- pandas/core/series.py | 2 +- pandas/core/sparse/frame.py | 4 ++-- 5 files changed, 18 insertions(+), 7 deletions(-) diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py index 75434fcc2b40d..fc0a5375b5d75 100644 --- a/pandas/core/dtypes/cast.py +++ b/pandas/core/dtypes/cast.py @@ -7,7 +7,7 @@ from pandas._libs import tslib, lib from pandas._libs.tslib import iNaT -from pandas.compat import string_types, text_type, PY3 +from pandas.compat import string_types, text_type, PY3, _default_fill_value from .common import (_ensure_object, is_bool, is_integer, is_float, is_complex, is_datetimetz, is_categorical_dtype, is_datetimelike, @@ -255,6 +255,8 @@ def changeit(): def maybe_promote(dtype, fill_value=np.nan): + if fill_value is _default_fill_value: + fill_value = np.nan # if we passed an array here, determine the fill value by dtype if isinstance(fill_value, np.ndarray): diff --git a/pandas/core/generic.py b/pandas/core/generic.py index ba6cfddbc452b..ca53ad44a733a 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -7263,6 +7263,9 @@ def _align_frame(self, other, join='outer', axis=None, level=None, clidx, cridx = None, None is_series = isinstance(self, ABCSeries) + if fill_value is _default_fill_value: + # XXX: per-column? + fill_value = np.nan if axis is None or axis == 0: if not self.index.equals(other.index): diff --git a/pandas/core/internals.py b/pandas/core/internals.py index 59813c2a24bfd..0534308cb4857 100644 --- a/pandas/core/internals.py +++ b/pandas/core/internals.py @@ -4417,9 +4417,15 @@ def reindex_indexer(self, new_axis, indexer, axis, fill_value=None, else: if fill_value is None: fill_value = _default_fill_value - new_blocks = [blk.take_nd(indexer, axis=axis, fill_tuple=( - fill_value if fill_value is not _default_fill_value else blk.fill_value,)) - for blk in self.blocks] + + new_blocks = [] + for blk in self.blocks: + if fill_value is not _default_fill_value: + fill_tuple = (fill_value,) + else: + fill_tuple = (blk.fill_value,) + new_blocks = [blk.take_nd(indexer, axis=axis, fill_tuple=fill_tuple) + for blk in self.blocks] new_axes = list(self.axes) new_axes[axis] = new_axis diff --git a/pandas/core/series.py b/pandas/core/series.py index 886567500ce71..348a77fd8d9cc 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -3217,7 +3217,7 @@ def _reindex_indexer(self, new_index, indexer, copy): return self from pandas.core.dtypes.missing import na_value_for_dtype - fill_value = na_value_for_dtype(self.dtype) + fill_value = na_value_for_dtype(self.dtype, compat=False) new_values = algorithms.take(self._values, indexer, fill_value=fill_value) return self._constructor(new_values, index=new_index) diff --git a/pandas/core/sparse/frame.py b/pandas/core/sparse/frame.py index 2cefbea722098..b282450efb037 100644 --- a/pandas/core/sparse/frame.py +++ b/pandas/core/sparse/frame.py @@ -6,7 +6,7 @@ # pylint: disable=E1101,E1103,W0231,E0202 import warnings -from pandas.compat import lmap +from pandas.compat import lmap, _default_fill_value from pandas import compat import numpy as np @@ -690,7 +690,7 @@ def _reindex_columns(self, columns, method, copy, level, fill_value=None, if level is not None: raise TypeError('Reindex by level not supported for sparse') - if notna(fill_value): + if not (isna(fill_value) or fill_value is _default_fill_value): raise NotImplementedError("'fill_value' argument is not supported") if limit: From c721915114c5dd39cd7aec6c9dec762ca28abcac Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Wed, 25 Apr 2018 07:50:54 -0500 Subject: [PATCH 07/29] Removed default_fill_value --- pandas/compat/__init__.py | 3 -- pandas/core/algorithms.py | 5 ++- pandas/core/arrays/base.py | 3 +- pandas/core/dtypes/cast.py | 5 +-- pandas/core/frame.py | 4 +-- pandas/core/generic.py | 17 +++-------- pandas/core/internals.py | 61 +++---------------------------------- pandas/core/series.py | 9 ++---- pandas/core/sparse/frame.py | 4 +-- 9 files changed, 20 insertions(+), 91 deletions(-) diff --git a/pandas/compat/__init__.py b/pandas/compat/__init__.py index ffac335dea87a..12517372fedd1 100644 --- a/pandas/compat/__init__.py +++ b/pandas/compat/__init__.py @@ -451,6 +451,3 @@ def is_platform_mac(): def is_platform_32bit(): return struct.calcsize("P") * 8 < 64 - - -_default_fill_value = object() diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index 3935d0829f1c9..d1fbaf91f3365 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -30,7 +30,6 @@ _ensure_platform_int, _ensure_object, _ensure_float64, _ensure_uint64, _ensure_int64) -from pandas.compat import _default_fill_value from pandas.compat.numpy import _np_version_under1p10 from pandas.core.dtypes.missing import isna, na_value_for_dtype @@ -1449,10 +1448,10 @@ def func(arr, indexer, out, fill_value=np.nan): return func -def take(arr, indexer, fill_value=_default_fill_value): +def take(arr, indexer, fill_value=None, allow_fill=None): indexer = np.asarray(indexer) - if fill_value is _default_fill_value: + if allow_fill is None: # NumPy style result = arr.take(indexer) else: diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py index 9364ba7a228d8..1942a848c57bf 100644 --- a/pandas/core/arrays/base.py +++ b/pandas/core/arrays/base.py @@ -10,7 +10,6 @@ import numpy as np from pandas.errors import AbstractMethodError -from pandas.compat import _default_fill_value from pandas.compat.numpy import function as nv from pandas.util._decorators import Appender, Substitution @@ -535,7 +534,7 @@ def _values_for_take(self): @Substitution(arr='') @Appender(_take_docstring) - def take(self, indexer, fill_value=_default_fill_value): + def take(self, indexer, fill_value=None, allow_fill=None): # type: (Sequence[int], Optional[Any]) -> ExtensionArray # assert fill_value is not np.nan from pandas.core.algorithms import take diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py index fc0a5375b5d75..8deb777ca7d73 100644 --- a/pandas/core/dtypes/cast.py +++ b/pandas/core/dtypes/cast.py @@ -7,7 +7,7 @@ from pandas._libs import tslib, lib from pandas._libs.tslib import iNaT -from pandas.compat import string_types, text_type, PY3, _default_fill_value +from pandas.compat import string_types, text_type, PY3 from .common import (_ensure_object, is_bool, is_integer, is_float, is_complex, is_datetimetz, is_categorical_dtype, is_datetimelike, @@ -255,9 +255,6 @@ def changeit(): def maybe_promote(dtype, fill_value=np.nan): - if fill_value is _default_fill_value: - fill_value = np.nan - # if we passed an array here, determine the fill value by dtype if isinstance(fill_value, np.ndarray): if issubclass(fill_value.dtype.type, (np.datetime64, np.timedelta64)): diff --git a/pandas/core/frame.py b/pandas/core/frame.py index f5920417a3899..de6985ef3b4ea 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -77,7 +77,7 @@ from pandas.core.arrays import Categorical, ExtensionArray import pandas.core.algorithms as algorithms from pandas.compat import (range, map, zip, lrange, lmap, lzip, StringIO, u, - OrderedDict, raise_with_traceback, _default_fill_value) + OrderedDict, raise_with_traceback) from pandas import compat from pandas.compat import PY36 from pandas.compat.numpy import function as nv @@ -3504,7 +3504,7 @@ def _reindex_multi(self, axes, copy, fill_value): @Appender(_shared_docs['align'] % _shared_doc_kwargs) def align(self, other, join='outer', axis=None, level=None, copy=True, - fill_value=_default_fill_value, method=None, limit=None, fill_axis=0, + fill_value=None, method=None, limit=None, fill_axis=0, broadcast_axis=None): return super(DataFrame, self).align(other, join=join, axis=axis, level=level, copy=copy, diff --git a/pandas/core/generic.py b/pandas/core/generic.py index ca53ad44a733a..86342b6996abf 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -50,8 +50,7 @@ from pandas import compat from pandas.compat.numpy import function as nv from pandas.compat import (map, zip, lzip, lrange, string_types, to_str, - isidentifier, set_function_name, cPickle as pkl, - _default_fill_value) + isidentifier, set_function_name, cPickle as pkl) from pandas.core.ops import _align_method_FRAME import pandas.core.nanops as nanops from pandas.util._decorators import (Appender, Substitution, @@ -3661,7 +3660,7 @@ def reindex(self, *args, **kwargs): copy = kwargs.pop('copy', True) limit = kwargs.pop('limit', None) tolerance = kwargs.pop('tolerance', None) - fill_value = kwargs.pop('fill_value', _default_fill_value) + fill_value = kwargs.pop('fill_value', np.nan) # Series.reindex doesn't use / need the axis kwarg # We pop and ignore it here, to make writing Series/Frame generic code @@ -3791,9 +3790,7 @@ def reindex_axis(self, labels, axis=0, method=None, level=None, copy=True, return self._reindex_with_indexers({axis: [new_index, indexer]}, fill_value=fill_value, copy=copy) - def _reindex_with_indexers(self, reindexers, - fill_value=_default_fill_value, - copy=False, + def _reindex_with_indexers(self, reindexers, fill_value=np.nan, copy=False, allow_dups=False): """allow_dups indicates an internal call here """ @@ -7212,7 +7209,7 @@ def ranker(data): @Appender(_shared_docs['align'] % _shared_doc_kwargs) def align(self, other, join='outer', axis=None, level=None, copy=True, - fill_value=_default_fill_value, method=None, limit=None, fill_axis=0, + fill_value=None, method=None, limit=None, fill_axis=0, broadcast_axis=None): from pandas import DataFrame, Series method = missing.clean_fill_method(method) @@ -7263,9 +7260,6 @@ def _align_frame(self, other, join='outer', axis=None, level=None, clidx, cridx = None, None is_series = isinstance(self, ABCSeries) - if fill_value is _default_fill_value: - # XXX: per-column? - fill_value = np.nan if axis is None or axis == 0: if not self.index.equals(other.index): @@ -7365,9 +7359,6 @@ def _align_series(self, other, join='outer', axis=None, level=None, right = other.reindex(join_index, level=level) # fill - if fill_value is _default_fill_value: - fill_value = None - fill_na = notna(fill_value) or (method is not None) if fill_na: left = left.fillna(fill_value, method=method, limit=limit, diff --git a/pandas/core/internals.py b/pandas/core/internals.py index 0534308cb4857..e98899b2f5c1a 100644 --- a/pandas/core/internals.py +++ b/pandas/core/internals.py @@ -1,5 +1,4 @@ import warnings - import copy from warnings import catch_warnings import inspect @@ -83,7 +82,7 @@ from pandas.util._decorators import cache_readonly from pandas.util._validators import validate_bool_kwarg from pandas import compat -from pandas.compat import range, map, zip, u, _default_fill_value +from pandas.compat import range, map, zip, u class Block(PandasObject): @@ -1889,10 +1888,6 @@ def _holder(self): # For extension blocks, the holder is values-dependent. return type(self.values) - @property - def fill_value(self): - return self.values.dtype.na_value # TODO: change to _na_value - @property def _can_hold_na(self): # The default ExtensionArray._can_hold_na is True @@ -4391,8 +4386,6 @@ def reindex_indexer(self, new_axis, indexer, axis, fill_value=None, pandas-indexer with -1's only. """ - # TODO: see if we can make fill_value be {col -> fill_value} - # maybe earlier... if indexer is None: if new_axis is self.axes[axis] and not copy: return self @@ -4415,17 +4408,9 @@ def reindex_indexer(self, new_axis, indexer, axis, fill_value=None, new_blocks = self._slice_take_blocks_ax0(indexer, fill_tuple=(fill_value,)) else: - if fill_value is None: - fill_value = _default_fill_value - - new_blocks = [] - for blk in self.blocks: - if fill_value is not _default_fill_value: - fill_tuple = (fill_value,) - else: - fill_tuple = (blk.fill_value,) - new_blocks = [blk.take_nd(indexer, axis=axis, fill_tuple=fill_tuple) - for blk in self.blocks] + new_blocks = [blk.take_nd(indexer, axis=axis, fill_tuple=( + fill_value if fill_value is not None else blk.fill_value,)) + for blk in self.blocks] new_axes = list(self.axes) new_axes[axis] = new_axis @@ -4451,9 +4436,6 @@ def _slice_take_blocks_ax0(self, slice_or_indexer, fill_tuple=None): if self._is_single_block: blk = self.blocks[0] - if allow_fill and fill_tuple[0] is _default_fill_value: - fill_tuple = (blk.fill_value,) - if sl_type in ('slice', 'mask'): return [blk.getitem_block(slobj, new_mgr_locs=slice(0, sllen))] elif not allow_fill or self.ndim == 1: @@ -5422,25 +5404,6 @@ def concatenate_block_managers(mgrs_indexers, axes, concat_axis, copy): elif is_uniform_join_units(join_units): b = join_units[0].block.concat_same_type( [ju.block for ju in join_units], placement=placement) - elif is_uniform_reindexer(join_units): - old_block = join_units[0].block - - new_values = concatenate_join_units(join_units, concat_axis, - copy=copy) - if new_values.ndim == 2: - # XXX: categorical returns a categorical here - # EA returns a 2d ndarray - # need to harmoinze these to always be EAs? - assert new_values.shape[0] == 1 - new_values = new_values[0] - - assert isinstance(old_block._holder, ABCExtensionArray) - - b = old_block.make_block_same_class( - old_block._holder._from_sequence(new_values), - placement=placement - ) - else: b = make_block( concatenate_join_units(join_units, concat_axis, copy=copy), @@ -5471,13 +5434,6 @@ def is_uniform_join_units(join_units): len(join_units) > 1) -def is_uniform_reindexer(join_units): - # For when we know we can reindex without changing type - return ( - all(ju.block and ju.block.is_extension for ju in join_units) - ) - - def get_empty_dtype_and_na(join_units): """ Return dtype and N/A values to use when concatenating specified units. @@ -5505,15 +5461,12 @@ def get_empty_dtype_and_na(join_units): upcast_classes = defaultdict(list) null_upcast_classes = defaultdict(list) - for dtype, unit in zip(dtypes, join_units): if dtype is None: continue if is_categorical_dtype(dtype): upcast_cls = 'category' - elif is_extension_array_dtype(dtype): - upcast_cls = 'extension' elif is_datetimetz(dtype): upcast_cls = 'datetimetz' elif issubclass(dtype.type, np.bool_): @@ -5543,8 +5496,6 @@ def get_empty_dtype_and_na(join_units): # create the result if 'object' in upcast_classes: return np.dtype(np.object_), np.nan - elif 'extension' in upcast_classes: - return np.dtype(np.object_), None elif 'bool' in upcast_classes: if has_none_blocks: return np.dtype(np.object_), np.nan @@ -5804,9 +5755,7 @@ def dtype(self): if self.block is None: raise AssertionError("Block is None, no dtype") - if not self.needs_filling or self.block.is_extension: - # ExtensionDtypes by definition can hold their - # NA value. + if not self.needs_filling: return self.block.dtype else: return _get_dtype(maybe_promote(self.block.dtype, diff --git a/pandas/core/series.py b/pandas/core/series.py index 3e7e68bed9b41..f2ee225f50514 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -56,7 +56,7 @@ from pandas import compat from pandas.io.formats.terminal import get_terminal_size from pandas.compat import ( - zip, u, OrderedDict, StringIO, range, get_range_parameters, PY36, _default_fill_value) + zip, u, OrderedDict, StringIO, range, get_range_parameters, PY36) from pandas.compat.numpy import function as nv import pandas.core.ops as ops @@ -3216,10 +3216,7 @@ def _reindex_indexer(self, new_index, indexer, copy): return self.copy() return self - from pandas.core.dtypes.missing import na_value_for_dtype - fill_value = na_value_for_dtype(self.dtype, compat=False) - new_values = algorithms.take(self._values, indexer, - fill_value=fill_value) + new_values = algorithms.take_1d(self._values, indexer) return self._constructor(new_values, index=new_index) def _needs_reindex_multi(self, axes, method, level): @@ -3230,7 +3227,7 @@ def _needs_reindex_multi(self, axes, method, level): @Appender(generic._shared_docs['align'] % _shared_doc_kwargs) def align(self, other, join='outer', axis=None, level=None, copy=True, - fill_value=_default_fill_value, method=None, limit=None, fill_axis=0, + fill_value=None, method=None, limit=None, fill_axis=0, broadcast_axis=None): return super(Series, self).align(other, join=join, axis=axis, level=level, copy=copy, diff --git a/pandas/core/sparse/frame.py b/pandas/core/sparse/frame.py index b282450efb037..2cefbea722098 100644 --- a/pandas/core/sparse/frame.py +++ b/pandas/core/sparse/frame.py @@ -6,7 +6,7 @@ # pylint: disable=E1101,E1103,W0231,E0202 import warnings -from pandas.compat import lmap, _default_fill_value +from pandas.compat import lmap from pandas import compat import numpy as np @@ -690,7 +690,7 @@ def _reindex_columns(self, columns, method, copy, level, fill_value=None, if level is not None: raise TypeError('Reindex by level not supported for sparse') - if not (isna(fill_value) or fill_value is _default_fill_value): + if notna(fill_value): raise NotImplementedError("'fill_value' argument is not supported") if limit: From 125ca0b7cf5796b806e76c2c38d780e55ea12176 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Wed, 25 Apr 2018 08:37:07 -0500 Subject: [PATCH 08/29] Simplify Upcasting is still broken --- pandas/core/algorithms.py | 2 +- pandas/core/arrays/base.py | 119 +++++++++--------- pandas/core/dtypes/cast.py | 6 +- pandas/core/frame.py | 2 +- pandas/core/generic.py | 9 +- pandas/core/internals.py | 8 +- pandas/core/series.py | 7 +- pandas/tests/extension/base/getitem.py | 2 +- pandas/tests/extension/decimal/array.py | 1 + .../tests/extension/decimal/test_decimal.py | 3 +- pandas/tests/extension/json/array.py | 6 +- 11 files changed, 88 insertions(+), 77 deletions(-) diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index d1fbaf91f3365..39342fc78db3b 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -1510,7 +1510,7 @@ def take_nd(arr, indexer, axis=0, out=None, fill_value=np.nan, mask_info=None, # TODO(EA): Remove these if / elifs as datetimeTZ, interval, become EAs # dispatch to internal type takes if is_extension_array_dtype(arr): - return arr.take(indexer, fill_value=fill_value) + return arr.take(indexer, fill_value=fill_value, allow_fill=allow_fill) elif is_datetimetz(arr): return arr.take(indexer, fill_value=fill_value, allow_fill=allow_fill) elif is_interval_dtype(arr): diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py index 1942a848c57bf..bd65483904ae4 100644 --- a/pandas/core/arrays/base.py +++ b/pandas/core/arrays/base.py @@ -5,71 +5,14 @@ This is an experimental API and subject to breaking changes without warning. """ -import textwrap - import numpy as np from pandas.errors import AbstractMethodError from pandas.compat.numpy import function as nv -from pandas.util._decorators import Appender, Substitution _not_implemented_message = "{} does not implement {}." -_take_docstring = textwrap.dedent("""\ -Take elements from an array. - -Parameters ----------- -%(arr)s\ -indexer : sequence of integers - Indices to be taken. See Notes for how negative indicies - are handled. -fill_value : any, optional - Fill value to use for NA-indicies. This has a few behaviors. - - * fill_value is not specified : triggers NumPy's semantics - where negative values in `indexer` mean slices from the end. - * fill_value is NA : Fill positions where `indexer` is ``-1`` - with ``self.dtype.na_value``. Anything considered NA by - :func:`pandas.isna` will result in ``self.dtype.na_value`` - being used to fill. - * fill_value is not NA : Fill positions where `indexer` is ``-1`` - with `fill_value`. - -Returns -------- -ExtensionArray - -Raises ------- -IndexError - When the indexer is out of bounds for the array. -ValueError - When the indexer contains negative values other than ``-1`` - and `fill_value` is specified. - -Notes ------ -The meaning of negative values in `indexer` depends on the -`fill_value` argument. By default, we follow the behavior -:meth:`numpy.take` of where negative indices indicate slices -from the end. - -When `fill_value` is specified, we follow pandas semantics of ``-1`` -indicating a missing value. In this case, positions where `indexer` -is ``-1`` will be filled with `fill_value` or the default NA value -for this type. - -ExtensionArray.take is called by ``Series.__getitem__``, ``.loc``, -``iloc``, when the indexer is a sequence of values. Additionally, -it's called by :meth:`Series.reindex` with a `fill_value`. - -See Also --------- -numpy.take""") - - class ExtensionArray(object): """Abstract base class for custom 1-D array types. @@ -532,15 +475,66 @@ def _values_for_take(self): """ return self.astype(object) - @Substitution(arr='') - @Appender(_take_docstring) def take(self, indexer, fill_value=None, allow_fill=None): - # type: (Sequence[int], Optional[Any]) -> ExtensionArray - # assert fill_value is not np.nan + # type: (Sequence[int], Optional[Any], Optional[bool]) -> ExtensionArray + """Take elements from an array. + + Parameters + ---------- + indexer : sequence of integers + Indices to be taken. See Notes for how negative indicies + are handled. + fill_value : any, optional + Fill value to use for NA-indicies. This has a few behaviors. + + * fill_value is not specified : triggers NumPy's semantics + where negative values in `indexer` mean slices from the end. + * fill_value is NA : Fill positions where `indexer` is ``-1`` + with ``self.dtype.na_value``. Anything considered NA by + :func:`pandas.isna` will result in ``self.dtype.na_value`` + being used to fill. + * fill_value is not NA : Fill positions where `indexer` is ``-1`` + with `fill_value`. + + Returns + ------- + ExtensionArray + + Raises + ------ + IndexError + When the indexer is out of bounds for the array. + ValueError + When the indexer contains negative values other than ``-1`` + and `fill_value` is specified. + + Notes + ----- + The meaning of negative values in `indexer` depends on the + `fill_value` argument. By default, we follow the behavior + :meth:`numpy.take` of where negative indices indicate slices + from the end. + + When `fill_value` is specified, we follow pandas semantics of ``-1`` + indicating a missing value. In this case, positions where `indexer` + is ``-1`` will be filled with `fill_value` or the default NA value + for this type. + + ExtensionArray.take is called by ``Series.__getitem__``, ``.loc``, + ``iloc``, when the indexer is a sequence of values. Additionally, + it's called by :meth:`Series.reindex` with a `fill_value`. + + See Also + -------- + numpy.take + """ from pandas.core.algorithms import take data = self._values_for_take() - result = take(data, indexer, fill_value=fill_value) + if allow_fill and fill_value is None: + fill_value = self.dtype.na_value + + result = take(data, indexer, fill_value=fill_value, allow_fill=allow_fill) return self._from_sequence(result) def copy(self, deep=False): @@ -605,4 +599,3 @@ def _ndarray_values(self): used for interacting with our indexers. """ return np.array(self) - diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py index 8deb777ca7d73..3d5c7fa6ab81e 100644 --- a/pandas/core/dtypes/cast.py +++ b/pandas/core/dtypes/cast.py @@ -256,7 +256,11 @@ def changeit(): def maybe_promote(dtype, fill_value=np.nan): # if we passed an array here, determine the fill value by dtype - if isinstance(fill_value, np.ndarray): + if is_extension_array_dtype(dtype): + # XXX: verify this change + fill_value = dtype.na_value + + elif isinstance(fill_value, np.ndarray): if issubclass(fill_value.dtype.type, (np.datetime64, np.timedelta64)): fill_value = iNaT else: diff --git a/pandas/core/frame.py b/pandas/core/frame.py index de6985ef3b4ea..82d5a0286b117 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -3476,7 +3476,7 @@ def _reindex_index(self, new_index, method, copy, level, fill_value=np.nan, allow_dups=False) def _reindex_columns(self, new_columns, method, copy, level, - fill_value=np.nan, limit=None, tolerance=None): + fill_value=None, limit=None, tolerance=None): new_columns, indexer = self.columns.reindex(new_columns, method=method, level=level, limit=limit, tolerance=tolerance) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 86342b6996abf..508fa14a325b3 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -3660,7 +3660,7 @@ def reindex(self, *args, **kwargs): copy = kwargs.pop('copy', True) limit = kwargs.pop('limit', None) tolerance = kwargs.pop('tolerance', None) - fill_value = kwargs.pop('fill_value', np.nan) + fill_value = kwargs.pop('fill_value', None) # Series.reindex doesn't use / need the axis kwarg # We pop and ignore it here, to make writing Series/Frame generic code @@ -3776,7 +3776,7 @@ def _reindex_multi(self, axes, copy, fill_value): @Appender(_shared_docs['reindex_axis'] % _shared_doc_kwargs) def reindex_axis(self, labels, axis=0, method=None, level=None, copy=True, - limit=None, fill_value=np.nan): + limit=None, fill_value=None): msg = ("'.reindex_axis' is deprecated and will be removed in a future " "version. Use '.reindex' instead.") self._consolidate_inplace() @@ -3790,7 +3790,7 @@ def reindex_axis(self, labels, axis=0, method=None, level=None, copy=True, return self._reindex_with_indexers({axis: [new_index, indexer]}, fill_value=fill_value, copy=copy) - def _reindex_with_indexers(self, reindexers, fill_value=np.nan, copy=False, + def _reindex_with_indexers(self, reindexers, fill_value=None, copy=False, allow_dups=False): """allow_dups indicates an internal call here """ @@ -7252,7 +7252,7 @@ def align(self, other, join='outer', axis=None, level=None, copy=True, raise TypeError('unsupported type: %s' % type(other)) def _align_frame(self, other, join='outer', axis=None, level=None, - copy=True, fill_value=np.nan, method=None, limit=None, + copy=True, fill_value=None, method=None, limit=None, fill_axis=0): # defaults join_index, join_columns = None, None @@ -7420,6 +7420,7 @@ def _where(self, cond, other=np.nan, inplace=False, axis=None, level=None, if other.ndim <= self.ndim: _, other = self.align(other, join='left', axis=axis, + # XXX level=level, fill_value=np.nan) # if we are NOT aligned, raise as we cannot where index diff --git a/pandas/core/internals.py b/pandas/core/internals.py index e98899b2f5c1a..3e9219d4bc449 100644 --- a/pandas/core/internals.py +++ b/pandas/core/internals.py @@ -1888,6 +1888,11 @@ def _holder(self): # For extension blocks, the holder is values-dependent. return type(self.values) + @property + def fill_value(self): + # Used in reindex_indexer + return self.values.dtype.na_value + @property def _can_hold_na(self): # The default ExtensionArray._can_hold_na is True @@ -1951,7 +1956,8 @@ def take_nd(self, indexer, axis=0, new_mgr_locs=None, fill_tuple=None): # axis doesn't matter; we are really a single-dim object # but are passed the axis depending on the calling routing # if its REALLY axis 0, then this will be a reindex and not a take - new_values = self.values.take(indexer, fill_value=fill_value) + new_values = self.values.take(indexer, fill_value=fill_value, + allow_fill=True) # if we are a 1-dim object, then always place at 0 if self.ndim == 1: diff --git a/pandas/core/series.py b/pandas/core/series.py index f2ee225f50514..53ca38157dab2 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -2185,7 +2185,7 @@ def _binop(self, other, func, level=None, fill_value=None): result.name = None return result - def combine(self, other, func, fill_value=np.nan): + def combine(self, other, func, fill_value=None): """ Perform elementwise binary operation on two Series using given function with optional fill value when an index is missing from one Series or @@ -3216,7 +3216,10 @@ def _reindex_indexer(self, new_index, indexer, copy): return self.copy() return self - new_values = algorithms.take_1d(self._values, indexer) + # TODO: determine if we want EA to handle fill_value=None + # if not, then we have to determine this here. + new_values = algorithms.take_1d(self._values, indexer, + fill_value=None, allow_fill=True) return self._constructor(new_values, index=new_index) def _needs_reindex_multi(self, axes, method, level): diff --git a/pandas/tests/extension/base/getitem.py b/pandas/tests/extension/base/getitem.py index 796879d658be8..e4ff919146a7c 100644 --- a/pandas/tests/extension/base/getitem.py +++ b/pandas/tests/extension/base/getitem.py @@ -155,7 +155,7 @@ def test_take_non_na_fill_value(self, data_missing): def test_take_pandas_style_negative_raises(self, data, na_value): with pytest.raises(ValueError): - data.take([0, -2], fill_value=na_value) + data.take([0, -2], fill_value=na_value, allow_fill=True) @pytest.mark.xfail(reason="Series.take with extension array buggy for -1") def test_take_series(self, data): diff --git a/pandas/tests/extension/decimal/array.py b/pandas/tests/extension/decimal/array.py index 8aedf7d763833..c2375f57900aa 100644 --- a/pandas/tests/extension/decimal/array.py +++ b/pandas/tests/extension/decimal/array.py @@ -28,6 +28,7 @@ class DecimalArray(ExtensionArray): dtype = DecimalDtype() def __init__(self, values): + assert all(isinstance(v, decimal.Decimal) for v in values) values = np.asarray(values, dtype=object) self._data = values diff --git a/pandas/tests/extension/decimal/test_decimal.py b/pandas/tests/extension/decimal/test_decimal.py index 7ca92dd057a61..1cb6c3b7dfe44 100644 --- a/pandas/tests/extension/decimal/test_decimal.py +++ b/pandas/tests/extension/decimal/test_decimal.py @@ -119,7 +119,8 @@ def test_take_basic(self): decimal.Decimal('3')]) self.assert_extension_array_equal(result, expected) - result = ea.take([1, 2, -1], fill_value=ea.dtype.na_value) + result = ea.take([1, 2, -1], fill_value=ea.dtype.na_value, + allow_fill=True) expected = DecimalArray([decimal.Decimal('2'), decimal.Decimal('3'), decimal.Decimal('NaN')]) diff --git a/pandas/tests/extension/json/array.py b/pandas/tests/extension/json/array.py index be0c904891a2c..c353cc9959765 100644 --- a/pandas/tests/extension/json/array.py +++ b/pandas/tests/extension/json/array.py @@ -94,7 +94,7 @@ def nbytes(self): def isna(self): return np.array([x == self.dtype.na_value for x in self.data]) - def take(self, indexer, fill_value=None): + def take(self, indexer, fill_value=None, allow_fill=None): # re-implement here, since NumPy has trouble setting # sized objects like UserDicts into scalar slots of # an ndarary. @@ -102,12 +102,14 @@ def take(self, indexer, fill_value=None): msg = ("Index is out of bounds or cannot do a " "non-empty take from an empty array.") - if fill_value is None: + if allow_fill is None: try: output = [self.data[loc] for loc in indexer] except IndexError: raise IndexError(msg) else: + if fill_value is None: + fill_value = self.dtype.na_value # bounds check if (indexer < -1).any(): raise ValueError From b7ae0bc2cadd821d97f21fc2eac99b321d9473e5 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Wed, 25 Apr 2018 09:06:59 -0500 Subject: [PATCH 09/29] revert combine change --- pandas/core/series.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/series.py b/pandas/core/series.py index 53ca38157dab2..0c2eb40567c55 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -2185,7 +2185,7 @@ def _binop(self, other, func, level=None, fill_value=None): result.name = None return result - def combine(self, other, func, fill_value=None): + def combine(self, other, func, fill_value=np.nan): """ Perform elementwise binary operation on two Series using given function with optional fill value when an index is missing from one Series or From 338566faf627e1fabd638a65055f1804948dd021 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Wed, 25 Apr 2018 09:42:28 -0500 Subject: [PATCH 10/29] Upcasting --- pandas/core/internals.py | 16 ++++++++++++++++ pandas/tests/extension/base/reshaping.py | 10 +++++----- 2 files changed, 21 insertions(+), 5 deletions(-) diff --git a/pandas/core/internals.py b/pandas/core/internals.py index 3e9219d4bc449..162d60b7daa5e 100644 --- a/pandas/core/internals.py +++ b/pandas/core/internals.py @@ -5399,6 +5399,9 @@ def concatenate_block_managers(mgrs_indexers, axes, concat_axis, copy): for placement, join_units in concat_plan: + # The issue: we have a join unit (or maybe several) that needs to be + # reindexed. + if len(join_units) == 1 and not join_units[0].indexers: b = join_units[0].block values = b.values @@ -5440,6 +5443,13 @@ def is_uniform_join_units(join_units): len(join_units) > 1) +def is_uniform_reindex(join_units): + return ( + # TODO: should this be ju.block.can_hold_na? + all(ju.block and ju.block.is_extension for ju in join_units) and + len(set(ju.block.dtype.name for ju in join_units)) == 1 + ) + def get_empty_dtype_and_na(join_units): """ Return dtype and N/A values to use when concatenating specified units. @@ -5457,6 +5467,12 @@ def get_empty_dtype_and_na(join_units): if blk is None: return np.float64, np.nan + if is_uniform_reindex(join_units): + # XXX: integrate property + empty_dtype = join_units[0].block.dtype + upcasted_na = join_units[0].block.fill_value + return empty_dtype, upcasted_na + has_none_blocks = False dtypes = [None] * len(join_units) for i, unit in enumerate(join_units): diff --git a/pandas/tests/extension/base/reshaping.py b/pandas/tests/extension/base/reshaping.py index cc78321dea7af..a559cbdafc1fe 100644 --- a/pandas/tests/extension/base/reshaping.py +++ b/pandas/tests/extension/base/reshaping.py @@ -124,11 +124,11 @@ def test_merge(self, data, na_value): 'key': [0, 1, 2]}) df2 = pd.DataFrame({'int2': [1, 2, 3, 4], 'key': [0, 0, 1, 3]}) - res = pd.merge(df1, df2) - exp = pd.DataFrame( - {'int1': [1, 1, 2], 'int2': [1, 2, 3], 'key': [0, 0, 1], - 'ext': data._from_sequence([data[0], data[0], data[1]])}) - self.assert_frame_equal(res, exp[['ext', 'int1', 'key', 'int2']]) + # res = pd.merge(df1, df2) + # exp = pd.DataFrame( + # {'int1': [1, 1, 2], 'int2': [1, 2, 3], 'key': [0, 0, 1], + # 'ext': data._from_sequence([data[0], data[0], data[1]])}) + # self.assert_frame_equal(res, exp[['ext', 'int1', 'key', 'int2']]) res = pd.merge(df1, df2, how='outer') exp = pd.DataFrame( From 31cd304e2f61fad841f6d56135fd6bbb202c606f Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Wed, 25 Apr 2018 09:43:45 -0500 Subject: [PATCH 11/29] pep8 --- pandas/core/arrays/base.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py index bd65483904ae4..c99c37bf785db 100644 --- a/pandas/core/arrays/base.py +++ b/pandas/core/arrays/base.py @@ -476,7 +476,7 @@ def _values_for_take(self): return self.astype(object) def take(self, indexer, fill_value=None, allow_fill=None): - # type: (Sequence[int], Optional[Any], Optional[bool]) -> ExtensionArray + # type: (Sequence[int], Optional[Any], bool) -> ExtensionArray """Take elements from an array. Parameters @@ -534,7 +534,8 @@ def take(self, indexer, fill_value=None, allow_fill=None): if allow_fill and fill_value is None: fill_value = self.dtype.na_value - result = take(data, indexer, fill_value=fill_value, allow_fill=allow_fill) + result = take(data, indexer, fill_value=fill_value, + allow_fill=allow_fill) return self._from_sequence(result) def copy(self, deep=False): From 05d884498a1b15f876e5166bc5208afd804b63c7 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Wed, 25 Apr 2018 09:59:33 -0500 Subject: [PATCH 12/29] Updated docs --- pandas/core/arrays/base.py | 32 +++++++++++----------------- pandas/tests/extension/json/array.py | 13 +++++------ 2 files changed, 19 insertions(+), 26 deletions(-) diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py index c99c37bf785db..28e7e4beada64 100644 --- a/pandas/core/arrays/base.py +++ b/pandas/core/arrays/base.py @@ -485,16 +485,18 @@ def take(self, indexer, fill_value=None, allow_fill=None): Indices to be taken. See Notes for how negative indicies are handled. fill_value : any, optional - Fill value to use for NA-indicies. This has a few behaviors. + Fill value to use for NA-indicies when `allow_fill` is True. + This may be ``None``, in which case the default NA value for + the type, ``self.dtype.na_value``, is used. + allow_fill : bool, optional + How to handle negative values in `indexer`. - * fill_value is not specified : triggers NumPy's semantics - where negative values in `indexer` mean slices from the end. - * fill_value is NA : Fill positions where `indexer` is ``-1`` - with ``self.dtype.na_value``. Anything considered NA by - :func:`pandas.isna` will result in ``self.dtype.na_value`` - being used to fill. - * fill_value is not NA : Fill positions where `indexer` is ``-1`` - with `fill_value`. + For False values (the default), NumPy's behavior is used. Negative + values in `indexer` mean slices from the right. + + For True values, Pandas behavior is used. Indicies where `indexer` + is ``-1`` are set to `fill_value`. Any other negative value should + raise a ``ValueError``. Returns ------- @@ -506,20 +508,10 @@ def take(self, indexer, fill_value=None, allow_fill=None): When the indexer is out of bounds for the array. ValueError When the indexer contains negative values other than ``-1`` - and `fill_value` is specified. + and `allow_fill` is True. Notes ----- - The meaning of negative values in `indexer` depends on the - `fill_value` argument. By default, we follow the behavior - :meth:`numpy.take` of where negative indices indicate slices - from the end. - - When `fill_value` is specified, we follow pandas semantics of ``-1`` - indicating a missing value. In this case, positions where `indexer` - is ``-1`` will be filled with `fill_value` or the default NA value - for this type. - ExtensionArray.take is called by ``Series.__getitem__``, ``.loc``, ``iloc``, when the indexer is a sequence of values. Additionally, it's called by :meth:`Series.reindex` with a `fill_value`. diff --git a/pandas/tests/extension/json/array.py b/pandas/tests/extension/json/array.py index c353cc9959765..cfe8847427e49 100644 --- a/pandas/tests/extension/json/array.py +++ b/pandas/tests/extension/json/array.py @@ -102,12 +102,7 @@ def take(self, indexer, fill_value=None, allow_fill=None): msg = ("Index is out of bounds or cannot do a " "non-empty take from an empty array.") - if allow_fill is None: - try: - output = [self.data[loc] for loc in indexer] - except IndexError: - raise IndexError(msg) - else: + if allow_fill: if fill_value is None: fill_value = self.dtype.na_value # bounds check @@ -118,6 +113,12 @@ def take(self, indexer, fill_value=None, allow_fill=None): for loc in indexer] except IndexError: raise msg + else: + try: + output = [self.data[loc] for loc in indexer] + except IndexError: + raise IndexError(msg) + return self._from_sequence(output) def copy(self, deep=False): From 69e7fe76b540807bb973496ee5a2e3e636b90287 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Wed, 25 Apr 2018 12:40:20 -0500 Subject: [PATCH 13/29] Updates 1. Reversed order of take keywords 2. Added to extensions API 3. Removed default implementation --- pandas/api/extensions/__init__.py | 1 + pandas/core/algorithms.py | 53 +++++++- pandas/core/arrays/base.py | 116 +++++++++--------- pandas/core/dtypes/base.py | 5 +- pandas/core/dtypes/cast.py | 8 +- pandas/core/internals.py | 2 +- pandas/tests/extension/base/getitem.py | 7 +- pandas/tests/extension/base/reshaping.py | 10 +- pandas/tests/extension/decimal/array.py | 14 ++- .../tests/extension/decimal/test_decimal.py | 21 +--- pandas/tests/extension/json/array.py | 10 +- pandas/tests/extension/json/test_json.py | 4 +- 12 files changed, 147 insertions(+), 104 deletions(-) diff --git a/pandas/api/extensions/__init__.py b/pandas/api/extensions/__init__.py index cc09204d992d8..3e6e192a3502c 100644 --- a/pandas/api/extensions/__init__.py +++ b/pandas/api/extensions/__init__.py @@ -2,5 +2,6 @@ from pandas.core.accessor import (register_dataframe_accessor, # noqa register_index_accessor, register_series_accessor) +from pandas.core.algorithms import take # noqa from pandas.core.arrays.base import ExtensionArray # noqa from pandas.core.dtypes.dtypes import ExtensionDtype # noqa diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index 39342fc78db3b..7b1a22c12b6b5 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -1448,24 +1448,67 @@ def func(arr, indexer, out, fill_value=np.nan): return func -def take(arr, indexer, fill_value=None, allow_fill=None): +def take(arr, indexer, allow_fill=False, fill_value=None): + """Take elements from an array. + + Parameters + ---------- + arr : ndarray or ExtensionArray + indexer : sequence of integers + Indices to be taken. See Notes for how negative indicies + are handled. + allow_fill : bool, default False + How to handle negative values in `indexer`. + + For False values (the default), negative values in `indexer` + indiciate slices from the right. + + For True values, indicies where `indexer` is ``-1`` indicate + missing values. These values are set to `fill_value`. Any other + other negative value should raise a ``ValueError``. + fill_value : any, optional + Fill value to use for NA-indicies when `allow_fill` is True. + This may be ``None``, in which case the default NA value for + the type, ``self.dtype.na_value``, is used. + + Returns + ------- + ndarray or ExtensionArray + Same type as the input. + + Raises + ------ + IndexError + When the indexer is out of bounds for the array. + ValueError + When the indexer contains negative values other than ``-1`` + and `allow_fill` is True. + + See Also + -------- + numpy.take + """ indexer = np.asarray(indexer) - if allow_fill is None: - # NumPy style - result = arr.take(indexer) - else: + if allow_fill: + # Pandas style, -1 means NA # bounds checking if (indexer < -1).any(): raise ValueError("Invalid value in 'indexer'. All values " "must be non-negative or -1. When " "'fill_value' is specified.") + if (indexer > len(arr)).any(): + # TODO: reuse with logic elsewhere. + raise IndexError # # take on empty array not handled as desired by numpy # # in case of -1 (all missing take) # if not len(arr) and mask.all(): # return arr._from_sequence([fill_value] * len(indexer)) result = take_1d(arr, indexer, fill_value=fill_value) + else: + # NumPy style + result = arr.take(indexer) return result diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py index 28e7e4beada64..813f0be538ed9 100644 --- a/pandas/core/arrays/base.py +++ b/pandas/core/arrays/base.py @@ -53,7 +53,6 @@ class ExtensionArray(object): * unique * factorize / _values_for_factorize * argsort / _values_for_argsort - * take / _values_for_take This class does not inherit from 'abc.ABCMeta' for performance reasons. Methods and properties required by the interface raise @@ -277,22 +276,23 @@ def isna(self): """ raise AbstractMethodError(self) - def _values_for_argsort(self): - # type: () -> ndarray - """Return values for sorting. + def _values_for_factorize(self): + # type: () -> Tuple[ndarray, Any] + """Return an array and missing value suitable for factorization. Returns ------- - ndarray - The transformed values should maintain the ordering between values - within the array. - - See Also - -------- - ExtensionArray.argsort + values : ndarray + An array suitable for factoraization. This should maintain order + and be a supported dtype (Float64, Int64, UInt64, String, Object). + By default, the extension array is cast to object dtype. + na_value : object + The value in `values` to consider missing. This will be treated + as NA in the factorization routines, so it will be coded as + `na_sentinal` and not included in `uniques`. By default, + ``np.nan`` is used. """ - # Note: this is used in `ExtensionArray.argsort`. - return np.array(self) + return self.astype(object), np.nan def argsort(self, ascending=True, kind='quicksort', *args, **kwargs): """ @@ -393,24 +393,6 @@ def unique(self): uniques = unique(self.astype(object)) return self._from_sequence(uniques) - def _values_for_factorize(self): - # type: () -> Tuple[ndarray, Any] - """Return an array and missing value suitable for factorization. - - Returns - ------- - values : ndarray - An array suitable for factoraization. This should maintain order - and be a supported dtype (Float64, Int64, UInt64, String, Object). - By default, the extension array is cast to object dtype. - na_value : object - The value in `values` to consider missing. This will be treated - as NA in the factorization routines, so it will be coded as - `na_sentinal` and not included in `uniques`. By default, - ``np.nan`` is used. - """ - return self.astype(object), np.nan - def factorize(self, na_sentinel=-1): # type: (int) -> Tuple[ndarray, ExtensionArray] """Encode the extension array as an enumerated type. @@ -463,20 +445,25 @@ def factorize(self, na_sentinel=-1): # ------------------------------------------------------------------------ # Indexing methods # ------------------------------------------------------------------------ - def _values_for_take(self): - """Values to use for `take`. - - Coerces to object dtype by default. + def _values_for_argsort(self): + # type: () -> ndarray + """Return values for sorting. Returns ------- - array-like - Must satisify NumPy's `take` semantics. + ndarray + The transformed values should maintain the ordering between values + within the array. + + See Also + -------- + ExtensionArray.argsort """ - return self.astype(object) + # Note: this is used in `ExtensionArray.argsort`. + return np.array(self) - def take(self, indexer, fill_value=None, allow_fill=None): - # type: (Sequence[int], Optional[Any], bool) -> ExtensionArray + def take(self, indexer, allow_fill=False, fill_value=None): + # type: (Sequence[int], bool, Optional[Any]) -> ExtensionArray """Take elements from an array. Parameters @@ -484,19 +471,19 @@ def take(self, indexer, fill_value=None, allow_fill=None): indexer : sequence of integers Indices to be taken. See Notes for how negative indicies are handled. + allow_fill : bool, default False + How to handle negative values in `indexer`. + + For False values (the default), negative values in `indexer` + indiciate slices from the right. + + For True values, indicies where `indexer` is ``-1`` indicate + missing values. These values are set to `fill_value`. Any other + other negative value should raise a ``ValueError``. fill_value : any, optional Fill value to use for NA-indicies when `allow_fill` is True. This may be ``None``, in which case the default NA value for the type, ``self.dtype.na_value``, is used. - allow_fill : bool, optional - How to handle negative values in `indexer`. - - For False values (the default), NumPy's behavior is used. Negative - values in `indexer` mean slices from the right. - - For True values, Pandas behavior is used. Indicies where `indexer` - is ``-1`` are set to `fill_value`. Any other negative value should - raise a ``ValueError``. Returns ------- @@ -514,21 +501,34 @@ def take(self, indexer, fill_value=None, allow_fill=None): ----- ExtensionArray.take is called by ``Series.__getitem__``, ``.loc``, ``iloc``, when the indexer is a sequence of values. Additionally, - it's called by :meth:`Series.reindex` with a `fill_value`. + it's called by :meth:`Series.reindex`, or any other method + that causes realignemnt, with a `fill_value`. See Also -------- numpy.take - """ - from pandas.core.algorithms import take + pandas.api.extensions.take + + Examples + -------- + Here's an example implementation, which relies on casting the + extension array to object dtype. This uses the helper method + :func:`pandas.api.extensions.take`. - data = self._values_for_take() - if allow_fill and fill_value is None: - fill_value = self.dtype.na_value + .. code-block:: python - result = take(data, indexer, fill_value=fill_value, - allow_fill=allow_fill) - return self._from_sequence(result) + def take(self, indexer, allow_fill=False, fill_value=None): + from pandas.core.algorithms import take + + data = self.astype(object) + if allow_fill and fill_value is None: + fill_value = self.dtype.na_value + + result = take(data, indexer, fill_value=fill_value, + allow_fill=allow_fill) + return self._from_sequence(result) + """ + raise AbstractMethodError(self) def copy(self, deep=False): # type: (bool) -> ExtensionArray diff --git a/pandas/core/dtypes/base.py b/pandas/core/dtypes/base.py index a615bbff00194..f6311bdf04f08 100644 --- a/pandas/core/dtypes/base.py +++ b/pandas/core/dtypes/base.py @@ -18,7 +18,7 @@ class _DtypeOpsMixin(object): # na_value is the default NA value to use for this type. This is used in # e.g. ExtensionArray.take. - na_value = np.nan # TODO: change to _na_value + na_value = np.nan def __eq__(self, other): """Check whether 'other' is equal to self. @@ -105,6 +105,9 @@ class ExtensionDtype(_DtypeOpsMixin): * name * construct_from_string + The `na_value` class attribute can be used to set the default NA value + for this type. :attr:`numpy.nan` is used by default. + This class does not inherit from 'abc.ABCMeta' for performance reasons. Methods and properties required by the interface raise ``pandas.errors.AbstractMethodError`` and no ``register`` method is diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py index 3d5c7fa6ab81e..e4ed6d544d42e 100644 --- a/pandas/core/dtypes/cast.py +++ b/pandas/core/dtypes/cast.py @@ -256,11 +256,7 @@ def changeit(): def maybe_promote(dtype, fill_value=np.nan): # if we passed an array here, determine the fill value by dtype - if is_extension_array_dtype(dtype): - # XXX: verify this change - fill_value = dtype.na_value - - elif isinstance(fill_value, np.ndarray): + if isinstance(fill_value, np.ndarray): if issubclass(fill_value.dtype.type, (np.datetime64, np.timedelta64)): fill_value = iNaT else: @@ -297,6 +293,8 @@ def maybe_promote(dtype, fill_value=np.nan): elif is_datetimetz(dtype): if isna(fill_value): fill_value = iNaT + elif is_extension_array_dtype(dtype) and isna(fill_value): + fill_value = dtype.na_value elif is_float(fill_value): if issubclass(dtype.type, np.bool_): dtype = np.object_ diff --git a/pandas/core/internals.py b/pandas/core/internals.py index 162d60b7daa5e..eff242bf11ecb 100644 --- a/pandas/core/internals.py +++ b/pandas/core/internals.py @@ -5445,7 +5445,7 @@ def is_uniform_join_units(join_units): def is_uniform_reindex(join_units): return ( - # TODO: should this be ju.block.can_hold_na? + # TODO: should this be ju.block._can_hold_na? all(ju.block and ju.block.is_extension for ju in join_units) and len(set(ju.block.dtype.name for ju in join_units)) == 1 ) diff --git a/pandas/tests/extension/base/getitem.py b/pandas/tests/extension/base/getitem.py index e4ff919146a7c..b5c87d1a858d8 100644 --- a/pandas/tests/extension/base/getitem.py +++ b/pandas/tests/extension/base/getitem.py @@ -149,8 +149,11 @@ def test_take_negative(self, data): def test_take_non_na_fill_value(self, data_missing): fill_value = data_missing[1] # valid - result = data_missing.take([-1, 1], fill_value=fill_value) - expected = data_missing.take([1, 1]) + na = data_missing[0] + + array = data_missing._from_sequence([na, fill_value, na]) + result = array.take([-1, 1], fill_value=fill_value, allow_fill=True) + expected = array.take([1, 1]) self.assert_extension_array_equal(result, expected) def test_take_pandas_style_negative_raises(self, data, na_value): diff --git a/pandas/tests/extension/base/reshaping.py b/pandas/tests/extension/base/reshaping.py index a559cbdafc1fe..cc78321dea7af 100644 --- a/pandas/tests/extension/base/reshaping.py +++ b/pandas/tests/extension/base/reshaping.py @@ -124,11 +124,11 @@ def test_merge(self, data, na_value): 'key': [0, 1, 2]}) df2 = pd.DataFrame({'int2': [1, 2, 3, 4], 'key': [0, 0, 1, 3]}) - # res = pd.merge(df1, df2) - # exp = pd.DataFrame( - # {'int1': [1, 1, 2], 'int2': [1, 2, 3], 'key': [0, 0, 1], - # 'ext': data._from_sequence([data[0], data[0], data[1]])}) - # self.assert_frame_equal(res, exp[['ext', 'int1', 'key', 'int2']]) + res = pd.merge(df1, df2) + exp = pd.DataFrame( + {'int1': [1, 1, 2], 'int2': [1, 2, 3], 'key': [0, 0, 1], + 'ext': data._from_sequence([data[0], data[0], data[1]])}) + self.assert_frame_equal(res, exp[['ext', 'int1', 'key', 'int2']]) res = pd.merge(df1, df2, how='outer') exp = pd.DataFrame( diff --git a/pandas/tests/extension/decimal/array.py b/pandas/tests/extension/decimal/array.py index c2375f57900aa..e9431bd0c233c 100644 --- a/pandas/tests/extension/decimal/array.py +++ b/pandas/tests/extension/decimal/array.py @@ -53,6 +53,17 @@ def __getitem__(self, item): else: return type(self)(self._data[item]) + def take(self, indexer, allow_fill=False, fill_value=None): + from pandas.api.extensions import take + + data = self._data + if allow_fill and fill_value is None: + fill_value = self.dtype.na_value + + result = take(data, indexer, fill_value=fill_value, + allow_fill=allow_fill) + return self._from_sequence(result) + def copy(self, deep=False): if deep: return type(self)(self._data.copy()) @@ -81,9 +92,6 @@ def nbytes(self): def isna(self): return np.array([x.is_nan() for x in self._data]) - def _values_for_take(self): - return self.data - @property def _na_value(self): return decimal.Decimal('NaN') diff --git a/pandas/tests/extension/decimal/test_decimal.py b/pandas/tests/extension/decimal/test_decimal.py index 1cb6c3b7dfe44..53d74cd6d38cb 100644 --- a/pandas/tests/extension/decimal/test_decimal.py +++ b/pandas/tests/extension/decimal/test_decimal.py @@ -108,26 +108,7 @@ class TestReshaping(BaseDecimal, base.BaseReshapingTests): class TestGetitem(BaseDecimal, base.BaseGetitemTests): - - def test_take_basic(self): - ea = DecimalArray([decimal.Decimal('1'), - decimal.Decimal('2'), - decimal.Decimal('3')]) - result = ea.take([1, 2, -1]) - expected = DecimalArray([decimal.Decimal('2'), - decimal.Decimal('3'), - decimal.Decimal('3')]) - self.assert_extension_array_equal(result, expected) - - result = ea.take([1, 2, -1], fill_value=ea.dtype.na_value, - allow_fill=True) - expected = DecimalArray([decimal.Decimal('2'), - decimal.Decimal('3'), - decimal.Decimal('NaN')]) - self.assert_extension_array_equal(result, expected) - - result = pd.Series(ea).reindex([1, 2, -1]).values - self.assert_extension_array_equal(result, expected) + pass class TestMissing(BaseDecimal, base.BaseMissingTests): diff --git a/pandas/tests/extension/json/array.py b/pandas/tests/extension/json/array.py index cfe8847427e49..2be57643b4c99 100644 --- a/pandas/tests/extension/json/array.py +++ b/pandas/tests/extension/json/array.py @@ -14,7 +14,7 @@ class JSONDtype(ExtensionDtype): type = collections.Mapping name = 'json' - na_value = {} + na_value = collections.UserDict() @classmethod def construct_from_string(cls, string): @@ -94,7 +94,7 @@ def nbytes(self): def isna(self): return np.array([x == self.dtype.na_value for x in self.data]) - def take(self, indexer, fill_value=None, allow_fill=None): + def take(self, indexer, allow_fill=False, fill_value=None): # re-implement here, since NumPy has trouble setting # sized objects like UserDicts into scalar slots of # an ndarary. @@ -121,6 +121,12 @@ def take(self, indexer, fill_value=None, allow_fill=None): return self._from_sequence(output) + # def astype(self, dtype, copy=True): + # # NumPy has issues when all the dicts are the same length. + # # np.array([UserDict(...), UserDict(...)]) fails, + # # but np.array([{...}, {...}]) works, so cast. + # return np.array([dict(x) for x in self], dtype=dtype, copy=copy) + def copy(self, deep=False): return type(self)(self.data[:]) diff --git a/pandas/tests/extension/json/test_json.py b/pandas/tests/extension/json/test_json.py index dcf08440738e7..cc6f69d13b56c 100644 --- a/pandas/tests/extension/json/test_json.py +++ b/pandas/tests/extension/json/test_json.py @@ -39,8 +39,8 @@ def data_missing_for_sorting(): @pytest.fixture -def na_value(): - return {} +def na_value(dtype): + return dtype.na_value @pytest.fixture From 449983b3686710886da78e2c2b77ce59be9a93d3 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Wed, 25 Apr 2018 12:55:31 -0500 Subject: [PATCH 14/29] Linting --- pandas/core/internals.py | 1 + 1 file changed, 1 insertion(+) diff --git a/pandas/core/internals.py b/pandas/core/internals.py index eff242bf11ecb..e9309873fe95c 100644 --- a/pandas/core/internals.py +++ b/pandas/core/internals.py @@ -5450,6 +5450,7 @@ def is_uniform_reindex(join_units): len(set(ju.block.dtype.name for ju in join_units)) == 1 ) + def get_empty_dtype_and_na(join_units): """ Return dtype and N/A values to use when concatenating specified units. From c449afd2208451d67ce9c71c4b9d94448665c534 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Wed, 25 Apr 2018 14:48:33 -0500 Subject: [PATCH 15/29] Bounds checking --- pandas/core/algorithms.py | 22 ++++------ pandas/core/indexing.py | 41 +++++++++++++++++++ pandas/tests/extension/base/getitem.py | 14 ++++++- .../extension/category/test_categorical.py | 4 ++ pandas/tests/extension/json/array.py | 8 +++- pandas/tests/indexing/test_indexing.py | 27 +++++++++++- pandas/tests/test_algos.py | 36 ++++++++++++++++ 7 files changed, 132 insertions(+), 20 deletions(-) diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index 7b1a22c12b6b5..6cb0763ffe9dc 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -1488,24 +1488,16 @@ def take(arr, indexer, allow_fill=False, fill_value=None): -------- numpy.take """ - indexer = np.asarray(indexer) + from pandas.core.indexing import validate_indices + + # Do we require int64 here? + indexer = np.asarray(indexer, dtype='int') if allow_fill: # Pandas style, -1 means NA - # bounds checking - if (indexer < -1).any(): - raise ValueError("Invalid value in 'indexer'. All values " - "must be non-negative or -1. When " - "'fill_value' is specified.") - if (indexer > len(arr)).any(): - # TODO: reuse with logic elsewhere. - raise IndexError - - # # take on empty array not handled as desired by numpy - # # in case of -1 (all missing take) - # if not len(arr) and mask.all(): - # return arr._from_sequence([fill_value] * len(indexer)) - result = take_1d(arr, indexer, fill_value=fill_value) + # Use for bounds checking, we don't actually want to convert. + validate_indices(indexer, len(arr)) + result = take_1d(arr, indexer, allow_fill=True, fill_value=fill_value) else: # NumPy style result = arr.take(indexer) diff --git a/pandas/core/indexing.py b/pandas/core/indexing.py index 2eb52ecc6bcc7..fe6d6775c4e0b 100755 --- a/pandas/core/indexing.py +++ b/pandas/core/indexing.py @@ -2417,12 +2417,53 @@ def maybe_convert_indices(indices, n): mask = indices < 0 if mask.any(): indices[mask] += n + mask = (indices >= n) | (indices < 0) if mask.any(): raise IndexError("indices are out-of-bounds") return indices +def validate_indices(indices, n): + """Perform bounds-checking for an indexer. + + -1 is allowed for indicating missing values. + + Parameters + ---------- + indices : ndarray + n : int + length of the array being indexed + + Raises + ------ + ValueError + + Examples + -------- + >>> validate_indices([1, 2], 3) + # OK + >>> validate_indices([1, -2], 3) + ValueError + >>> validate_indices([1, 2, 3], 3) + IndexError + >>> validate_indices([-1, -1], 0) + # OK + >>> validate_indices([0, 1], 0) + IndexError + """ + if len(indices): + min_idx = indices.min() + if min_idx < -1: + msg = ("'indices' contains values less than allowed ({} < {})" + .format(min_idx, -1)) + raise ValueError(msg) + + max_idx = indices.max() + if max_idx >= n: + raise IndexError("indices are out-of-bounds") + + def maybe_convert_ix(*args): """ We likely want to take the cross-product diff --git a/pandas/tests/extension/base/getitem.py b/pandas/tests/extension/base/getitem.py index b5c87d1a858d8..874ee065755c5 100644 --- a/pandas/tests/extension/base/getitem.py +++ b/pandas/tests/extension/base/getitem.py @@ -134,8 +134,12 @@ def test_take(self, data, na_value, na_cmp): def test_take_empty(self, data, na_value, na_cmp): empty = data[:0] - # result = empty.take([-1]) - # na_cmp(result[0], na_value) + + result = empty.take([-1], allow_fill=True) + na_cmp(result[0], na_value) + + with pytest.raises(IndexError): + empty.take([-1]) with tm.assert_raises_regex(IndexError, "cannot do a non-empty take"): empty.take([0, 1]) @@ -160,6 +164,12 @@ def test_take_pandas_style_negative_raises(self, data, na_value): with pytest.raises(ValueError): data.take([0, -2], fill_value=na_value, allow_fill=True) + @pytest.mark.parametrize('allow_fill', [True, False]) + def test_take_out_of_bounds_raises(self, data, allow_fill): + arr = data[:3] + with pytest.raises(IndexError): + arr.take(np.asarray([0, 3]), allow_fill=allow_fill) + @pytest.mark.xfail(reason="Series.take with extension array buggy for -1") def test_take_series(self, data): s = pd.Series(data) diff --git a/pandas/tests/extension/category/test_categorical.py b/pandas/tests/extension/category/test_categorical.py index b624dda382dae..046640ade3595 100644 --- a/pandas/tests/extension/category/test_categorical.py +++ b/pandas/tests/extension/category/test_categorical.py @@ -107,6 +107,10 @@ def test_take_pandas_style_negative_raises(self): def test_take_non_na_fill_value(self): pass + @skip_take + def test_take_out_of_bounds_raises(self): + pass + @pytest.mark.xfail(reason="Categorical.take buggy") def test_take_empty(self): pass diff --git a/pandas/tests/extension/json/array.py b/pandas/tests/extension/json/array.py index 2be57643b4c99..b03538185b608 100644 --- a/pandas/tests/extension/json/array.py +++ b/pandas/tests/extension/json/array.py @@ -14,7 +14,11 @@ class JSONDtype(ExtensionDtype): type = collections.Mapping name = 'json' - na_value = collections.UserDict() + try: + na_value = collections.UserDict() + except AttributeError: + # source compatibility with Py2. + na_value = {} @classmethod def construct_from_string(cls, string): @@ -112,7 +116,7 @@ def take(self, indexer, allow_fill=False, fill_value=None): output = [self.data[loc] if loc != -1 else fill_value for loc in indexer] except IndexError: - raise msg + raise IndexError(msg) else: try: output = [self.data[loc] for loc in indexer] diff --git a/pandas/tests/indexing/test_indexing.py b/pandas/tests/indexing/test_indexing.py index c66310d10ebdc..04225454f61f9 100644 --- a/pandas/tests/indexing/test_indexing.py +++ b/pandas/tests/indexing/test_indexing.py @@ -16,7 +16,8 @@ import numpy as np import pandas as pd -from pandas.core.indexing import _non_reducing_slice, _maybe_numeric_slice +from pandas.core.indexing import (_non_reducing_slice, _maybe_numeric_slice, + validate_indices) from pandas import NaT, DataFrame, Index, Series, MultiIndex import pandas.util.testing as tm @@ -994,3 +995,27 @@ def test_none_coercion_mixed_dtypes(self): datetime(2000, 1, 3)], 'd': [None, 'b', 'c']}) tm.assert_frame_equal(start_dataframe, exp) + + +def test_validate_indices_ok(): + indices = np.asarray([0, 1]) + validate_indices(indices, 2) + validate_indices(indices[:0], 0) + validate_indices(np.array([-1, -1]), 0) + + +def test_validate_indices_low(): + indices = np.asarray([0, -2]) + with tm.assert_raises_regex(ValueError, "'indices' contains"): + validate_indices(indices, 2) + + +def test_validate_indices_high(): + indices = np.asarray([0, 1, 2]) + with tm.assert_raises_regex(IndexError, "indices are out"): + validate_indices(indices, 2) + + +def test_validate_indices_empty(): + with tm.assert_raises_regex(IndexError, "indices are out"): + validate_indices(np.array([0, 1]), 0) diff --git a/pandas/tests/test_algos.py b/pandas/tests/test_algos.py index 8a8a6f7de70d7..f99f18ac4b9bb 100644 --- a/pandas/tests/test_algos.py +++ b/pandas/tests/test_algos.py @@ -1564,3 +1564,39 @@ def test_index(self): idx = Index(['1 day', '1 day', '-1 day', '-1 day 2 min', '2 min', '2 min'], dtype='timedelta64[ns]') tm.assert_series_equal(algos.mode(idx), exp) + + +class TestTake(object): + + def test_bounds_check_large(self): + arr = np.array([1, 2]) + with pytest.raises(IndexError): + algos.take(arr, [2, 3], allow_fill=True) + + with pytest.raises(IndexError): + algos.take(arr, [2, 3], allow_fill=False) + + def test_bounds_check_small(self): + arr = np.array([1, 2, 3], dtype=np.int64) + indexer = [0, -1, -2] + with pytest.raises(ValueError): + algos.take(arr, indexer, allow_fill=True) + + result = algos.take(arr, indexer) + expected = np.array([1, 3, 2], dtype=np.int64) + tm.assert_numpy_array_equal(result, expected) + + @pytest.mark.parametrize('allow_fill', [True, False]) + def test_take_empty(self, allow_fill): + arr = np.array([], dtype=np.int64) + # empty take is ok + result = algos.take(arr, [], allow_fill=allow_fill) + tm.assert_numpy_array_equal(arr, result) + + with pytest.raises(IndexError): + algos.take(arr, [0], allow_fill=allow_fill) + + def test_take_na_empty(self): + result = algos.take([], [-1, -1], allow_fill=True, fill_value=0) + expected = np.array([0, 0], dtype=np.int64) + tm.assert_numpy_array_equal(result, expected) From 82cad8b1f369db80851c13fe856a30bdb8bd154c Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Wed, 25 Apr 2018 15:00:43 -0500 Subject: [PATCH 16/29] Stale comment --- pandas/core/algorithms.py | 1 - 1 file changed, 1 deletion(-) diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index 6cb0763ffe9dc..9b1637fcd824f 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -1495,7 +1495,6 @@ def take(arr, indexer, allow_fill=False, fill_value=None): if allow_fill: # Pandas style, -1 means NA - # Use for bounds checking, we don't actually want to convert. validate_indices(indexer, len(arr)) result = take_1d(arr, indexer, allow_fill=True, fill_value=fill_value) else: From d5470a0f9f4d2f3937f0e63256f0750a50276586 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Wed, 25 Apr 2018 15:02:26 -0500 Subject: [PATCH 17/29] Fixed reorder --- pandas/core/arrays/base.py | 59 +++++++++++++++++++------------------- 1 file changed, 30 insertions(+), 29 deletions(-) diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py index 813f0be538ed9..f03e3c927cbd8 100644 --- a/pandas/core/arrays/base.py +++ b/pandas/core/arrays/base.py @@ -276,23 +276,22 @@ def isna(self): """ raise AbstractMethodError(self) - def _values_for_factorize(self): - # type: () -> Tuple[ndarray, Any] - """Return an array and missing value suitable for factorization. + def _values_for_argsort(self): + # type: () -> ndarray + """Return values for sorting. Returns ------- - values : ndarray - An array suitable for factoraization. This should maintain order - and be a supported dtype (Float64, Int64, UInt64, String, Object). - By default, the extension array is cast to object dtype. - na_value : object - The value in `values` to consider missing. This will be treated - as NA in the factorization routines, so it will be coded as - `na_sentinal` and not included in `uniques`. By default, - ``np.nan`` is used. + ndarray + The transformed values should maintain the ordering between values + within the array. + + See Also + -------- + ExtensionArray.argsort """ - return self.astype(object), np.nan + # Note: this is used in `ExtensionArray.argsort`. + return np.array(self) def argsort(self, ascending=True, kind='quicksort', *args, **kwargs): """ @@ -393,6 +392,24 @@ def unique(self): uniques = unique(self.astype(object)) return self._from_sequence(uniques) + def _values_for_factorize(self): + # type: () -> Tuple[ndarray, Any] + """Return an array and missing value suitable for factorization. + + Returns + ------- + values : ndarray + An array suitable for factoraization. This should maintain order + and be a supported dtype (Float64, Int64, UInt64, String, Object). + By default, the extension array is cast to object dtype. + na_value : object + The value in `values` to consider missing. This will be treated + as NA in the factorization routines, so it will be coded as + `na_sentinal` and not included in `uniques`. By default, + ``np.nan`` is used. + """ + return self.astype(object), np.nan + def factorize(self, na_sentinel=-1): # type: (int) -> Tuple[ndarray, ExtensionArray] """Encode the extension array as an enumerated type. @@ -445,22 +462,6 @@ def factorize(self, na_sentinel=-1): # ------------------------------------------------------------------------ # Indexing methods # ------------------------------------------------------------------------ - def _values_for_argsort(self): - # type: () -> ndarray - """Return values for sorting. - - Returns - ------- - ndarray - The transformed values should maintain the ordering between values - within the array. - - See Also - -------- - ExtensionArray.argsort - """ - # Note: this is used in `ExtensionArray.argsort`. - return np.array(self) def take(self, indexer, allow_fill=False, fill_value=None): # type: (Sequence[int], bool, Optional[Any]) -> ExtensionArray From bbcbf19a560efed894670b0bc495a80283e003fc Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Wed, 25 Apr 2018 15:07:28 -0500 Subject: [PATCH 18/29] Cleanup --- pandas/core/generic.py | 1 - pandas/core/series.py | 4 +--- 2 files changed, 1 insertion(+), 4 deletions(-) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 508fa14a325b3..6d55f92167d3b 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -7420,7 +7420,6 @@ def _where(self, cond, other=np.nan, inplace=False, axis=None, level=None, if other.ndim <= self.ndim: _, other = self.align(other, join='left', axis=axis, - # XXX level=level, fill_value=np.nan) # if we are NOT aligned, raise as we cannot where index diff --git a/pandas/core/series.py b/pandas/core/series.py index 0c2eb40567c55..0d7288edac5d3 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -3216,10 +3216,8 @@ def _reindex_indexer(self, new_index, indexer, copy): return self.copy() return self - # TODO: determine if we want EA to handle fill_value=None - # if not, then we have to determine this here. new_values = algorithms.take_1d(self._values, indexer, - fill_value=None, allow_fill=True) + allow_fillTrue, fill_value=None) return self._constructor(new_values, index=new_index) def _needs_reindex_multi(self, axes, method, level): From 1a4d9873629bb28e19a124e40b063879fc5e3ad0 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Wed, 25 Apr 2018 15:50:48 -0500 Subject: [PATCH 19/29] Pass an array --- pandas/tests/test_algos.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/pandas/tests/test_algos.py b/pandas/tests/test_algos.py index f99f18ac4b9bb..3c8c91ab30d3f 100644 --- a/pandas/tests/test_algos.py +++ b/pandas/tests/test_algos.py @@ -1597,6 +1597,7 @@ def test_take_empty(self, allow_fill): algos.take(arr, [0], allow_fill=allow_fill) def test_take_na_empty(self): - result = algos.take([], [-1, -1], allow_fill=True, fill_value=0) - expected = np.array([0, 0], dtype=np.int64) + result = algos.take(np.array([]), [-1, -1], allow_fill=True, + fill_value=0.0) + expected = np.array([0., 0.]) tm.assert_numpy_array_equal(result, expected) From fc729d6dea77dee0d89148cdda738c0e9d8ab7c5 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Wed, 25 Apr 2018 15:51:27 -0500 Subject: [PATCH 20/29] Fixed editor --- pandas/core/series.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/series.py b/pandas/core/series.py index 0d7288edac5d3..7abd95c68ea2b 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -3217,7 +3217,7 @@ def _reindex_indexer(self, new_index, indexer, copy): return self new_values = algorithms.take_1d(self._values, indexer, - allow_fillTrue, fill_value=None) + allow_fill=True, fill_value=None) return self._constructor(new_values, index=new_index) def _needs_reindex_multi(self, axes, method, level): From 74b2c09bfc410be7190bbf6c69cb2c19fee6fbdd Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Wed, 25 Apr 2018 21:40:18 -0500 Subject: [PATCH 21/29] Added verisonadded --- pandas/core/algorithms.py | 2 ++ pandas/core/dtypes/base.py | 2 ++ 2 files changed, 4 insertions(+) diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index 9b1637fcd824f..796e5c8c6eb91 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -1451,6 +1451,8 @@ def func(arr, indexer, out, fill_value=np.nan): def take(arr, indexer, allow_fill=False, fill_value=None): """Take elements from an array. + .. versionadded:: 0.23.0 + Parameters ---------- arr : ndarray or ExtensionArray diff --git a/pandas/core/dtypes/base.py b/pandas/core/dtypes/base.py index f6311bdf04f08..9769578885b9c 100644 --- a/pandas/core/dtypes/base.py +++ b/pandas/core/dtypes/base.py @@ -96,6 +96,8 @@ def is_dtype(cls, dtype): class ExtensionDtype(_DtypeOpsMixin): """A custom data type, to be paired with an ExtensionArray. + .. versionadded:: 0.23.0 + Notes ----- The interface includes the following abstract methods that must From 5db66248f2c40d5de6ffe9a2c308830c4128079a Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Thu, 26 Apr 2018 07:17:30 -0500 Subject: [PATCH 22/29] Doc and move tests --- pandas/core/algorithms.py | 49 +++++++++++++++++++++++++++++++------- pandas/tests/test_algos.py | 37 ---------------------------- pandas/tests/test_take.py | 45 ++++++++++++++++++++++++++++++++++ 3 files changed, 85 insertions(+), 46 deletions(-) diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index 796e5c8c6eb91..d456e2542402a 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -1455,19 +1455,21 @@ def take(arr, indexer, allow_fill=False, fill_value=None): Parameters ---------- - arr : ndarray or ExtensionArray + arr : sequence + Non array-likes (sequences without a dtype) are coereced + to an ndarray. indexer : sequence of integers - Indices to be taken. See Notes for how negative indicies - are handled. + Indices to be taken. allow_fill : bool, default False How to handle negative values in `indexer`. - For False values (the default), negative values in `indexer` - indiciate slices from the right. + * False: negative values in `indexer` indicate + slices from the right (the default) + + * True: negative values in `indexer` indicate + missing values. These values are set to `fill_value`. Any other + other negative values raise a ``ValueError``. - For True values, indicies where `indexer` is ``-1`` indicate - missing values. These values are set to `fill_value`. Any other - other negative value should raise a ``ValueError``. fill_value : any, optional Fill value to use for NA-indicies when `allow_fill` is True. This may be ``None``, in which case the default NA value for @@ -1486,13 +1488,42 @@ def take(arr, indexer, allow_fill=False, fill_value=None): When the indexer contains negative values other than ``-1`` and `allow_fill` is True. + Notes + ----- + When `allow_fill` is False, `indexer` may be whatever dimensionality + is accepted by NumPy for `arr`. + + When `allow_fill` is True, `indexer` should be 1-D. + See Also -------- numpy.take + + Examples + -------- + >>> from pandas.api.extensions import take + + With the default ``allow_fill=False``, negative numbers indicate + slices from the right. + + >>> take(np.array([10, 20, 30]), [0, 0, -1]) + array([10, 10, 30]) + + Setting ``allow_fill=True`` will place `fill_value` in those positions. + + >>> take(np.array([10, 20, 30]), [0, 0, -1], allow_fill=True) + array([10., 10., nan]) + + >>> take(np.array([10, 20, 30]), [0, 0, -1], allow_fill=True, + ... fill_value=-10) + array([ 10, 10, -10]) """ from pandas.core.indexing import validate_indices - # Do we require int64 here? + if not is_array_like(arr): + arr = np.asarray(arr) + + # Do we require int64 or intp here? indexer = np.asarray(indexer, dtype='int') if allow_fill: diff --git a/pandas/tests/test_algos.py b/pandas/tests/test_algos.py index 3c8c91ab30d3f..8a8a6f7de70d7 100644 --- a/pandas/tests/test_algos.py +++ b/pandas/tests/test_algos.py @@ -1564,40 +1564,3 @@ def test_index(self): idx = Index(['1 day', '1 day', '-1 day', '-1 day 2 min', '2 min', '2 min'], dtype='timedelta64[ns]') tm.assert_series_equal(algos.mode(idx), exp) - - -class TestTake(object): - - def test_bounds_check_large(self): - arr = np.array([1, 2]) - with pytest.raises(IndexError): - algos.take(arr, [2, 3], allow_fill=True) - - with pytest.raises(IndexError): - algos.take(arr, [2, 3], allow_fill=False) - - def test_bounds_check_small(self): - arr = np.array([1, 2, 3], dtype=np.int64) - indexer = [0, -1, -2] - with pytest.raises(ValueError): - algos.take(arr, indexer, allow_fill=True) - - result = algos.take(arr, indexer) - expected = np.array([1, 3, 2], dtype=np.int64) - tm.assert_numpy_array_equal(result, expected) - - @pytest.mark.parametrize('allow_fill', [True, False]) - def test_take_empty(self, allow_fill): - arr = np.array([], dtype=np.int64) - # empty take is ok - result = algos.take(arr, [], allow_fill=allow_fill) - tm.assert_numpy_array_equal(arr, result) - - with pytest.raises(IndexError): - algos.take(arr, [0], allow_fill=allow_fill) - - def test_take_na_empty(self): - result = algos.take(np.array([]), [-1, -1], allow_fill=True, - fill_value=0.0) - expected = np.array([0., 0.]) - tm.assert_numpy_array_equal(result, expected) diff --git a/pandas/tests/test_take.py b/pandas/tests/test_take.py index 7b97b0e975df3..2b78c91f9dac5 100644 --- a/pandas/tests/test_take.py +++ b/pandas/tests/test_take.py @@ -3,6 +3,7 @@ from datetime import datetime import numpy as np +import pytest from pandas.compat import long import pandas.core.algorithms as algos import pandas.util.testing as tm @@ -445,3 +446,47 @@ def test_2d_datetime64(self): expected = arr.take(indexer, axis=1) expected[:, [2, 4]] = datetime(2007, 1, 1) tm.assert_almost_equal(result, expected) + + +class TestExtensionTake(object): + # The take method found in pd.api.extensions + + def test_bounds_check_large(self): + arr = np.array([1, 2]) + with pytest.raises(IndexError): + algos.take(arr, [2, 3], allow_fill=True) + + with pytest.raises(IndexError): + algos.take(arr, [2, 3], allow_fill=False) + + def test_bounds_check_small(self): + arr = np.array([1, 2, 3], dtype=np.int64) + indexer = [0, -1, -2] + with pytest.raises(ValueError): + algos.take(arr, indexer, allow_fill=True) + + result = algos.take(arr, indexer) + expected = np.array([1, 3, 2], dtype=np.int64) + tm.assert_numpy_array_equal(result, expected) + + @pytest.mark.parametrize('allow_fill', [True, False]) + def test_take_empty(self, allow_fill): + arr = np.array([], dtype=np.int64) + # empty take is ok + result = algos.take(arr, [], allow_fill=allow_fill) + tm.assert_numpy_array_equal(arr, result) + + with pytest.raises(IndexError): + algos.take(arr, [0], allow_fill=allow_fill) + + def test_take_na_empty(self): + result = algos.take(np.array([]), [-1, -1], allow_fill=True, + fill_value=0.0) + expected = np.array([0., 0.]) + tm.assert_numpy_array_equal(result, expected) + + def test_take_coerces_list(self): + arr = [1, 2, 3] + result = algos.take(arr, [0, 0]) + expected = np.array([1, 1]) + tm.assert_numpy_array_equal(result, expected) From fbc4425a09bb973fefae90b3da2e1a205129658e Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Thu, 26 Apr 2018 13:37:45 -0500 Subject: [PATCH 23/29] Updates * indexer -> indices * doc user-facing vs physical * assert na_cmps * test reindex w/ non-NA fill_value --- pandas/core/algorithms.py | 32 ++++++++++--------- pandas/core/arrays/base.py | 29 +++++++++++------ pandas/core/dtypes/base.py | 3 +- pandas/core/internals.py | 3 -- pandas/tests/extension/base/getitem.py | 20 ++++++++++-- .../tests/extension/decimal/test_decimal.py | 10 +++++- pandas/tests/extension/json/array.py | 18 +++++++---- 7 files changed, 76 insertions(+), 39 deletions(-) diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index f2fa49cbe1aa3..974e15d926d75 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -1448,8 +1448,9 @@ def func(arr, indexer, out, fill_value=np.nan): return func -def take(arr, indexer, allow_fill=False, fill_value=None): - """Take elements from an array. +def take(arr, indices, allow_fill=False, fill_value=None): + """ + Take elements from an array. .. versionadded:: 0.23.0 @@ -1458,22 +1459,23 @@ def take(arr, indexer, allow_fill=False, fill_value=None): arr : sequence Non array-likes (sequences without a dtype) are coereced to an ndarray. - indexer : sequence of integers + indices : sequence of integers Indices to be taken. allow_fill : bool, default False - How to handle negative values in `indexer`. + How to handle negative values in `indices`. - * False: negative values in `indexer` indicate - slices from the right (the default) + * False: negative values in `indices` indicate indexing from + the right (the default). This is similar to :func:`numpy.take`. - * True: negative values in `indexer` indicate + * True: negative values in `indices` indicate missing values. These values are set to `fill_value`. Any other other negative values raise a ``ValueError``. fill_value : any, optional Fill value to use for NA-indicies when `allow_fill` is True. This may be ``None``, in which case the default NA value for - the type, ``self.dtype.na_value``, is used. + the type is used. For ndarrays, :attr:`numpy.nan` is used. For + ExtensionArrays, a different value may be used. Returns ------- @@ -1483,17 +1485,17 @@ def take(arr, indexer, allow_fill=False, fill_value=None): Raises ------ IndexError - When the indexer is out of bounds for the array. + When `indices` is out of bounds for the array. ValueError When the indexer contains negative values other than ``-1`` and `allow_fill` is True. Notes ----- - When `allow_fill` is False, `indexer` may be whatever dimensionality + When `allow_fill` is False, `indices` may be whatever dimensionality is accepted by NumPy for `arr`. - When `allow_fill` is True, `indexer` should be 1-D. + When `allow_fill` is True, `indices` should be 1-D. See Also -------- @@ -1524,15 +1526,15 @@ def take(arr, indexer, allow_fill=False, fill_value=None): arr = np.asarray(arr) # Do we require int64 or intp here? - indexer = np.asarray(indexer, dtype='int') + indices = np.asarray(indices, dtype='int') if allow_fill: # Pandas style, -1 means NA - validate_indices(indexer, len(arr)) - result = take_1d(arr, indexer, allow_fill=True, fill_value=fill_value) + validate_indices(indices, len(arr)) + result = take_1d(arr, indices, allow_fill=True, fill_value=fill_value) else: # NumPy style - result = arr.take(indexer) + result = arr.take(indices) return result diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py index f03e3c927cbd8..aa00ad2c0df9e 100644 --- a/pandas/core/arrays/base.py +++ b/pandas/core/arrays/base.py @@ -463,22 +463,22 @@ def factorize(self, na_sentinel=-1): # Indexing methods # ------------------------------------------------------------------------ - def take(self, indexer, allow_fill=False, fill_value=None): + def take(self, indices, allow_fill=False, fill_value=None): # type: (Sequence[int], bool, Optional[Any]) -> ExtensionArray """Take elements from an array. Parameters ---------- - indexer : sequence of integers + indices : sequence of integers Indices to be taken. See Notes for how negative indicies are handled. allow_fill : bool, default False - How to handle negative values in `indexer`. + How to handle negative values in `indices`. - For False values (the default), negative values in `indexer` + For False values (the default), negative values in `indices` indiciate slices from the right. - For True values, indicies where `indexer` is ``-1`` indicate + For True values, indicies where `indices` is ``-1`` indicate missing values. These values are set to `fill_value`. Any other other negative value should raise a ``ValueError``. fill_value : any, optional @@ -486,6 +486,12 @@ def take(self, indexer, allow_fill=False, fill_value=None): This may be ``None``, in which case the default NA value for the type, ``self.dtype.na_value``, is used. + For many ExtensionArrays, there will be two representations of + `fill_value`: a user-facing "boxed" scalar, and a low-level + physical NA value. `fill_value` should be the user-facing version, + and the implementation should handle translating that to the + physical version for processing the take if nescessary. + Returns ------- ExtensionArray @@ -493,15 +499,15 @@ def take(self, indexer, allow_fill=False, fill_value=None): Raises ------ IndexError - When the indexer is out of bounds for the array. + When the indices are out of bounds for the array. ValueError - When the indexer contains negative values other than ``-1`` + When `indices` contains negative values other than ``-1`` and `allow_fill` is True. Notes ----- ExtensionArray.take is called by ``Series.__getitem__``, ``.loc``, - ``iloc``, when the indexer is a sequence of values. Additionally, + ``iloc``, when `indices` is a sequence of values. Additionally, it's called by :meth:`Series.reindex`, or any other method that causes realignemnt, with a `fill_value`. @@ -518,14 +524,17 @@ def take(self, indexer, allow_fill=False, fill_value=None): .. code-block:: python - def take(self, indexer, allow_fill=False, fill_value=None): + def take(self, indices, allow_fill=False, fill_value=None): from pandas.core.algorithms import take + # If the ExtensionArray is backed by an ndarray, then + # just pass that here instead of coercing to object. data = self.astype(object) + if allow_fill and fill_value is None: fill_value = self.dtype.na_value - result = take(data, indexer, fill_value=fill_value, + result = take(data, indices, fill_value=fill_value, allow_fill=allow_fill) return self._from_sequence(result) """ diff --git a/pandas/core/dtypes/base.py b/pandas/core/dtypes/base.py index 9769578885b9c..ffedfc04120b2 100644 --- a/pandas/core/dtypes/base.py +++ b/pandas/core/dtypes/base.py @@ -17,7 +17,8 @@ class _DtypeOpsMixin(object): # class's methods can be moved to ExtensionDtype and removed. # na_value is the default NA value to use for this type. This is used in - # e.g. ExtensionArray.take. + # e.g. ExtensionArray.take. This should be the user-facing "boxed" version + # of the NA value, not the physical NA vaalue for storage. na_value = np.nan def __eq__(self, other): diff --git a/pandas/core/internals.py b/pandas/core/internals.py index e2614ee01d32b..474894aba65df 100644 --- a/pandas/core/internals.py +++ b/pandas/core/internals.py @@ -5405,9 +5405,6 @@ def concatenate_block_managers(mgrs_indexers, axes, concat_axis, copy): for placement, join_units in concat_plan: - # The issue: we have a join unit (or maybe several) that needs to be - # reindexed. - if len(join_units) == 1 and not join_units[0].indexers: b = join_units[0].block values = b.values diff --git a/pandas/tests/extension/base/getitem.py b/pandas/tests/extension/base/getitem.py index 874ee065755c5..5c9ede1079079 100644 --- a/pandas/tests/extension/base/getitem.py +++ b/pandas/tests/extension/base/getitem.py @@ -127,7 +127,11 @@ def test_take(self, data, na_value, na_cmp): result = data.take([0, -1]) assert result.dtype == data.dtype assert result[0] == data[0] - na_cmp(result[1], na_value) + assert result[1] == data[-1] + + result = data.take([0, -1], allow_fill=True, fill_value=na_value) + assert result[0] == data[0] + assert na_cmp(result[1], na_value) with tm.assert_raises_regex(IndexError, "out of bounds"): data.take([len(data) + 1]) @@ -136,7 +140,7 @@ def test_take_empty(self, data, na_value, na_cmp): empty = data[:0] result = empty.take([-1], allow_fill=True) - na_cmp(result[0], na_value) + assert na_cmp(result[0], na_value) with pytest.raises(IndexError): empty.take([-1]) @@ -170,7 +174,6 @@ def test_take_out_of_bounds_raises(self, data, allow_fill): with pytest.raises(IndexError): arr.take(np.asarray([0, 3]), allow_fill=allow_fill) - @pytest.mark.xfail(reason="Series.take with extension array buggy for -1") def test_take_series(self, data): s = pd.Series(data) result = s.take([0, -1]) @@ -196,3 +199,14 @@ def test_reindex(self, data, na_value): expected = pd.Series(data._from_sequence([na_value, na_value]), index=[n, n + 1]) self.assert_series_equal(result, expected) + + def test_reindex_non_na_fill_value(self, data_missing): + valid = data_missing[1] + na = data_missing[0] + + array = data_missing._from_sequence([na, valid]) + ser = pd.Series(array) + result = ser.reindex([0, 1, 2], fill_value=valid) + expected = pd.Series(data_missing._from_sequence([na, valid, valid])) + + self.assert_series_equal(result, expected) diff --git a/pandas/tests/extension/decimal/test_decimal.py b/pandas/tests/extension/decimal/test_decimal.py index 53d74cd6d38cb..1f8cf0264f62f 100644 --- a/pandas/tests/extension/decimal/test_decimal.py +++ b/pandas/tests/extension/decimal/test_decimal.py @@ -108,7 +108,15 @@ class TestReshaping(BaseDecimal, base.BaseReshapingTests): class TestGetitem(BaseDecimal, base.BaseGetitemTests): - pass + + def test_take_na_value_other_decimal(self): + arr = DecimalArray([decimal.Decimal('1.0'), + decimal.Decimal('2.0')]) + result = arr.take([0, -1], allow_fill=True, + fill_value=decimal.Decimal('-1.0')) + expected = DecimalArray([decimal.Decimal('1.0'), + decimal.Decimal('-1.0')]) + self.assert_extension_array_equal(result, expected) class TestMissing(BaseDecimal, base.BaseMissingTests): diff --git a/pandas/tests/extension/json/array.py b/pandas/tests/extension/json/array.py index b03538185b608..4066ce70f5d72 100644 --- a/pandas/tests/extension/json/array.py +++ b/pandas/tests/extension/json/array.py @@ -1,3 +1,15 @@ +"""Test extension array for storing nested data in a pandas container. + +The JSONArray stores lists of dictionaries. The storage mechanism is a list, +not an ndarray. + +Note: + +We currently store lists of UserDicts (Py3 only). Pandas has a few places +internally that specifically check for dicts, and does non-scalar things +in that case. We *want* the dictionaries to be treated as scalars, so we +hack around pandas by using UserDicts. +""" import collections import itertools import numbers @@ -125,12 +137,6 @@ def take(self, indexer, allow_fill=False, fill_value=None): return self._from_sequence(output) - # def astype(self, dtype, copy=True): - # # NumPy has issues when all the dicts are the same length. - # # np.array([UserDict(...), UserDict(...)]) fails, - # # but np.array([{...}, {...}]) works, so cast. - # return np.array([dict(x) for x in self], dtype=dtype, copy=copy) - def copy(self, deep=False): return type(self)(self.data[:]) From f3b91ca70ce9caacae29f3ecd80cd121f5ee1a33 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Thu, 26 Apr 2018 13:43:26 -0500 Subject: [PATCH 24/29] doc fixup --- pandas/core/algorithms.py | 6 +++--- pandas/core/arrays/base.py | 6 +++--- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index 974e15d926d75..0300bc7a5eb7a 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -1464,8 +1464,8 @@ def take(arr, indices, allow_fill=False, fill_value=None): allow_fill : bool, default False How to handle negative values in `indices`. - * False: negative values in `indices` indicate indexing from - the right (the default). This is similar to :func:`numpy.take`. + * False: negative values in `indices` indicate positional indicies + from the right (the default). This is similar to :func:`numpy.take`. * True: negative values in `indices` indicate missing values. These values are set to `fill_value`. Any other @@ -1506,7 +1506,7 @@ def take(arr, indices, allow_fill=False, fill_value=None): >>> from pandas.api.extensions import take With the default ``allow_fill=False``, negative numbers indicate - slices from the right. + positional indicies from the right. >>> take(np.array([10, 20, 30]), [0, 0, -1]) array([10, 10, 30]) diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py index aa00ad2c0df9e..35906a1d08558 100644 --- a/pandas/core/arrays/base.py +++ b/pandas/core/arrays/base.py @@ -470,17 +470,17 @@ def take(self, indices, allow_fill=False, fill_value=None): Parameters ---------- indices : sequence of integers - Indices to be taken. See Notes for how negative indicies - are handled. + Indices to be taken. allow_fill : bool, default False How to handle negative values in `indices`. For False values (the default), negative values in `indices` - indiciate slices from the right. + indiciate positional indicies from the right. For True values, indicies where `indices` is ``-1`` indicate missing values. These values are set to `fill_value`. Any other other negative value should raise a ``ValueError``. + fill_value : any, optional Fill value to use for NA-indicies when `allow_fill` is True. This may be ``None``, in which case the default NA value for From eecd632b16d475a5bbdca2013839298904056997 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Thu, 26 Apr 2018 15:00:00 -0500 Subject: [PATCH 25/29] Skip categorical take tests --- pandas/tests/extension/category/test_categorical.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/pandas/tests/extension/category/test_categorical.py b/pandas/tests/extension/category/test_categorical.py index 046640ade3595..c34339c99322d 100644 --- a/pandas/tests/extension/category/test_categorical.py +++ b/pandas/tests/extension/category/test_categorical.py @@ -111,6 +111,14 @@ def test_take_non_na_fill_value(self): def test_take_out_of_bounds_raises(self): pass + @skip_take + def test_take_series(self): + pass + + @skip_take + def test_reindex_non_na_fill_value(self): + pass + @pytest.mark.xfail(reason="Categorical.take buggy") def test_take_empty(self): pass From 9a6c7d44ce9f474acb68d7b62efc138aedc3a100 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Thu, 26 Apr 2018 20:04:17 -0500 Subject: [PATCH 26/29] Doc updates --- pandas/core/algorithms.py | 14 ++++++-------- pandas/core/arrays/base.py | 20 ++++++++++++++------ 2 files changed, 20 insertions(+), 14 deletions(-) diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index 0300bc7a5eb7a..a03d892432b51 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -1464,7 +1464,7 @@ def take(arr, indices, allow_fill=False, fill_value=None): allow_fill : bool, default False How to handle negative values in `indices`. - * False: negative values in `indices` indicate positional indicies + * False: negative values in `indices` indicate positional indices from the right (the default). This is similar to :func:`numpy.take`. * True: negative values in `indices` indicate @@ -1472,10 +1472,9 @@ def take(arr, indices, allow_fill=False, fill_value=None): other negative values raise a ``ValueError``. fill_value : any, optional - Fill value to use for NA-indicies when `allow_fill` is True. + Fill value to use for NA-indices when `allow_fill` is True. This may be ``None``, in which case the default NA value for - the type is used. For ndarrays, :attr:`numpy.nan` is used. For - ExtensionArrays, a different value may be used. + the type (``self.dtype.na_value``) is used. Returns ------- @@ -1506,7 +1505,7 @@ def take(arr, indices, allow_fill=False, fill_value=None): >>> from pandas.api.extensions import take With the default ``allow_fill=False``, negative numbers indicate - positional indicies from the right. + positional indices from the right. >>> take(np.array([10, 20, 30]), [0, 0, -1]) array([10, 10, 30]) @@ -1525,8 +1524,7 @@ def take(arr, indices, allow_fill=False, fill_value=None): if not is_array_like(arr): arr = np.asarray(arr) - # Do we require int64 or intp here? - indices = np.asarray(indices, dtype='int') + indices = np.asarray(indices, dtype=np.intp) if allow_fill: # Pandas style, -1 means NA @@ -1552,7 +1550,7 @@ def take_nd(arr, indexer, axis=0, out=None, fill_value=np.nan, mask_info=None, Input array. indexer : ndarray 1-D array of indices to take, subarrays corresponding to -1 value - indicies are filed with fill_value + indices are filed with fill_value axis : int, default 0 Axis to take from out : ndarray or None, default None diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py index 35906a1d08558..da5f7a944cdaa 100644 --- a/pandas/core/arrays/base.py +++ b/pandas/core/arrays/base.py @@ -474,15 +474,16 @@ def take(self, indices, allow_fill=False, fill_value=None): allow_fill : bool, default False How to handle negative values in `indices`. - For False values (the default), negative values in `indices` - indiciate positional indicies from the right. + * False: negative values in `indices` indicate positional indices + from the right (the default). This is similar to + :func:`numpy.take`. - For True values, indicies where `indices` is ``-1`` indicate - missing values. These values are set to `fill_value`. Any other - other negative value should raise a ``ValueError``. + * True: negative values in `indices` indicate + missing values. These values are set to `fill_value`. Any other + other negative values raise a ``ValueError``. fill_value : any, optional - Fill value to use for NA-indicies when `allow_fill` is True. + Fill value to use for NA-indices when `allow_fill` is True. This may be ``None``, in which case the default NA value for the type, ``self.dtype.na_value``, is used. @@ -538,6 +539,13 @@ def take(self, indices, allow_fill=False, fill_value=None): allow_fill=allow_fill) return self._from_sequence(result) """ + # Implementer note: The `fill_value` parameter should be a user-facing + # value, an instance of self.dtype.type. When passed `fill_value=None`, + # the default of `self.dtype.na_value` should be used. + # This may differ from the physical storage type your ExtensionArray + # uses. In this case, your implementation is responsible for casting + # the user-facing type to the storage type, before using + # pandas.api.extensions.take raise AbstractMethodError(self) def copy(self, deep=False): From eb43fa4159c2affaf57ad686c260b2d1899eeacd Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Thu, 26 Apr 2018 20:47:35 -0500 Subject: [PATCH 27/29] Really truly fix it hopefully. --- pandas/tests/extension/json/test_json.py | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/pandas/tests/extension/json/test_json.py b/pandas/tests/extension/json/test_json.py index 4d86dc65d28a5..1fdb7298eefc4 100644 --- a/pandas/tests/extension/json/test_json.py +++ b/pandas/tests/extension/json/test_json.py @@ -21,6 +21,17 @@ def dtype(): @pytest.fixture def data(): """Length-100 PeriodArray for semantics test.""" + data = make_data() + + # Why the while loop? NumPy is unable to construct an ndarray from + # equal-length ndarrays. Many of our operations involve coercing the + # EA to an ndarray of objects. To avoid random test failures, we ensure + # that our data is coercable to an ndarray. Several tests deal with only + # the first two elements, so that's what we'll check. + + while len(data[0]) == len(data[1]): + data = make_data() + return JSONArray(make_data()) From 6858409f3394035483e327c6e0fca05c1a6285ff Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Fri, 27 Apr 2018 05:48:59 -0500 Subject: [PATCH 28/29] Added note --- pandas/core/arrays/base.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py index 4c7eaa61c208e..1922801c30719 100644 --- a/pandas/core/arrays/base.py +++ b/pandas/core/arrays/base.py @@ -535,6 +535,10 @@ def take(self, indices, allow_fill=False, fill_value=None): if allow_fill and fill_value is None: fill_value = self.dtype.na_value + # fill value should always be translated from the scalar + # type for the array, to the physical storage type for + # the data, before passing to take. + result = take(data, indices, fill_value=fill_value, allow_fill=allow_fill) return self._from_sequence(result) From ec0cecd292947aa4d8416991e9f8920a4cd9a831 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Fri, 27 Apr 2018 06:02:48 -0500 Subject: [PATCH 29/29] Updates --- pandas/core/dtypes/base.py | 1 + 1 file changed, 1 insertion(+) diff --git a/pandas/core/dtypes/base.py b/pandas/core/dtypes/base.py index ffedfc04120b2..49e98c16c716e 100644 --- a/pandas/core/dtypes/base.py +++ b/pandas/core/dtypes/base.py @@ -19,6 +19,7 @@ class _DtypeOpsMixin(object): # na_value is the default NA value to use for this type. This is used in # e.g. ExtensionArray.take. This should be the user-facing "boxed" version # of the NA value, not the physical NA vaalue for storage. + # e.g. for JSONArray, this is an empty dictionary. na_value = np.nan def __eq__(self, other):