diff --git a/doc/source/whatsnew/v0.23.0.txt b/doc/source/whatsnew/v0.23.0.txt index c128058858c17..2440a8d586c96 100644 --- a/doc/source/whatsnew/v0.23.0.txt +++ b/doc/source/whatsnew/v0.23.0.txt @@ -891,6 +891,7 @@ Deprecations removed in a future version (:issue:`20419`). - ``DatetimeIndex.offset`` is deprecated. Use ``DatetimeIndex.freq`` instead (:issue:`20716`) - ``Index.get_duplicates()`` is deprecated and will be removed in a future version (:issue:`20239`) +- The previous default behavior of negative indices in ``Categorical.take`` is deprecated. In a future version it will change from meaning missing values to meaning positional indices from the right. The future behavior is consistent with :meth:`Series.take` (:issue:`20664`). .. _whatsnew_0230.prior_deprecations: @@ -1024,6 +1025,7 @@ Categorical - Bug in ``Categorical.__iter__`` not converting to Python types (:issue:`19909`) - Bug in :func:`pandas.factorize` returning the unique codes for the ``uniques``. This now returns a ``Categorical`` with the same dtype as the input (:issue:`19721`) - Bug in :func:`pandas.factorize` including an item for missing values in the ``uniques`` return value (:issue:`19721`) +- Bug in :meth:`Series.take` with categorical data interpreting ``-1`` in `indices` as missing value markers, rather than the last element of the Series (:issue:`20664`) Datetimelike ^^^^^^^^^^^^ diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index 7f0d54de9def8..517c21cc1bc3a 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -2,6 +2,7 @@ import numpy as np from warnings import warn +import textwrap import types from pandas import compat @@ -29,7 +30,7 @@ is_scalar, is_dict_like) -from pandas.core.algorithms import factorize, take_1d, unique1d +from pandas.core.algorithms import factorize, take_1d, unique1d, take from pandas.core.accessor import PandasDelegate from pandas.core.base import (PandasObject, NoNewAttributesMixin, _shared_docs) @@ -48,6 +49,17 @@ from .base import ExtensionArray +_take_msg = textwrap.dedent("""\ + Interpreting negative values in 'indexer' as missing values. + In the future, this will change to meaning positional indicies + from the right. + + Use 'allow_fill=True' to retain the previous behavior and silence this + warning. + + Use 'allow_fill=False' to accept the new behavior.""") + + def _cat_compare_op(op): def f(self, other): # On python2, you can usually compare any type to any type, and @@ -1732,17 +1744,49 @@ def fillna(self, value=None, method=None, limit=None): return self._constructor(values, categories=self.categories, ordered=self.ordered, fastpath=True) - def take_nd(self, indexer, allow_fill=True, fill_value=None): - """ Take the codes by the indexer, fill with the fill_value. - - For internal compatibility with numpy arrays. + def take_nd(self, indexer, allow_fill=None, fill_value=None): """ + Take elements from the Categorical. - # filling must always be None/nan here - # but is passed thru internally - assert isna(fill_value) + Parameters + ---------- + indexer : sequence of integers + allow_fill : bool, default None. + How to handle negative values in `indexer`. - codes = take_1d(self._codes, indexer, allow_fill=True, fill_value=-1) + * False: negative values in `indices` indicate positional indices + from the right. This is similar to + :func:`numpy.take`. + + * True: negative values in `indices` indicate missing values + (the default). These values are set to `fill_value`. Any other + other negative values raise a ``ValueError``. + + .. versionchanged:: 0.23.0 + + Deprecated the default value of `allow_fill`. The deprecated + default is ``True``. In the future, this will change to + ``False``. + + Returns + ------- + Categorical + This Categorical will have the same categories and ordered as + `self`. + """ + indexer = np.asarray(indexer, dtype=np.intp) + if allow_fill is None: + if (indexer < 0).any(): + warn(_take_msg, FutureWarning, stacklevel=2) + allow_fill = True + + if isna(fill_value): + # For categorical, any NA value is considered a user-facing + # NA value. Our storage NA value is -1. + fill_value = -1 + + codes = take(self._codes, indexer, allow_fill=allow_fill, + fill_value=fill_value) result = self._constructor(codes, categories=self.categories, ordered=self.ordered, fastpath=True) return result diff --git a/pandas/core/series.py b/pandas/core/series.py index 7abd95c68ea2b..a14f3299e11e9 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -3499,7 +3499,14 @@ def _take(self, indices, axis=0, convert=True, is_copy=False): indices = _ensure_platform_int(indices) new_index = self.index.take(indices) - new_values = self._values.take(indices) + + if is_categorical_dtype(self): + # https://github.com/pandas-dev/pandas/issues/20664 + # TODO: remove when the default Categorical.take behavior changes + kwargs = {'allow_fill': False} + else: + kwargs = {} + new_values = self._values.take(indices, **kwargs) result = (self._constructor(new_values, index=new_index, fastpath=True).__finalize__(self)) diff --git a/pandas/tests/categorical/conftest.py b/pandas/tests/categorical/conftest.py new file mode 100644 index 0000000000000..274389d484995 --- /dev/null +++ b/pandas/tests/categorical/conftest.py @@ -0,0 +1,13 @@ +import pytest + + +@pytest.fixture(params=[True, False]) +def allow_fill(request): + """Boolean 'allow_fill' parameter for Categorical.take""" + return request.param + + +@pytest.fixture(params=[True, False]) +def ordered(request): + """Boolean 'ordered' parameter for Categorical.""" + return request.param diff --git a/pandas/tests/categorical/test_algos.py b/pandas/tests/categorical/test_algos.py index 1c68377786dd4..dcf2081ae32fe 100644 --- a/pandas/tests/categorical/test_algos.py +++ b/pandas/tests/categorical/test_algos.py @@ -69,3 +69,45 @@ def test_isin_empty(empty): result = s.isin(empty) tm.assert_numpy_array_equal(expected, result) + + +class TestTake(object): + # https://github.com/pandas-dev/pandas/issues/20664 + + def test_take_warns(self): + cat = pd.Categorical(['a', 'b']) + with tm.assert_produces_warning(FutureWarning): + cat.take([0, -1]) + + def test_take_positive_no_warning(self): + cat = pd.Categorical(['a', 'b']) + with tm.assert_produces_warning(None): + cat.take([0, 0]) + + def test_take_bounds(self, allow_fill): + # https://github.com/pandas-dev/pandas/issues/20664 + cat = pd.Categorical(['a', 'b', 'a']) + with pytest.raises(IndexError): + cat.take([4, 5], allow_fill=allow_fill) + + def test_take_empty(self, allow_fill): + # https://github.com/pandas-dev/pandas/issues/20664 + cat = pd.Categorical([], categories=['a', 'b']) + with pytest.raises(IndexError): + cat.take([0], allow_fill=allow_fill) + + def test_positional_take(self, ordered): + cat = pd.Categorical(['a', 'a', 'b', 'b'], categories=['b', 'a'], + ordered=ordered) + result = cat.take([0, 1, 2], allow_fill=False) + expected = pd.Categorical(['a', 'a', 'b'], categories=cat.categories, + ordered=ordered) + tm.assert_categorical_equal(result, expected) + + def test_positional_take_unobserved(self, ordered): + cat = pd.Categorical(['a', 'b'], categories=['a', 'b', 'c'], + ordered=ordered) + result = cat.take([1, 0], allow_fill=False) + expected = pd.Categorical(['b', 'a'], categories=cat.categories, + ordered=ordered) + tm.assert_categorical_equal(result, expected) diff --git a/pandas/tests/extension/category/test_categorical.py b/pandas/tests/extension/category/test_categorical.py index c34339c99322d..8685819f369b9 100644 --- a/pandas/tests/extension/category/test_categorical.py +++ b/pandas/tests/extension/category/test_categorical.py @@ -111,7 +111,7 @@ def test_take_non_na_fill_value(self): def test_take_out_of_bounds_raises(self): pass - @skip_take + @pytest.mark.skip(reason="GH-20747. Unobserved categories.") def test_take_series(self): pass diff --git a/pandas/tests/extension/json/test_json.py b/pandas/tests/extension/json/test_json.py index 1fdb7298eefc4..b7ac8033f3f6d 100644 --- a/pandas/tests/extension/json/test_json.py +++ b/pandas/tests/extension/json/test_json.py @@ -32,7 +32,7 @@ def data(): while len(data[0]) == len(data[1]): data = make_data() - return JSONArray(make_data()) + return JSONArray(data) @pytest.fixture diff --git a/pandas/tests/series/indexing/test_indexing.py b/pandas/tests/series/indexing/test_indexing.py index 5cc1a8ff1c451..8571fbc10e9bb 100644 --- a/pandas/tests/series/indexing/test_indexing.py +++ b/pandas/tests/series/indexing/test_indexing.py @@ -753,6 +753,16 @@ def test_take(): s.take([-1, 3, 4], convert=False) +def test_take_categorical(): + # https://github.com/pandas-dev/pandas/issues/20664 + s = Series(pd.Categorical(['a', 'b', 'c'])) + result = s.take([-2, -2, 0]) + expected = Series(pd.Categorical(['b', 'b', 'a'], + categories=['a', 'b', 'c']), + index=[1, 1, 0]) + assert_series_equal(result, expected) + + def test_head_tail(test_data): assert_series_equal(test_data.series.head(), test_data.series[:5]) assert_series_equal(test_data.series.head(0), test_data.series[0:0])