From 60fe82c8a2829e831d28cf6d4b3595637c3c5802 Mon Sep 17 00:00:00 2001 From: Artem Bogachev Date: Wed, 25 Apr 2018 15:38:17 +0300 Subject: [PATCH] PERF: GH2003 Series.isin for categorical dtypes (#20522) --- asv_bench/benchmarks/categoricals.py | 21 ++++++++++ doc/source/whatsnew/v0.23.0.txt | 1 + pandas/core/algorithms.py | 7 ++++ pandas/core/arrays/categorical.py | 56 ++++++++++++++++++++++++++ pandas/core/indexes/base.py | 2 +- pandas/core/series.py | 2 +- pandas/tests/categorical/test_algos.py | 22 ++++++++++ 7 files changed, 109 insertions(+), 2 deletions(-) diff --git a/asv_bench/benchmarks/categoricals.py b/asv_bench/benchmarks/categoricals.py index 7743921003353..0ffd5f881d626 100644 --- a/asv_bench/benchmarks/categoricals.py +++ b/asv_bench/benchmarks/categoricals.py @@ -148,3 +148,24 @@ def time_rank_int_cat(self): def time_rank_int_cat_ordered(self): self.s_int_cat_ordered.rank() + + +class Isin(object): + + goal_time = 0.2 + + params = ['object', 'int64'] + param_names = ['dtype'] + + def setup(self, dtype): + np.random.seed(1234) + n = 5 * 10**5 + sample_size = 100 + arr = [i for i in np.random.randint(0, n // 10, size=n)] + if dtype == 'object': + arr = ['s%04d' % i for i in arr] + self.sample = np.random.choice(arr, sample_size) + self.series = pd.Series(arr).astype('category') + + def time_isin_categorical(self, dtype): + self.series.isin(self.sample) diff --git a/doc/source/whatsnew/v0.23.0.txt b/doc/source/whatsnew/v0.23.0.txt index 43e384b01ad2c..2b73a84810045 100644 --- a/doc/source/whatsnew/v0.23.0.txt +++ b/doc/source/whatsnew/v0.23.0.txt @@ -954,6 +954,7 @@ Performance Improvements - Improved performance of :func:`pandas.core.groupby.GroupBy.ffill` and :func:`pandas.core.groupby.GroupBy.bfill` (:issue:`11296`) - Improved performance of :func:`pandas.core.groupby.GroupBy.any` and :func:`pandas.core.groupby.GroupBy.all` (:issue:`15435`) - Improved performance of :func:`pandas.core.groupby.GroupBy.pct_change` (:issue:`19165`) +- Improved 
performance of :func:`Series.isin` in the case of categorical dtypes (:issue:`20003`) - Fixed a performance regression for :func:`GroupBy.nth` and :func:`GroupBy.last` with some object columns (:issue:`19283`) .. _whatsnew_0230.docs: diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index 065a5782aced1..5493348334223 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -407,6 +407,13 @@ def isin(comps, values): if not isinstance(values, (ABCIndex, ABCSeries, np.ndarray)): values = construct_1d_object_array_from_listlike(list(values)) + if is_categorical_dtype(comps): + # TODO(extension) + # handle categoricals + return comps._values.isin(values) + + comps = com._values_from_object(comps) + comps, dtype, _ = _ensure_data(comps) values, _, _ = _ensure_data(values, dtype=dtype) diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index 599161521f3a7..7f0d54de9def8 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -39,6 +39,8 @@ from pandas.util._decorators import ( Appender, cache_readonly, deprecate_kwarg, Substitution) +import pandas.core.algorithms as algorithms + from pandas.io.formats.terminal import get_terminal_size from pandas.util._validators import validate_bool_kwarg, validate_fillna_kwargs from pandas.core.config import get_option @@ -2216,6 +2218,60 @@ def _concat_same_type(self, to_concat): def _formatting_values(self): return self + def isin(self, values): + """ + Check whether `values` are contained in Categorical. + + Return a boolean NumPy Array showing whether each element in + the Categorical matches an element in the passed sequence of + `values` exactly. + + Parameters + ---------- + values : set or list-like + The sequence of values to test. Passing in a single string will + raise a ``TypeError``. Instead, turn a single string into a + list of one element. 
+ + Returns + ------- + isin : numpy.ndarray (bool dtype) + + Raises + ------ + TypeError + * If `values` is not a set or list-like + + See Also + -------- + pandas.Series.isin : equivalent method on Series + + Examples + -------- + + >>> s = pd.Categorical(['lama', 'cow', 'lama', 'beetle', 'lama', + ... 'hippo']) + >>> s.isin(['cow', 'lama']) + array([ True, True, True, False, True, False]) + + Passing a single string as ``s.isin('lama')`` will raise an error. Use + a list of one element instead: + + >>> s.isin(['lama']) + array([ True, False, True, False, True, False]) + """ + from pandas.core.series import _sanitize_array + if not is_list_like(values): + raise TypeError("only list-like objects are allowed to be passed" + " to isin(), you passed a [{values_type}]" + .format(values_type=type(values).__name__)) + values = _sanitize_array(values, None, None) + null_mask = np.asarray(isna(values)) + code_values = self.categories.get_indexer(values) + code_values = code_values[null_mask | (code_values >= 0)] + return algorithms.isin(self.codes, code_values) + + # The Series.cat accessor diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 3d60eefc5b598..21006c4831ac5 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -3516,7 +3516,7 @@ def isin(self, values, level=None): """ if level is not None: self._validate_index_level(level) - return algos.isin(np.array(self), values) + return algos.isin(self, values) def _can_reindex(self, indexer): """ diff --git a/pandas/core/series.py b/pandas/core/series.py index aa4cb510feb62..f2ee225f50514 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -3567,7 +3567,7 @@ def isin(self, values): 5 False Name: animal, dtype: bool """ - result = algorithms.isin(com._values_from_object(self), values) + result = algorithms.isin(self, values) return self._constructor(result, index=self.index).__finalize__(self) def between(self, left, right, inclusive=True): diff --git 
a/pandas/tests/categorical/test_algos.py b/pandas/tests/categorical/test_algos.py index f727184e862d8..1c68377786dd4 100644 --- a/pandas/tests/categorical/test_algos.py +++ b/pandas/tests/categorical/test_algos.py @@ -47,3 +47,25 @@ def test_factorized_sort_ordered(): tm.assert_numpy_array_equal(labels, expected_labels) tm.assert_categorical_equal(uniques, expected_uniques) + + +def test_isin_cats(): + # GH20003 + cat = pd.Categorical(["a", "b", np.nan]) + + result = cat.isin(["a", np.nan]) + expected = np.array([True, False, True], dtype=bool) + tm.assert_numpy_array_equal(expected, result) + + result = cat.isin(["a", "c"]) + expected = np.array([True, False, False], dtype=bool) + tm.assert_numpy_array_equal(expected, result) + + +@pytest.mark.parametrize("empty", [[], pd.Series(), np.array([])]) +def test_isin_empty(empty): + s = pd.Categorical(["a", "b"]) + expected = np.array([False, False], dtype=bool) + + result = s.isin(empty) + tm.assert_numpy_array_equal(expected, result)