Skip to content

Commit

Permalink
PERF: GH20003 Series.isin for categorical dtypes (#20522)
Browse files Browse the repository at this point in the history
  • Loading branch information
bourbaki authored and jreback committed Apr 25, 2018
1 parent 7ec74e5 commit 60fe82c
Show file tree
Hide file tree
Showing 7 changed files with 109 additions and 2 deletions.
21 changes: 21 additions & 0 deletions asv_bench/benchmarks/categoricals.py
Original file line number Diff line number Diff line change
Expand Up @@ -148,3 +148,24 @@ def time_rank_int_cat(self):

def time_rank_int_cat_ordered(self):
    # Benchmark body: time a single ``rank()`` call on ``s_int_cat_ordered``.
    # NOTE(review): presumably an ordered integer categorical Series built in
    # this class's ``setup`` (not visible here) -- confirm.
    self.s_int_cat_ordered.rank()


class Isin(object):
    """Benchmark ``Series.isin`` on a categorical-dtype Series.

    The benchmark is parametrized over the kind of underlying values:
    formatted strings (``'object'``) or integers (``'int64'``).
    """

    goal_time = 0.2

    params = ['object', 'int64']
    param_names = ['dtype']

    def setup(self, dtype):
        """Create the categorical Series and the lookup sample."""
        np.random.seed(1234)
        n = 5 * 10**5
        sample_size = 100
        # Values come from [0, n // 10), so the data contains many repeats.
        draws = np.random.randint(0, n // 10, size=n)
        if dtype == 'object':
            arr = ['s%04d' % i for i in draws]
        else:
            arr = list(draws)
        # The sample is drawn from the data itself, so every looked-up
        # value is present in the Series.
        self.sample = np.random.choice(arr, sample_size)
        self.series = pd.Series(arr).astype('category')

    def time_isin_categorical(self, dtype):
        """Time ``isin`` of the categorical Series against the sample."""
        self.series.isin(self.sample)
1 change: 1 addition & 0 deletions doc/source/whatsnew/v0.23.0.txt
Original file line number Diff line number Diff line change
Expand Up @@ -954,6 +954,7 @@ Performance Improvements
- Improved performance of :func:`pandas.core.groupby.GroupBy.ffill` and :func:`pandas.core.groupby.GroupBy.bfill` (:issue:`11296`)
- Improved performance of :func:`pandas.core.groupby.GroupBy.any` and :func:`pandas.core.groupby.GroupBy.all` (:issue:`15435`)
- Improved performance of :func:`pandas.core.groupby.GroupBy.pct_change` (:issue:`19165`)
- Improved performance of :func:`Series.isin` in the case of categorical dtypes (:issue:`20003`)
- Fixed a performance regression for :func:`GroupBy.nth` and :func:`GroupBy.last` with some object columns (:issue:`19283`)

.. _whatsnew_0230.docs:
Expand Down
7 changes: 7 additions & 0 deletions pandas/core/algorithms.py
Original file line number Diff line number Diff line change
Expand Up @@ -407,6 +407,13 @@ def isin(comps, values):
if not isinstance(values, (ABCIndex, ABCSeries, np.ndarray)):
values = construct_1d_object_array_from_listlike(list(values))

if is_categorical_dtype(comps):
# TODO(extension)
# handle categoricals
return comps._values.isin(values)

comps = com._values_from_object(comps)

comps, dtype, _ = _ensure_data(comps)
values, _, _ = _ensure_data(values, dtype=dtype)

Expand Down
56 changes: 56 additions & 0 deletions pandas/core/arrays/categorical.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,8 @@
from pandas.util._decorators import (
Appender, cache_readonly, deprecate_kwarg, Substitution)

import pandas.core.algorithms as algorithms

from pandas.io.formats.terminal import get_terminal_size
from pandas.util._validators import validate_bool_kwarg, validate_fillna_kwargs
from pandas.core.config import get_option
Expand Down Expand Up @@ -2216,6 +2218,60 @@ def _concat_same_type(self, to_concat):
def _formatting_values(self):
return self

def isin(self, values):
    """
    Test each element of the Categorical for membership in `values`.

    The result is a boolean NumPy array with one entry per element of
    the Categorical; an entry is True when that element exactly matches
    one of the passed `values`.

    Parameters
    ----------
    values : set or list-like
        The sequence of values to test. Passing in a single string will
        raise a ``TypeError``. Instead, turn a single string into a
        list of one element.

    Returns
    -------
    isin : numpy.ndarray (bool dtype)

    Raises
    ------
    TypeError
      * If `values` is not a set or list-like

    See Also
    --------
    pandas.Series.isin : equivalent method on Series

    Examples
    --------
    >>> s = pd.Categorical(['lama', 'cow', 'lama', 'beetle', 'lama',
    ...                     'hippo'])
    >>> s.isin(['cow', 'lama'])
    array([ True,  True,  True, False,  True, False])

    Passing a single string as ``s.isin('lama')`` will raise an error. Use
    a list of one element instead:

    >>> s.isin(['lama'])
    array([ True, False,  True, False,  True, False])
    """
    from pandas.core.series import _sanitize_array

    if not is_list_like(values):
        template = ("only list-like objects are allowed to be passed"
                    " to isin(), you passed a [{values_type}]")
        raise TypeError(template.format(values_type=type(values).__name__))

    values = _sanitize_array(values, None, None)

    # Look the requested values up in the categories; the membership test
    # is then done on integer codes rather than on the values themselves.
    indexer = self.categories.get_indexer(values)
    is_missing = np.asarray(isna(values))

    # Drop look-ups that found no category (negative code), except for
    # null values, whose code is kept so that nulls can still be matched.
    wanted_codes = indexer[(indexer >= 0) | is_missing]
    return algorithms.isin(self.codes, wanted_codes)


# The Series.cat accessor


Expand Down
2 changes: 1 addition & 1 deletion pandas/core/indexes/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -3516,7 +3516,7 @@ def isin(self, values, level=None):
"""
if level is not None:
self._validate_index_level(level)
return algos.isin(np.array(self), values)
return algos.isin(self, values)

def _can_reindex(self, indexer):
"""
Expand Down
2 changes: 1 addition & 1 deletion pandas/core/series.py
Original file line number Diff line number Diff line change
Expand Up @@ -3567,7 +3567,7 @@ def isin(self, values):
5 False
Name: animal, dtype: bool
"""
result = algorithms.isin(com._values_from_object(self), values)
result = algorithms.isin(self, values)
return self._constructor(result, index=self.index).__finalize__(self)

def between(self, left, right, inclusive=True):
Expand Down
22 changes: 22 additions & 0 deletions pandas/tests/categorical/test_algos.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,3 +47,25 @@ def test_factorized_sort_ordered():

tm.assert_numpy_array_equal(labels, expected_labels)
tm.assert_categorical_equal(uniques, expected_uniques)


def test_isin_cats():
    # GH 20003
    cat = pd.Categorical(["a", "b", np.nan])

    # Each case pairs the values to look up with the expected mask:
    # a NaN in the values matches the missing element, while "c" (not
    # present in the Categorical) matches nothing.
    cases = [
        (["a", np.nan], [True, False, True]),
        (["a", "c"], [True, False, False]),
    ]
    for values, mask in cases:
        result = cat.isin(values)
        expected = np.array(mask, dtype=bool)
        tm.assert_numpy_array_equal(expected, result)


@pytest.mark.parametrize("empty", [[], pd.Series(), np.array([])])
def test_isin_empty(empty):
    # With nothing to look up, no element of the Categorical can match.
    cat = pd.Categorical(["a", "b"])
    result = cat.isin(empty)

    expected = np.zeros(2, dtype=bool)
    tm.assert_numpy_array_equal(expected, result)

0 comments on commit 60fe82c

Please sign in to comment.