From 19ac11a25cc4239b419612d71670171a59b82ff7 Mon Sep 17 00:00:00 2001 From: Artem Bogachev Date: Mon, 26 Mar 2018 23:56:38 +0300 Subject: [PATCH 01/18] PERF: GH20003 Series.isin for categorical dtypes --- doc/source/whatsnew/v0.23.0.txt | 1 + pandas/core/algorithms.py | 11 +++++++++-- pandas/core/series.py | 6 +++++- pandas/tests/series/test_analytics.py | 11 +++++++++++ 4 files changed, 26 insertions(+), 3 deletions(-) diff --git a/doc/source/whatsnew/v0.23.0.txt b/doc/source/whatsnew/v0.23.0.txt index 9159c03edee2e..3955d318e6fa7 100644 --- a/doc/source/whatsnew/v0.23.0.txt +++ b/doc/source/whatsnew/v0.23.0.txt @@ -345,6 +345,7 @@ Other Enhancements ``SQLAlchemy`` dialects supporting multivalue inserts include: ``mysql``, ``postgresql``, ``sqlite`` and any dialect with ``supports_multivalues_insert``. (:issue:`14315`, :issue:`8953`) - :func:`read_html` now accepts a ``displayed_only`` keyword argument to controls whether or not hidden elements are parsed (``True`` by default) (:issue:`20027`) - zip compression is supported via ``compression=zip`` in :func:`DataFrame.to_pickle`, :func:`Series.to_pickle`, :func:`DataFrame.to_csv`, :func:`Series.to_csv`, :func:`DataFrame.to_json`, :func:`Series.to_json`. (:issue:`17778`) +- Performance enhancement for :func:`Series.isin` in the case of categorical dtypes (:issue:`20003`) .. 
_whatsnew_0230.api_breaking: diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index de2e638265f1e..00089e40cb0b2 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -403,8 +403,15 @@ def isin(comps, values): if not isinstance(values, (ABCIndex, ABCSeries, np.ndarray)): values = construct_1d_object_array_from_listlike(list(values)) - comps, dtype, _ = _ensure_data(comps) - values, _, _ = _ensure_data(values, dtype=dtype) + if not is_categorical_dtype(comps): + comps, dtype, _ = _ensure_data(comps) + values, _, _ = _ensure_data(values, dtype=dtype) + else: + cats = comps.cat.categories + comps = comps.cat.codes.values + mask = isna(values) + values = cats.get_indexer(values) + values = values[mask | (values >= 0)] # faster for larger cases to use np.in1d f = lambda x, y: htable.ismember_object(x, values) diff --git a/pandas/core/series.py b/pandas/core/series.py index da598259d272d..be9b70c620302 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -3507,7 +3507,11 @@ def isin(self, values): 5 False Name: animal, dtype: bool """ - result = algorithms.isin(com._values_from_object(self), values) + if is_categorical_dtype(self.dtype): + result = algorithms.isin(self, values) + else: + result = algorithms.isin(com._values_from_object(self), values) + return self._constructor(result, index=self.index).__finalize__(self) def between(self, left, right, inclusive=True): diff --git a/pandas/tests/series/test_analytics.py b/pandas/tests/series/test_analytics.py index 0e6e44e839464..b997039c54902 100644 --- a/pandas/tests/series/test_analytics.py +++ b/pandas/tests/series/test_analytics.py @@ -1255,6 +1255,17 @@ def test_isin_empty(self, empty): result = s.isin(empty) tm.assert_series_equal(expected, result) + def test_isin_cats(self): + s = Series(["a", "b", np.nan]).astype("category") + + result = s.isin(["a", np.nan]) + expected = Series([True, False, True]) + tm.assert_series_equal(expected, result) + + result = 
s.isin(["a", "c"]) + expected = Series([True, False, False]) + tm.assert_series_equal(expected, result) + def test_timedelta64_analytics(self): from pandas import date_range From 54021b9adc1a2756e267aedc8e5cf8172caf5d48 Mon Sep 17 00:00:00 2001 From: Artem Bogachev Date: Fri, 30 Mar 2018 23:38:16 +0300 Subject: [PATCH 02/18] Add Categorical.isin method --- pandas/core/algorithms.py | 11 ++--------- pandas/core/arrays/categorical.py | 11 +++++++++++ pandas/core/series.py | 5 ++--- pandas/tests/categorical/test_algos.py | 21 +++++++++++++++++++++ pandas/tests/series/test_analytics.py | 11 ----------- 5 files changed, 36 insertions(+), 23 deletions(-) diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index 00089e40cb0b2..de2e638265f1e 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -403,15 +403,8 @@ def isin(comps, values): if not isinstance(values, (ABCIndex, ABCSeries, np.ndarray)): values = construct_1d_object_array_from_listlike(list(values)) - if not is_categorical_dtype(comps): - comps, dtype, _ = _ensure_data(comps) - values, _, _ = _ensure_data(values, dtype=dtype) - else: - cats = comps.cat.categories - comps = comps.cat.codes.values - mask = isna(values) - values = cats.get_indexer(values) - values = values[mask | (values >= 0)] + comps, dtype, _ = _ensure_data(comps) + values, _, _ = _ensure_data(values, dtype=dtype) # faster for larger cases to use np.in1d f = lambda x, y: htable.ismember_object(x, values) diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index 6eadef37da344..205b80ccd859b 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -40,6 +40,8 @@ from pandas.util._decorators import ( Appender, cache_readonly, deprecate_kwarg, Substitution) +import pandas.core.algorithms as algorithms + from pandas.io.formats.terminal import get_terminal_size from pandas.util._validators import validate_bool_kwarg, validate_fillna_kwargs from 
pandas.core.config import get_option @@ -2261,6 +2263,15 @@ def _concat_same_type(self, to_concat): def _formatting_values(self): return self + def isin(self, values): + from pandas.core.series import _sanitize_array + values = _sanitize_array(values, None, None) + null_mask = isna(values) + code_values = self.categories.get_indexer(values) + code_values = code_values[null_mask | (code_values >= 0)] + return algorithms.isin(self.codes, code_values) + + # The Series.cat accessor diff --git a/pandas/core/series.py b/pandas/core/series.py index be9b70c620302..56a3bc8bc4e1e 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -3507,11 +3507,10 @@ def isin(self, values): 5 False Name: animal, dtype: bool """ - if is_categorical_dtype(self.dtype): - result = algorithms.isin(self, values) + if is_categorical_dtype(self): + result = self._values.isin(values) else: result = algorithms.isin(com._values_from_object(self), values) - return self._constructor(result, index=self.index).__finalize__(self) def between(self, left, right, inclusive=True): diff --git a/pandas/tests/categorical/test_algos.py b/pandas/tests/categorical/test_algos.py index 61764ec0ff632..109d97b898a90 100644 --- a/pandas/tests/categorical/test_algos.py +++ b/pandas/tests/categorical/test_algos.py @@ -47,3 +47,24 @@ def test_factorized_sort_ordered(): tm.assert_numpy_array_equal(labels, expected_labels) tm.assert_categorical_equal(uniques, expected_uniques) + + +def test_isin_cats(): + cat = pd.Categorical(["a", "b", np.nan]) + + result = cat.isin(["a", np.nan]) + expected = np.array([True, False, True], dtype=bool) + tm.assert_numpy_array_equal(expected, result) + + result = cat.isin(["a", "c"]) + expected = np.array([True, False, False], dtype=bool) + tm.assert_numpy_array_equal(expected, result) + + +@pytest.mark.parametrize("empty", [[], pd.Series(), np.array([])]) +def test_isin_empty(empty): + s = pd.Categorical(["a", "b"]) + expected = np.array([False, False], dtype=bool) + + result = 
s.isin(empty) + tm.assert_numpy_array_equal(expected, result) diff --git a/pandas/tests/series/test_analytics.py b/pandas/tests/series/test_analytics.py index b997039c54902..0e6e44e839464 100644 --- a/pandas/tests/series/test_analytics.py +++ b/pandas/tests/series/test_analytics.py @@ -1255,17 +1255,6 @@ def test_isin_empty(self, empty): result = s.isin(empty) tm.assert_series_equal(expected, result) - def test_isin_cats(self): - s = Series(["a", "b", np.nan]).astype("category") - - result = s.isin(["a", np.nan]) - expected = Series([True, False, True]) - tm.assert_series_equal(expected, result) - - result = s.isin(["a", "c"]) - expected = Series([True, False, False]) - tm.assert_series_equal(expected, result) - def test_timedelta64_analytics(self): from pandas import date_range From 2514b45c583b73c97694334e094d9af199a8ffa7 Mon Sep 17 00:00:00 2001 From: Artem Bogachev Date: Fri, 30 Mar 2018 23:45:45 +0300 Subject: [PATCH 03/18] Add benchmark --- asv_bench/benchmarks/categoricals.py | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/asv_bench/benchmarks/categoricals.py b/asv_bench/benchmarks/categoricals.py index 7743921003353..ef6e1aee7fc5f 100644 --- a/asv_bench/benchmarks/categoricals.py +++ b/asv_bench/benchmarks/categoricals.py @@ -148,3 +148,18 @@ def time_rank_int_cat(self): def time_rank_int_cat_ordered(self): self.s_int_cat_ordered.rank() + + +class IsIn(object): + + goal_time = 0.2 + + def setup(self): + n = 5 * 10**5 + sample_size = 100 + arr = ['s%04d' % i for i in np.random.randint(0, n // 10, size=n)] + self.sample = np.random.choice(arr, sample_size) + self.ts = pd.Series(arr).astype('category') + + def time_set_categories(self): + self.ts.isin(self.sample) From 80f687a436634670ef73e85e89c048720abb1131 Mon Sep 17 00:00:00 2001 From: Artem Bogachev Date: Sat, 31 Mar 2018 00:46:40 +0300 Subject: [PATCH 04/18] Rename benchmark --- asv_bench/benchmarks/categoricals.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git 
a/asv_bench/benchmarks/categoricals.py b/asv_bench/benchmarks/categoricals.py index ef6e1aee7fc5f..25268ac0cd10b 100644 --- a/asv_bench/benchmarks/categoricals.py +++ b/asv_bench/benchmarks/categoricals.py @@ -161,5 +161,5 @@ def setup(self): self.sample = np.random.choice(arr, sample_size) self.ts = pd.Series(arr).astype('category') - def time_set_categories(self): + def time_isin_categorical_strings(self): self.ts.isin(self.sample) From d6c39534651a5fedca537d0ce092e13cb20aae01 Mon Sep 17 00:00:00 2001 From: Artem Bogachev Date: Sat, 31 Mar 2018 00:47:27 +0300 Subject: [PATCH 05/18] change what's new --- doc/source/whatsnew/v0.23.0.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v0.23.0.txt b/doc/source/whatsnew/v0.23.0.txt index 3955d318e6fa7..7b113bf2d5831 100644 --- a/doc/source/whatsnew/v0.23.0.txt +++ b/doc/source/whatsnew/v0.23.0.txt @@ -345,7 +345,6 @@ Other Enhancements ``SQLAlchemy`` dialects supporting multivalue inserts include: ``mysql``, ``postgresql``, ``sqlite`` and any dialect with ``supports_multivalues_insert``. (:issue:`14315`, :issue:`8953`) - :func:`read_html` now accepts a ``displayed_only`` keyword argument to controls whether or not hidden elements are parsed (``True`` by default) (:issue:`20027`) - zip compression is supported via ``compression=zip`` in :func:`DataFrame.to_pickle`, :func:`Series.to_pickle`, :func:`DataFrame.to_csv`, :func:`Series.to_csv`, :func:`DataFrame.to_json`, :func:`Series.to_json`. (:issue:`17778`) -- Performance enhancement for :func:`Series.isin` in the case of categorical dtypes (:issue:`20003`) .. 
_whatsnew_0230.api_breaking: @@ -803,6 +802,7 @@ Performance Improvements - Improved performance of :func:`pandas.core.groupby.GroupBy.ffill` and :func:`pandas.core.groupby.GroupBy.bfill` (:issue:`11296`) - Improved performance of :func:`pandas.core.groupby.GroupBy.any` and :func:`pandas.core.groupby.GroupBy.all` (:issue:`15435`) - Improved performance of :func:`pandas.core.groupby.GroupBy.pct_change` (:issue:`19165`) +- Improved performance of :func:`Series.isin` in the case of categorical dtypes (:issue:`20003`) .. _whatsnew_0230.docs: From ceffccd1a4f563d6087dfb5aa2a1a8c118010093 Mon Sep 17 00:00:00 2001 From: Artem Bogachev Date: Tue, 3 Apr 2018 01:32:59 +0300 Subject: [PATCH 06/18] rf: more generic check --- pandas/core/series.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/series.py b/pandas/core/series.py index ec9e9522e1d8b..647f4cc019c67 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -3564,7 +3564,7 @@ def isin(self, values): 5 False Name: animal, dtype: bool """ - if is_categorical_dtype(self): + if hasattr(self._values, 'isin'): result = self._values.isin(values) else: result = algorithms.isin(com._values_from_object(self), values) From 3247dce814377881150e8e05bfeb5a4899692ae0 Mon Sep 17 00:00:00 2001 From: Artem Bogachev Date: Thu, 5 Apr 2018 19:41:27 +0300 Subject: [PATCH 07/18] Move the rest of isin logic to algorithms --- pandas/core/algorithms.py | 6 ++++++ pandas/core/indexes/base.py | 2 +- pandas/core/series.py | 5 +---- 3 files changed, 8 insertions(+), 5 deletions(-) diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index 065a5782aced1..df34b3540fa3b 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -407,6 +407,12 @@ def isin(comps, values): if not isinstance(values, (ABCIndex, ABCSeries, np.ndarray)): values = construct_1d_object_array_from_listlike(list(values)) + if is_categorical_dtype(comps): + # handle categoricals + return 
comps._values.isin(values) + + comps = com._values_from_object(comps) + comps, dtype, _ = _ensure_data(comps) values, _, _ = _ensure_data(values, dtype=dtype) diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 12bb09e8f8a8a..586190fbc65d8 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -3487,7 +3487,7 @@ def isin(self, values, level=None): """ if level is not None: self._validate_index_level(level) - return algos.isin(np.array(self), values) + return algos.isin(self, values) def _can_reindex(self, indexer): """ diff --git a/pandas/core/series.py b/pandas/core/series.py index 647f4cc019c67..89916872651ac 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -3564,10 +3564,7 @@ def isin(self, values): 5 False Name: animal, dtype: bool """ - if hasattr(self._values, 'isin'): - result = self._values.isin(values) - else: - result = algorithms.isin(com._values_from_object(self), values) + result = algorithms.isin(self, values) return self._constructor(result, index=self.index).__finalize__(self) def between(self, left, right, inclusive=True): From 2b7b1c48e11ccb5d2e9ea1d6eeb9146e977c6aad Mon Sep 17 00:00:00 2001 From: Artem Bogachev Date: Fri, 6 Apr 2018 01:23:30 +0300 Subject: [PATCH 08/18] Fix for null mask --- pandas/core/arrays/categorical.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index 0163d151029c6..506d04519fe32 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -2221,7 +2221,7 @@ def _formatting_values(self): def isin(self, values): from pandas.core.series import _sanitize_array values = _sanitize_array(values, None, None) - null_mask = isna(values) + null_mask = np.asarray(isna(values)) code_values = self.categories.get_indexer(values) code_values = code_values[null_mask | (code_values >= 0)] return algorithms.isin(self.codes, code_values) From 
4478a49c6829a7afdfc7c240a57d34eff2c330c8 Mon Sep 17 00:00:00 2001 From: Artem Bogachev Date: Sat, 7 Apr 2018 12:30:54 +0300 Subject: [PATCH 09/18] Add docs and raise error on non-list-like --- pandas/core/arrays/categorical.py | 44 +++++++++++++++++++++++++++++++ 1 file changed, 44 insertions(+) diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index 506d04519fe32..6d998085ad175 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -2219,6 +2219,50 @@ def _formatting_values(self): return self def isin(self, values): + """ + Check whether `values` are contained in Categorical. + + Return a boolean NumPy Array showing whether each element in the Categorical + matches an element in the passed sequence of `values` exactly. + + Parameters + ---------- + values : set or list-like + The sequence of values to test. Passing in a single string will + raise a ``TypeError``. Instead, turn a single string into a + list of one element. + + Returns + ------- + isin : numpy.ndarray (bool dtype) + + Raises + ------ + TypeError + * If `values` is a string + + See Also + -------- + pandas.Series.isin : equivalent method on Series + + Examples + -------- + + >>> s = pd.Categorical(['lama', 'cow', 'lama', 'beetle', 'lama', + ... 'hippo']) + >>> s.isin(['cow', 'lama']) + array([ True, True, True, False, True, False]) + + Passing a single string as ``s.isin('lama')`` will raise an error. 
Use + a list of one element instead: + + >>> s.isin(['lama']) + array([ True, False, True, False, True, False]) + """ + if not is_list_like(values): + raise TypeError("only list-like objects are allowed to be passed" + " to isin(), you passed a [{values_type}]" + .format(values_type=type(values).__name__)) from pandas.core.series import _sanitize_array values = _sanitize_array(values, None, None) null_mask = np.asarray(isna(values)) From 64fef493ce02fd2136996fa62643579ef06b6805 Mon Sep 17 00:00:00 2001 From: Artem Bogachev Date: Sat, 7 Apr 2018 12:46:39 +0300 Subject: [PATCH 10/18] fix doc line --- pandas/core/arrays/categorical.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index 6d998085ad175..baa73e9e4a1fc 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -2222,8 +2222,9 @@ def isin(self, values): """ Check whether `values` are contained in Categorical. - Return a boolean NumPy Array showing whether each element in the Categorical - matches an element in the passed sequence of `values` exactly. + Return a boolean NumPy Array showing whether each element in + the Categorical matches an element in the passed sequence of + `values` exactly. 
Parameters ---------- From b25da12a69af24219fecfb1bf3dfdfde39e06d94 Mon Sep 17 00:00:00 2001 From: Artem Bogachev Date: Mon, 9 Apr 2018 02:58:48 +0300 Subject: [PATCH 11/18] refactor benchmark name and add reference to issue --- asv_bench/benchmarks/categoricals.py | 2 +- pandas/tests/categorical/test_algos.py | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/asv_bench/benchmarks/categoricals.py b/asv_bench/benchmarks/categoricals.py index 25268ac0cd10b..1889830ee17d4 100644 --- a/asv_bench/benchmarks/categoricals.py +++ b/asv_bench/benchmarks/categoricals.py @@ -150,7 +150,7 @@ def time_rank_int_cat_ordered(self): self.s_int_cat_ordered.rank() -class IsIn(object): +class Isin(object): goal_time = 0.2 diff --git a/pandas/tests/categorical/test_algos.py b/pandas/tests/categorical/test_algos.py index dcda226bfd23e..1c68377786dd4 100644 --- a/pandas/tests/categorical/test_algos.py +++ b/pandas/tests/categorical/test_algos.py @@ -50,6 +50,7 @@ def test_factorized_sort_ordered(): def test_isin_cats(): + # GH20003 cat = pd.Categorical(["a", "b", np.nan]) result = cat.isin(["a", np.nan]) From 9f8e7906f3d2f3a87182a3d5e940b9744c43626b Mon Sep 17 00:00:00 2001 From: Artem Bogachev Date: Sun, 15 Apr 2018 20:21:03 +0300 Subject: [PATCH 12/18] add todo --- pandas/core/algorithms.py | 1 + 1 file changed, 1 insertion(+) diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index df34b3540fa3b..5493348334223 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -408,6 +408,7 @@ def isin(comps, values): values = construct_1d_object_array_from_listlike(list(values)) if is_categorical_dtype(comps): + # TODO(extension) # handle categoricals return comps._values.isin(values) From 60ac65864330051eb2177edb28b18e95e6d4665d Mon Sep 17 00:00:00 2001 From: Artem Bogachev Date: Sun, 15 Apr 2018 20:24:15 +0300 Subject: [PATCH 13/18] move import from the function to the top of the file --- pandas/core/arrays/categorical.py | 2 +- 1 file changed,
1 insertion(+), 1 deletion(-) diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index baa73e9e4a1fc..d2cc92ebaaab0 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -44,6 +44,7 @@ from pandas.io.formats.terminal import get_terminal_size from pandas.util._validators import validate_bool_kwarg, validate_fillna_kwargs from pandas.core.config import get_option +from pandas.core.series import _sanitize_array from .base import ExtensionArray @@ -2264,7 +2265,6 @@ def isin(self, values): raise TypeError("only list-like objects are allowed to be passed" " to isin(), you passed a [{values_type}]" .format(values_type=type(values).__name__)) - from pandas.core.series import _sanitize_array values = _sanitize_array(values, None, None) null_mask = np.asarray(isna(values)) code_values = self.categories.get_indexer(values) From 50aca261b584b564f843877d34f65bb21069bcf0 Mon Sep 17 00:00:00 2001 From: Artem Bogachev Date: Sun, 15 Apr 2018 20:44:35 +0300 Subject: [PATCH 14/18] add int64 benchmark --- asv_bench/benchmarks/categoricals.py | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/asv_bench/benchmarks/categoricals.py b/asv_bench/benchmarks/categoricals.py index 1889830ee17d4..42906787ce32d 100644 --- a/asv_bench/benchmarks/categoricals.py +++ b/asv_bench/benchmarks/categoricals.py @@ -154,12 +154,20 @@ class Isin(object): goal_time = 0.2 - def setup(self): + params = ['int64', 'object'] + param_names = ['dtype'] + + def setup(self, dtype): n = 5 * 10**5 sample_size = 100 - arr = ['s%04d' % i for i in np.random.randint(0, n // 10, size=n)] + if dtype == "int64": + arr = [i for i in np.random.randint(0, n // 10, size=n)] + else: + arr = ['s%04d' % i for i in np.random.randint(0, n // 10, size=n)] + np.random.seed(1234) self.sample = np.random.choice(arr, sample_size) self.ts = pd.Series(arr).astype('category') - def time_isin_categorical_strings(self): + def 
time_isin_categorical(self): self.ts.isin(self.sample) + From 713712eac335891904e784b9844308c46627e8e6 Mon Sep 17 00:00:00 2001 From: Artem Bogachev Date: Sun, 15 Apr 2018 21:20:57 +0300 Subject: [PATCH 15/18] move import to the top of the function --- pandas/core/arrays/categorical.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index d2cc92ebaaab0..5a06e47ba395c 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -44,7 +44,6 @@ from pandas.io.formats.terminal import get_terminal_size from pandas.util._validators import validate_bool_kwarg, validate_fillna_kwargs from pandas.core.config import get_option -from pandas.core.series import _sanitize_array from .base import ExtensionArray @@ -2261,6 +2260,7 @@ def isin(self, values): >>> s.isin(['lama']) array([ True, False, True, False, True, False]) """ + from pandas.core.series import _sanitize_array if not is_list_like(values): raise TypeError("only list-like objects are allowed to be passed" " to isin(), you passed a [{values_type}]" From 18c827d391a9bd6841eb48cabf46eb2b88416464 Mon Sep 17 00:00:00 2001 From: Artem Bogachev Date: Sun, 15 Apr 2018 22:43:21 +0300 Subject: [PATCH 16/18] add int64 categorical test --- asv_bench/benchmarks/categoricals.py | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/asv_bench/benchmarks/categoricals.py b/asv_bench/benchmarks/categoricals.py index 42906787ce32d..1db3f783e1604 100644 --- a/asv_bench/benchmarks/categoricals.py +++ b/asv_bench/benchmarks/categoricals.py @@ -154,20 +154,19 @@ class Isin(object): goal_time = 0.2 - params = ['int64', 'object'] + params = ['object', 'int64'] param_names = ['dtype'] def setup(self, dtype): + np.random.seed(1234) n = 5 * 10**5 sample_size = 100 - if dtype == "int64": - arr = [i for i in np.random.randint(0, n // 10, size=n)] - else: - arr = ['s%04d' % i for i in np.random.randint(0, n 
// 10, size=n)] - np.random.seed(1234) + arr = [i for i in np.random.randint(0, n // 10, size=n)] + if dtype == 'object': + arr = ['s%04d' % i for i in arr] self.sample = np.random.choice(arr, sample_size) self.ts = pd.Series(arr).astype('category') - def time_isin_categorical(self): + def time_isin_categorical(self, dtype): self.ts.isin(self.sample) From a2b70ee12f4496d54ebee3cf6a6db76f4fbee6c5 Mon Sep 17 00:00:00 2001 From: Artem Bogachev Date: Wed, 18 Apr 2018 13:35:10 +0300 Subject: [PATCH 17/18] rename variable in benchmark --- asv_bench/benchmarks/categoricals.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/asv_bench/benchmarks/categoricals.py b/asv_bench/benchmarks/categoricals.py index 1db3f783e1604..04d25480f78a5 100644 --- a/asv_bench/benchmarks/categoricals.py +++ b/asv_bench/benchmarks/categoricals.py @@ -165,8 +165,8 @@ def setup(self, dtype): if dtype == 'object': arr = ['s%04d' % i for i in arr] self.sample = np.random.choice(arr, sample_size) - self.ts = pd.Series(arr).astype('category') + self.series = pd.Series(arr).astype('category') def time_isin_categorical(self, dtype): - self.ts.isin(self.sample) + self.series.isin(self.sample) From 7b680cd239a6150b42b7cf8554f38c5c3beec368 Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Wed, 25 Apr 2018 06:01:44 -0400 Subject: [PATCH 18/18] whitespace --- asv_bench/benchmarks/categoricals.py | 1 - pandas/core/arrays/categorical.py | 2 +- 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/asv_bench/benchmarks/categoricals.py b/asv_bench/benchmarks/categoricals.py index 04d25480f78a5..0ffd5f881d626 100644 --- a/asv_bench/benchmarks/categoricals.py +++ b/asv_bench/benchmarks/categoricals.py @@ -169,4 +169,3 @@ def setup(self, dtype): def time_isin_categorical(self, dtype): self.series.isin(self.sample) - diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index 90656c15d9e37..7f0d54de9def8 100644 --- a/pandas/core/arrays/categorical.py +++ 
b/pandas/core/arrays/categorical.py @@ -2240,7 +2240,7 @@ def isin(self, values): Raises ------ TypeError - * If `values` is a string + * If `values` is not a set or list-like See Also --------