From a1dfb037de79e6982a0e7ccf883e5af11e9cc843 Mon Sep 17 00:00:00 2001 From: Morgan Stuart Date: Tue, 18 Jul 2017 19:31:51 -0400 Subject: [PATCH] BUG: Large object array isin closes #16012 Author: Morgan Stuart Closes #16969 from Morgan243/large_array_isin and squashes the following commits: 31cb4b3 [Morgan Stuart] Removed unneeded details from whatsnew description 4b59745 [Morgan Stuart] Linting errors; additional test clarification 186607b [Morgan Stuart] BUG #16012 - fix isin for large object arrays --- doc/source/whatsnew/v0.21.0.txt | 5 +++-- pandas/core/algorithms.py | 5 ++++- pandas/tests/series/test_analytics.py | 12 ++++++++++++ 3 files changed, 19 insertions(+), 3 deletions(-) diff --git a/doc/source/whatsnew/v0.21.0.txt b/doc/source/whatsnew/v0.21.0.txt index 2259eb7d89534..c1133aee3b4a2 100644 --- a/doc/source/whatsnew/v0.21.0.txt +++ b/doc/source/whatsnew/v0.21.0.txt @@ -178,8 +178,6 @@ Performance Improvements Bug Fixes ~~~~~~~~~ -- Fixes regression in 0.20, :func:`Series.aggregate` and :func:`DataFrame.aggregate` allow dictionaries as return values again (:issue:`16741`) -- Fixes bug where indexing with ``np.inf`` caused an ``OverflowError`` to be raised (:issue:`16957`) Conversion ^^^^^^^^^^ @@ -193,6 +191,7 @@ Indexing - Fixes regression in 0.20.3 when indexing with a string on a ``TimedeltaIndex`` (:issue:`16896`). - Fixed :func:`TimedeltaIndex.get_loc` handling of ``np.timedelta64`` inputs (:issue:`16909`). - Fix :func:`MultiIndex.sort_index` ordering when ``ascending`` argument is a list, but not all levels are specified, or are in a different order (:issue:`16934`). +- Fixes bug where indexing with ``np.inf`` caused an ``OverflowError`` to be raised (:issue:`16957`) I/O ^^^ @@ -222,6 +221,8 @@ Sparse Reshaping ^^^^^^^^^ - Joining/Merging with a non unique ``PeriodIndex`` raised a TypeError (:issue:`16871`) +- Bug when using :func:`isin` on a large object series and large comparison array (:issue:`16012`) +- Fixes regression from 0.20, :func:`Series.aggregate` and :func:`DataFrame.aggregate` allow dictionaries as return values again (:issue:`16741`) Numeric diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index b490bf787a037..4ee2c54000fb6 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -402,7 +402,10 @@ def isin(comps, values): # work-around for numpy < 1.8 and comparisions on py3 # faster for larger cases to use np.in1d f = lambda x, y: htable.ismember_object(x, values) - if (_np_version_under1p8 and compat.PY3) or len(comps) > 1000000: + # GH16012 + # Ensure np.in1d doesn't get object types or it *may* throw an exception + if ((_np_version_under1p8 and compat.PY3) or len(comps) > 1000000 and + not is_object_dtype(comps)): f = lambda x, y: np.in1d(x, y) elif is_integer_dtype(comps): try: diff --git a/pandas/tests/series/test_analytics.py b/pandas/tests/series/test_analytics.py index 749af1c56a7f0..ab75dbf1b51cc 100644 --- a/pandas/tests/series/test_analytics.py +++ b/pandas/tests/series/test_analytics.py @@ -1092,6 +1092,18 @@ def test_isin(self): expected = Series([True, False, True, False, False, False, True, True]) assert_series_equal(result, expected) + # GH: 16012 + # This specific issue has to have a series over 1e6 in len, but the + # comparison array (in_list) must be large enough so that numpy doesn't + # do a manual masking trick that will avoid this issue altogether + s = Series(list('abcdefghijk' * 10 ** 5)) + # If numpy doesn't do the manual comparison/mask, these + # unorderable mixed types are what cause the exception in numpy + in_list = [-1, 'a', 'b', 'G', 'Y', 'Z', 'E', + 'K', 'E', 'S', 'I', 'R', 'R'] * 6 + + assert s.isin(in_list).sum() == 200000 + def test_isin_with_string_scalar(self): # GH4763 s = Series(['A', 'B', 'C', 'a', 'B', 'B', 'A', 'C'])