From df5bfcf623a181d8cfae1241f269f172deb8abc8 Mon Sep 17 00:00:00 2001 From: ri938 Date: Mon, 3 Jul 2017 21:10:28 +0100 Subject: [PATCH 1/8] BUG: reindex would throw when a categorical index was empty #16770 --- doc/source/whatsnew/v0.20.3.txt | 2 +- pandas/core/indexes/category.py | 9 +++++++-- pandas/tests/indexes/test_category.py | 8 ++++++++ 3 files changed, 16 insertions(+), 3 deletions(-) diff --git a/doc/source/whatsnew/v0.20.3.txt b/doc/source/whatsnew/v0.20.3.txt index c730142450ea6..636f36d4fe3cc 100644 --- a/doc/source/whatsnew/v0.20.3.txt +++ b/doc/source/whatsnew/v0.20.3.txt @@ -37,7 +37,7 @@ Performance Improvements Bug Fixes ~~~~~~~~~ - Fixed issue with dataframe scatter plot for categorical data that reports incorrect column key not found when categorical data is used for plotting (:issue:`16199`) - +- Handle reindexing an empty categorical index rather than throwing (:issue:`16770`) diff --git a/pandas/core/indexes/category.py b/pandas/core/indexes/category.py index d9e0c218bfafc..da3c25eb473c4 100644 --- a/pandas/core/indexes/category.py +++ b/pandas/core/indexes/category.py @@ -419,7 +419,11 @@ def reindex(self, target, method=None, level=None, limit=None, raise ValueError("cannot reindex with a non-unique indexer") indexer, missing = self.get_indexer_non_unique(np.array(target)) - new_target = self.take(indexer) + + if len(self.codes): + new_target = self.take(indexer) + else: + new_target = target # filling in missing if needed if len(missing): @@ -430,7 +434,8 @@ def reindex(self, target, method=None, level=None, limit=None, result = Index(np.array(self), name=self.name) new_target, indexer, _ = result._reindex_non_unique( np.array(target)) - + # see GH 16819, indexer needs to be converted to correct type + indexer = np.array(indexer, dtype=np.int64) else: codes = new_target.codes.copy() diff --git a/pandas/tests/indexes/test_category.py b/pandas/tests/indexes/test_category.py index 4e4f9b29f9a4c..139ec49542abe 100644 --- 
a/pandas/tests/indexes/test_category.py +++ b/pandas/tests/indexes/test_category.py @@ -420,6 +420,14 @@ def test_reindex_dtype(self): tm.assert_numpy_array_equal(indexer, np.array([0, 3, 2], dtype=np.int64)) + def test_reindex_empty_index(self): + # See GH16770 + c = CategoricalIndex([]) + res, indexer = c.reindex(['a', 'b']) + tm.assert_index_equal(res, Index(['a', 'b']), exact=True) + tm.assert_numpy_array_equal(indexer, + np.array([-1, -1], dtype=np.int64)) + def test_duplicates(self): idx = CategoricalIndex([0, 0, 0], name='foo') From 6b5bd715ce26e426e1f6659382b7142ca27291e7 Mon Sep 17 00:00:00 2001 From: ri938 Date: Tue, 4 Jul 2017 17:56:30 +0100 Subject: [PATCH 2/8] BUG: get_indexer_non_unique inconsistent return types vs get_indexer #16819 --- doc/source/whatsnew/v0.21.0.txt | 1 + pandas/core/indexes/base.py | 2 +- pandas/tests/indexes/test_base.py | 11 +++++++++++ pandas/tests/indexes/test_category.py | 3 +-- 4 files changed, 14 insertions(+), 3 deletions(-) diff --git a/doc/source/whatsnew/v0.21.0.txt b/doc/source/whatsnew/v0.21.0.txt index de2516d75040b..4e9dbe1b32ee5 100644 --- a/doc/source/whatsnew/v0.21.0.txt +++ b/doc/source/whatsnew/v0.21.0.txt @@ -91,6 +91,7 @@ Performance Improvements Bug Fixes ~~~~~~~~~ +- Bug in get_indexer_non_unique inconsistent return type with get_indexer (:issue:`16819`) Conversion ^^^^^^^^^^ diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 695f9f119baa2..91ff308579146 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -2704,7 +2704,7 @@ def get_indexer_non_unique(self, target): tgt_values = target._values indexer, missing = self._engine.get_indexer_non_unique(tgt_values) - return Index(indexer), missing + return indexer, missing def get_indexer_for(self, target, **kwargs): """ diff --git a/pandas/tests/indexes/test_base.py b/pandas/tests/indexes/test_base.py index 18dbe6624008a..7a81a125467d5 100644 --- a/pandas/tests/indexes/test_base.py +++
b/pandas/tests/indexes/test_base.py @@ -1131,6 +1131,17 @@ def test_get_indexer_strings(self): with pytest.raises(TypeError): idx.get_indexer(['a', 'b', 'c', 'd'], method='pad', tolerance=2) + def test_get_indexer_consistency(self): + # See GH 16819 + for name, index in self.indices.items(): + indexer = index.get_indexer(index[0:2]) + assert isinstance(indexer, np.ndarray) + assert indexer.dtype == np.intp + + indexer, _ = index.get_indexer_non_unique(index[0:2]) + assert isinstance(indexer, np.ndarray) + assert indexer.dtype == np.intp + def test_get_loc(self): idx = pd.Index([0, 1, 2]) all_methods = [None, 'pad', 'backfill', 'nearest'] diff --git a/pandas/tests/indexes/test_category.py b/pandas/tests/indexes/test_category.py index 139ec49542abe..c6bfae2804adc 100644 --- a/pandas/tests/indexes/test_category.py +++ b/pandas/tests/indexes/test_category.py @@ -386,8 +386,7 @@ def test_reindexing(self): expected = oidx.get_indexer_non_unique(finder)[0] actual = ci.get_indexer(finder) - tm.assert_numpy_array_equal( - expected.values, actual, check_dtype=False) + tm.assert_numpy_array_equal(expected, actual, check_dtype=True) def test_reindex_dtype(self): c = CategoricalIndex(['a', 'b', 'c', 'a']) From e32df12c794a2adf6d4d342eb37d6adfba74da93 Mon Sep 17 00:00:00 2001 From: ri938 Date: Tue, 4 Jul 2017 23:45:39 +0100 Subject: [PATCH 3/8] Remove mistaken code which belonged on another branch (minor correction) --- doc/source/whatsnew/v0.20.3.txt | 1 - pandas/core/indexes/category.py | 8 +------- pandas/tests/indexes/test_category.py | 8 -------- 3 files changed, 1 insertion(+), 16 deletions(-) diff --git a/doc/source/whatsnew/v0.20.3.txt b/doc/source/whatsnew/v0.20.3.txt index 636f36d4fe3cc..e9bd5b04a5596 100644 --- a/doc/source/whatsnew/v0.20.3.txt +++ b/doc/source/whatsnew/v0.20.3.txt @@ -37,7 +37,6 @@ Performance Improvements Bug Fixes ~~~~~~~~~ - Fixed issue with dataframe scatter plot for categorical data that reports incorrect column key not found when categorical 
data is used for plotting (:issue:`16199`) -- Handle reindexing an empty categorical index rather than throwing (:issue:`16770`) diff --git a/pandas/core/indexes/category.py b/pandas/core/indexes/category.py index da3c25eb473c4..cb7deba0415d4 100644 --- a/pandas/core/indexes/category.py +++ b/pandas/core/indexes/category.py @@ -419,11 +419,7 @@ def reindex(self, target, method=None, level=None, limit=None, raise ValueError("cannot reindex with a non-unique indexer") indexer, missing = self.get_indexer_non_unique(np.array(target)) - - if len(self.codes): - new_target = self.take(indexer) - else: - new_target = target + new_target = self.take(indexer) # filling in missing if needed if len(missing): @@ -434,8 +430,6 @@ def reindex(self, target, method=None, level=None, limit=None, result = Index(np.array(self), name=self.name) new_target, indexer, _ = result._reindex_non_unique( np.array(target)) - # see GH 16819, indexer needs to be converted to correct type - indexer = np.array(indexer, dtype=np.int64) else: codes = new_target.codes.copy() diff --git a/pandas/tests/indexes/test_category.py b/pandas/tests/indexes/test_category.py index c6bfae2804adc..40af125ed65b3 100644 --- a/pandas/tests/indexes/test_category.py +++ b/pandas/tests/indexes/test_category.py @@ -419,14 +419,6 @@ def test_reindex_dtype(self): tm.assert_numpy_array_equal(indexer, np.array([0, 3, 2], dtype=np.int64)) - def test_reindex_empty_index(self): - # See GH16770 - c = CategoricalIndex([]) - res, indexer = c.reindex(['a', 'b']) - tm.assert_index_equal(res, Index(['a', 'b']), exact=True) - tm.assert_numpy_array_equal(indexer, - np.array([-1, -1], dtype=np.int64)) - def test_duplicates(self): idx = CategoricalIndex([0, 0, 0], name='foo') From 7e650bde2e408fe244c8685344816d551882e423 Mon Sep 17 00:00:00 2001 From: ri938 Date: Tue, 4 Jul 2017 23:48:26 +0100 Subject: [PATCH 4/8] Minor removing blank lines inserted in minor correction --- doc/source/whatsnew/v0.20.3.txt | 1 +
pandas/core/indexes/category.py | 1 + 2 files changed, 2 insertions(+) diff --git a/doc/source/whatsnew/v0.20.3.txt b/doc/source/whatsnew/v0.20.3.txt index e9bd5b04a5596..c730142450ea6 100644 --- a/doc/source/whatsnew/v0.20.3.txt +++ b/doc/source/whatsnew/v0.20.3.txt @@ -40,6 +40,7 @@ Bug Fixes + Conversion ^^^^^^^^^^ diff --git a/pandas/core/indexes/category.py b/pandas/core/indexes/category.py index cb7deba0415d4..d9e0c218bfafc 100644 --- a/pandas/core/indexes/category.py +++ b/pandas/core/indexes/category.py @@ -430,6 +430,7 @@ def reindex(self, target, method=None, level=None, limit=None, result = Index(np.array(self), name=self.name) new_target, indexer, _ = result._reindex_non_unique( np.array(target)) + else: codes = new_target.codes.copy() From b7106263d7ef6631051eb5b52549f97da970c240 Mon Sep 17 00:00:00 2001 From: ri938 Date: Wed, 5 Jul 2017 23:24:37 +0100 Subject: [PATCH 5/8] Fix bugs caused by code assuming return type of get_indexer_non_unique is Index --- doc/source/whatsnew/v0.21.0.txt | 2 +- pandas/core/indexes/base.py | 7 +++---- 2 files changed, 4 insertions(+), 5 deletions(-) diff --git a/doc/source/whatsnew/v0.21.0.txt b/doc/source/whatsnew/v0.21.0.txt index 4e9dbe1b32ee5..1b9ac11600d91 100644 --- a/doc/source/whatsnew/v0.21.0.txt +++ b/doc/source/whatsnew/v0.21.0.txt @@ -53,6 +53,7 @@ Backwards incompatible API changes - :func:`read_csv` now treats ``'n/a'`` strings as missing values by default (:issue:`16078`) - :class:`pandas.HDFStore`'s string representation is now faster and less detailed. For the previous behavior, use ``pandas.HDFStore.info()``. (:issue:`16503`). - Compression defaults in HDF stores now follow pytable standards. Default is no compression and if ``complib`` is missing and ``complevel`` > 0 ``zlib`` is used (:issue:`15943`) +- Index.get_indexer_non_unique() now returns a ndarray indexer rather than an Index; this is consistent with Index.get_indexer() (:issue:`16819`) ..
_whatsnew_0210.api: @@ -91,7 +92,6 @@ Performance Improvements Bug Fixes ~~~~~~~~~ -- Bug in get_indexer_non_unique inconsistent return type with get_indexer (:issue:`16819`) Conversion ^^^^^^^^^^ diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 91ff308579146..fb83b00aecf7a 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -2256,15 +2256,15 @@ def intersection(self, other): indexer = indexer.take((indexer != -1).nonzero()[0]) except: # duplicates - indexer = Index(other._values).get_indexer_non_unique( - self._values)[0].unique() + indexer = algos.unique1d(Index(other._values).get_indexer_non_unique( + self._values)[0]) indexer = indexer[indexer != -1] taken = other.take(indexer) if self.name != other.name: taken.name = None return taken - + def difference(self, other): """ Return a new Index with elements from the index that are not in @@ -2942,7 +2942,6 @@ def _reindex_non_unique(self, target): else: # need to retake to have the same size as the indexer - indexer = indexer.values indexer[~check] = 0 # reset the new indexer to account for the new size From d3a77be8b21f48d844eee15bf65973416a102141 Mon Sep 17 00:00:00 2001 From: ri938 Date: Thu, 6 Jul 2017 11:31:12 +0100 Subject: [PATCH 6/8] Fix issues caused by code still assuming return type of get_indexer_non_unique is an Index --- doc/source/whatsnew/v0.21.0.txt | 2 +- pandas/core/groupby.py | 4 ++-- pandas/core/indexes/base.py | 6 +++--- pandas/tests/indexes/test_category.py | 2 +- 4 files changed, 7 insertions(+), 7 deletions(-) diff --git a/doc/source/whatsnew/v0.21.0.txt b/doc/source/whatsnew/v0.21.0.txt index 1b9ac11600d91..5dad5af3b2794 100644 --- a/doc/source/whatsnew/v0.21.0.txt +++ b/doc/source/whatsnew/v0.21.0.txt @@ -53,7 +53,7 @@ Backwards incompatible API changes - :func:`read_csv` now treats ``'n/a'`` strings as missing values by default (:issue:`16078`) - :class:`pandas.HDFStore`'s string representation is now faster and less detailed. 
For the previous behavior, use ``pandas.HDFStore.info()``. (:issue:`16503`). - Compression defaults in HDF stores now follow pytable standards. Default is no compression and if ``complib`` is missing and ``complevel`` > 0 ``zlib`` is used (:issue:`15943`) -- Index.get_indexer_non_unique() now returns a ndarray indexer rather than an Index; this is consistent with Index.get_indexer() (:issue:`16819`) +- `Index.get_indexer_non_unique()` now returns a ndarray indexer rather than an `Index`; this is consistent with `Index.get_indexer()` (:issue:`16819`) .. _whatsnew_0210.api: diff --git a/pandas/core/groupby.py b/pandas/core/groupby.py index c4b3e25acae7e..8ee7d86401083 100644 --- a/pandas/core/groupby.py +++ b/pandas/core/groupby.py @@ -896,8 +896,8 @@ def reset_identity(values): # we can't reindex, so we resort to this # GH 14776 if isinstance(ax, MultiIndex) and not ax.is_unique: - result = result.take(result.index.get_indexer_for( - ax.values).unique(), axis=self.axis) + result = result.take(algorithms.unique1d(result.index.get_indexer_for( + ax.values)), axis=self.axis) else: result = result.reindex_axis(ax, axis=self.axis) diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index fb83b00aecf7a..8a4878d9cfbcf 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -2256,15 +2256,15 @@ def intersection(self, other): indexer = indexer.take((indexer != -1).nonzero()[0]) except: # duplicates - indexer = algos.unique1d(Index(other._values).get_indexer_non_unique( - self._values)[0]) + indexer = algos.unique1d( + Index(other._values).get_indexer_non_unique(self._values)[0]) indexer = indexer[indexer != -1] taken = other.take(indexer) if self.name != other.name: taken.name = None return taken - + def difference(self, other): """ Return a new Index with elements from the index that are not in diff --git a/pandas/tests/indexes/test_category.py b/pandas/tests/indexes/test_category.py index 40af125ed65b3..493274fff43e0 100644 --- 
a/pandas/tests/indexes/test_category.py +++ b/pandas/tests/indexes/test_category.py @@ -386,7 +386,7 @@ def test_reindexing(self): expected = oidx.get_indexer_non_unique(finder)[0] actual = ci.get_indexer(finder) - tm.assert_numpy_array_equal(expected, actual, check_dtype=True) + tm.assert_numpy_array_equal(expected, actual) def test_reindex_dtype(self): c = CategoricalIndex(['a', 'b', 'c', 'a']) From 05cb9d3bdc3bcf8991d4984cca0b4599e3375677 Mon Sep 17 00:00:00 2001 From: ri938 Date: Thu, 6 Jul 2017 11:39:04 +0100 Subject: [PATCH 7/8] Minor documentation change --- doc/source/whatsnew/v0.21.0.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v0.21.0.txt b/doc/source/whatsnew/v0.21.0.txt index 5dad5af3b2794..36f3db98a39b5 100644 --- a/doc/source/whatsnew/v0.21.0.txt +++ b/doc/source/whatsnew/v0.21.0.txt @@ -53,7 +53,7 @@ Backwards incompatible API changes - :func:`read_csv` now treats ``'n/a'`` strings as missing values by default (:issue:`16078`) - :class:`pandas.HDFStore`'s string representation is now faster and less detailed. For the previous behavior, use ``pandas.HDFStore.info()``. (:issue:`16503`). - Compression defaults in HDF stores now follow pytable standards. Default is no compression and if ``complib`` is missing and ``complevel`` > 0 ``zlib`` is used (:issue:`15943`) -- `Index.get_indexer_non_unique()` now returns a ndarray indexer rather than an `Index`; this is consistent with `Index.get_indexer()` (:issue:`16819`) +- ``Index.get_indexer_non_unique()`` now returns a ndarray indexer rather than an ``Index``; this is consistent with ``Index.get_indexer()`` (:issue:`16819`) .. _whatsnew_0210.api: From e1b85cb0b35b42c82872096e0127785b04d610e0 Mon Sep 17 00:00:00 2001 From: ri938 Date: Thu, 6 Jul 2017 12:24:16 +0100 Subject: [PATCH 8/8] Minor correction: line too long. 
--- pandas/core/groupby.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/pandas/core/groupby.py b/pandas/core/groupby.py index 8ee7d86401083..daf3381ae4e89 100644 --- a/pandas/core/groupby.py +++ b/pandas/core/groupby.py @@ -896,8 +896,9 @@ def reset_identity(values): # we can't reindex, so we resort to this # GH 14776 if isinstance(ax, MultiIndex) and not ax.is_unique: - result = result.take(algorithms.unique1d(result.index.get_indexer_for( - ax.values)), axis=self.axis) + indexer = algorithms.unique1d( + result.index.get_indexer_for(ax.values)) + result = result.take(indexer, axis=self.axis) else: result = result.reindex_axis(ax, axis=self.axis)