From 7f440270fd210144bdd036cc98807623db000a0f Mon Sep 17 00:00:00 2001 From: MeeseeksMachine <39504233+meeseeksmachine@users.noreply.github.com> Date: Thu, 7 Feb 2019 18:57:54 -0800 Subject: [PATCH] Backport PR #25182: BUG: Fix Series.is_unique with single occurrence of NaN (#25223) --- doc/source/whatsnew/v0.24.2.rst | 2 +- pandas/core/base.py | 2 +- pandas/tests/indexes/common.py | 21 +++++++++++++++++++ .../tests/indexes/interval/test_interval.py | 19 ++++------------- pandas/tests/indexes/multi/test_duplicates.py | 12 +++++++++++ pandas/tests/indexes/period/test_indexing.py | 12 ----------- pandas/tests/indexes/test_category.py | 9 -------- pandas/tests/series/test_duplicates.py | 18 ++++++++++------ 8 files changed, 51 insertions(+), 44 deletions(-) diff --git a/doc/source/whatsnew/v0.24.2.rst b/doc/source/whatsnew/v0.24.2.rst index 6a9a316da1ec6..73df504c89d5b 100644 --- a/doc/source/whatsnew/v0.24.2.rst +++ b/doc/source/whatsnew/v0.24.2.rst @@ -87,7 +87,7 @@ Bug Fixes **Other** -- +- Bug in :meth:`Series.is_unique` where single occurrences of ``NaN`` were not considered unique (:issue:`25180`) - - diff --git a/pandas/core/base.py b/pandas/core/base.py index c02ba88ea7fda..79b869c51d48f 100644 --- a/pandas/core/base.py +++ b/pandas/core/base.py @@ -1345,7 +1345,7 @@ def is_unique(self): ------- is_unique : boolean """ - return self.nunique() == len(self) + return self.nunique(dropna=False) == len(self) @property def is_monotonic(self): diff --git a/pandas/tests/indexes/common.py b/pandas/tests/indexes/common.py index e0259b83a8768..24207da416bb1 100644 --- a/pandas/tests/indexes/common.py +++ b/pandas/tests/indexes/common.py @@ -905,3 +905,24 @@ def test_astype_category(self, copy, name, ordered): result = index.astype('category', copy=copy) expected = CategoricalIndex(index.values, name=name) tm.assert_index_equal(result, expected) + + def test_is_unique(self): + # initialize a unique index + index = self.create_index().drop_duplicates() + assert index.is_unique is True + + # empty index should be unique + index_empty = index[:0] + assert index_empty.is_unique is True + + # test basic dupes + index_dup = index.insert(0, index[0]) + assert index_dup.is_unique is False + + # single NA should be unique + index_na = index.insert(0, np.nan) + assert index_na.is_unique is True + + # multiple NA should not be unique + index_na_dup = index_na.insert(0, np.nan) + assert index_na_dup.is_unique is False diff --git a/pandas/tests/indexes/interval/test_interval.py b/pandas/tests/indexes/interval/test_interval.py index f1fd06c9cab6e..e4f25ff143273 100644 --- a/pandas/tests/indexes/interval/test_interval.py +++ b/pandas/tests/indexes/interval/test_interval.py @@ -242,12 +242,10 @@ def test_take(self, closed): [0, 0, 1], [1, 1, 2], closed=closed) tm.assert_index_equal(result, expected) - def test_unique(self, closed): - # unique non-overlapping - idx = IntervalIndex.from_tuples( - [(0, 1), (2, 3), (4, 5)], closed=closed) - assert idx.is_unique is True - + def test_is_unique_interval(self, closed): + """ + Interval specific tests for is_unique in addition to base class tests + """ # unique overlapping - distinct endpoints idx = IntervalIndex.from_tuples([(0, 1), (0.5, 1.5)], closed=closed) assert idx.is_unique is True @@ -261,15 +259,6 @@ def test_unique(self, closed): idx = IntervalIndex.from_tuples([(-1, 1), (-2, 2)], closed=closed) assert idx.is_unique is True - # duplicate - idx = IntervalIndex.from_tuples( - [(0, 1), (0, 1), (2, 3)], closed=closed) - assert idx.is_unique is False - - # empty - idx = IntervalIndex([], closed=closed) - assert idx.is_unique is True - def test_monotonic(self, closed): # increasing non-overlapping idx = IntervalIndex.from_tuples( diff --git a/pandas/tests/indexes/multi/test_duplicates.py b/pandas/tests/indexes/multi/test_duplicates.py index af15026de2b34..35034dc57b4b8 100644 --- a/pandas/tests/indexes/multi/test_duplicates.py +++ b/pandas/tests/indexes/multi/test_duplicates.py @@ -143,6 +143,18 @@ def test_has_duplicates(idx, idx_dup): assert mi.is_unique is False assert mi.has_duplicates is True + # single instance of NaN + mi_nan = MultiIndex(levels=[['a', 'b'], [0, 1]], + codes=[[-1, 0, 0, 1, 1], [-1, 0, 1, 0, 1]]) + assert mi_nan.is_unique is True + assert mi_nan.has_duplicates is False + + # multiple instances of NaN + mi_nan_dup = MultiIndex(levels=[['a', 'b'], [0, 1]], + codes=[[-1, -1, 0, 0, 1, 1], [-1, -1, 0, 1, 0, 1]]) + assert mi_nan_dup.is_unique is False + assert mi_nan_dup.has_duplicates is True + def test_has_duplicates_from_tuples(): # GH 9075 diff --git a/pandas/tests/indexes/period/test_indexing.py b/pandas/tests/indexes/period/test_indexing.py index 47c2edfd13395..d6ce4d5e3576e 100644 --- a/pandas/tests/indexes/period/test_indexing.py +++ b/pandas/tests/indexes/period/test_indexing.py @@ -441,18 +441,6 @@ def test_is_monotonic_decreasing(self): assert idx_dec1.is_monotonic_decreasing is True assert idx.is_monotonic_decreasing is False - def test_is_unique(self): - # GH 17717 - p0 = pd.Period('2017-09-01') - p1 = pd.Period('2017-09-02') - p2 = pd.Period('2017-09-03') - - idx0 = pd.PeriodIndex([p0, p1, p2]) - assert idx0.is_unique is True - - idx1 = pd.PeriodIndex([p1, p1, p2]) - assert idx1.is_unique is False - def test_contains(self): # GH 17717 p0 = pd.Period('2017-09-01') diff --git a/pandas/tests/indexes/test_category.py b/pandas/tests/indexes/test_category.py index 582d466c6178e..d889135160ae2 100644 --- a/pandas/tests/indexes/test_category.py +++ b/pandas/tests/indexes/test_category.py @@ -611,15 +611,6 @@ def test_is_monotonic(self, data, non_lexsorted_data): assert c.is_monotonic_increasing is True assert c.is_monotonic_decreasing is False - @pytest.mark.parametrize('values, expected', [ - ([1, 2, 3], True), - ([1, 3, 1], False), - (list('abc'), True), - (list('aba'), False)]) - def test_is_unique(self, values, expected): - ci = CategoricalIndex(values) - assert ci.is_unique is expected - def test_has_duplicates(self): idx = CategoricalIndex([0, 0, 0], name='foo') diff --git a/pandas/tests/series/test_duplicates.py b/pandas/tests/series/test_duplicates.py index fe47975711a17..a975edacc19c7 100644 --- a/pandas/tests/series/test_duplicates.py +++ b/pandas/tests/series/test_duplicates.py @@ -59,12 +59,18 @@ def test_unique_data_ownership(): Series(Series(["a", "c", "b"]).unique()).sort_values() -def test_is_unique(): - # GH11946 - s = Series(np.random.randint(0, 10, size=1000)) - assert s.is_unique is False - s = Series(np.arange(1000)) - assert s.is_unique is True +@pytest.mark.parametrize('data, expected', [ + (np.random.randint(0, 10, size=1000), False), + (np.arange(1000), True), + ([], True), + ([np.nan], True), + (['foo', 'bar', np.nan], True), + (['foo', 'foo', np.nan], False), + (['foo', 'bar', np.nan, np.nan], False)]) +def test_is_unique(data, expected): + # GH11946 / GH25180 + s = Series(data) + assert s.is_unique is expected def test_is_unique_class_ne(capsys):