From c7bb13f19825aa4c50cd422bf383b48975a0f939 Mon Sep 17 00:00:00 2001 From: Licht-T Date: Sun, 15 Oct 2017 17:31:44 +0900 Subject: [PATCH 1/3] BUG: Fix wrong column selection in drop_duplicates when duplicate column names --- pandas/core/frame.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 97943f153319b..0341c60670aa0 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -3556,7 +3556,8 @@ def f(vals): isinstance(subset, tuple) and subset in self.columns): subset = subset, - vals = (self[col].values for col in subset) + vals = (col.values for name, col in self.iteritems() + if name in subset) labels, shape = map(list, zip(*map(f, vals))) ids = get_group_index(labels, shape, sort=False, xnull=False) From 41dc08fb7ab32f32cff6b8be89734dc42e51651c Mon Sep 17 00:00:00 2001 From: Licht-T Date: Sun, 15 Oct 2017 17:35:23 +0900 Subject: [PATCH 2/3] TST: Add test of drop_duplicates with duplicate column names --- pandas/tests/frame/test_analytics.py | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/pandas/tests/frame/test_analytics.py b/pandas/tests/frame/test_analytics.py index c36b5957a4283..1bac4037e99c9 100644 --- a/pandas/tests/frame/test_analytics.py +++ b/pandas/tests/frame/test_analytics.py @@ -1394,6 +1394,21 @@ def test_drop_duplicates(self): for keep in ['first', 'last', False]: assert df.duplicated(keep=keep).sum() == 0 + def test_drop_duplicates_with_duplicate_column_names(self): + # GH17836 + df = DataFrame([ + [1, 2, 5], + [3, 4, 6], + [3, 4, 7] + ], columns=['a', 'a', 'b']) + + result0 = df.drop_duplicates() + tm.assert_frame_equal(result0, df) + + result1 = df.drop_duplicates('a') + expected1 = df[:2] + tm.assert_frame_equal(result1, expected1) + def test_drop_duplicates_for_take_all(self): df = DataFrame({'AAA': ['foo', 'bar', 'baz', 'bar', 'foo', 'bar', 'qux', 'foo'], From 30c7e1b0bf8d651ef47bf5ce7c17db2f3b76452a Mon Sep 17 00:00:00 2001 From: Licht-T 
Date: Mon, 16 Oct 2017 00:46:41 +0900 Subject: [PATCH 3/3] DOC: Add whatsnew note for fixing the bug of drop_duplicates --- doc/source/whatsnew/v0.21.0.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/doc/source/whatsnew/v0.21.0.txt b/doc/source/whatsnew/v0.21.0.txt index 117e7c9d11259..09217b9cb81be 100644 --- a/doc/source/whatsnew/v0.21.0.txt +++ b/doc/source/whatsnew/v0.21.0.txt @@ -1006,6 +1006,7 @@ Reshaping - Bug in :func:`concat` where order of result index was unpredictable if it contained non-comparable elements (:issue:`17344`) - Fixes regression when sorting by multiple columns on a ``datetime64`` dtype ``Series`` with ``NaT`` values (:issue:`16836`) - Bug in :func:`pivot_table` where the result's columns did not preserve the categorical dtype of ``columns`` when ``dropna`` was ``False`` (:issue:`17842`) +- Bug in :func:`DataFrame.drop_duplicates` where dropping with non-unique column names raised a ``ValueError`` (:issue:`17836`) Numeric ^^^^^^^