From e54c734556ac46fa085d65861fd411190747358c Mon Sep 17 00:00:00 2001 From: Chris Billington Date: Thu, 27 Jul 2017 13:48:55 -0400 Subject: [PATCH 1/5] Allow unicode empty strings to be used as placeholders in multilevel column names in Python 2 --- pandas/core/frame.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index e546e96f253c7..586d4318dd3df 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -2112,8 +2112,9 @@ def _getitem_multilevel(self, key): result = result.__finalize__(self) if len(result.columns) == 1: top = result.columns[0] - if ((type(top) == str and top == '') or - (type(top) == tuple and top[0] == '')): + if isinstance(top, tuple): + top = top[0] + if top == '': result = result[''] if isinstance(result, Series): result = self._constructor_sliced(result, From a798b70e6d540d71d2bd7f23369b73be365f61e5 Mon Sep 17 00:00:00 2001 From: Chris Billington Date: Thu, 27 Jul 2017 15:47:40 -0400 Subject: [PATCH 2/5] added test for unicode multilevel column placeholders --- pandas/tests/test_multilevel.py | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/pandas/tests/test_multilevel.py b/pandas/tests/test_multilevel.py index 0b2dc9ba70f03..f61078a00286e 100644 --- a/pandas/tests/test_multilevel.py +++ b/pandas/tests/test_multilevel.py @@ -1694,6 +1694,28 @@ def test_mixed_depth_get(self): tm.assert_series_equal(result, expected, check_names=False) assert result.name == ('routine1', 'result1') + def test_mixed_depth_get_unicode_placeholders_py2(self): + # Note this is only different to test_mixed_depth_get() on Python 2 + arrays = [[u('a'), u('top'), u('top'), + u('routine1'), u('routine1'), u('routine2')], + [u(''), u('OD'), u('OD'), + u('result1'), u('result2'), u('result1')], + [u(''), u('wx'), u('wy'), u(''), u(''), u('')]] + + tuples = sorted(zip(*arrays)) + index = MultiIndex.from_tuples(tuples) + df = DataFrame(randn(4, 6), columns=index) + + result 
= df['a'] + expected = df['a', '', ''] + tm.assert_series_equal(result, expected, check_names=False) + assert result.name == 'a' + + result = df['routine1', 'result1'] + expected = df['routine1', 'result1', ''] + tm.assert_series_equal(result, expected, check_names=False) + assert result.name == ('routine1', 'result1') + def test_mixed_depth_insert(self): arrays = [['a', 'top', 'top', 'routine1', 'routine1', 'routine2'], ['', 'OD', 'OD', 'result1', 'result2', 'result1'], From 63071bd5e1597655bd3c3d39ecc8f5dab49fb774 Mon Sep 17 00:00:00 2001 From: Chris Billington Date: Thu, 27 Jul 2017 22:23:52 -0400 Subject: [PATCH 3/5] Added whatsnew note, modified tests for mixed level columns based on pull request review --- doc/source/whatsnew/v0.21.0.txt | 1 + pandas/tests/test_multilevel.py | 25 ++++++++++++------------- 2 files changed, 13 insertions(+), 13 deletions(-) diff --git a/doc/source/whatsnew/v0.21.0.txt b/doc/source/whatsnew/v0.21.0.txt index 0025f8d098d81..5a5146a74375c 100644 --- a/doc/source/whatsnew/v0.21.0.txt +++ b/doc/source/whatsnew/v0.21.0.txt @@ -273,6 +273,7 @@ Indexing - Fixes bug where indexing with ``np.inf`` caused an ``OverflowError`` to be raised (:issue:`16957`) - Bug in reindexing on an empty ``CategoricalIndex`` (:issue:`16770`) - Fixes ``DataFrame.loc`` for setting with alignment and tz-aware ``DatetimeIndex`` (:issue:`16889`) +- Fixes getting a column by name when unicode empty strings are used as placeholders in multilevel columns in Python 2 (:issue:`17099`) I/O ^^^ diff --git a/pandas/tests/test_multilevel.py b/pandas/tests/test_multilevel.py index f61078a00286e..4c8976cc9324c 100644 --- a/pandas/tests/test_multilevel.py +++ b/pandas/tests/test_multilevel.py @@ -1682,20 +1682,20 @@ def test_mixed_depth_get(self): tuples = sorted(zip(*arrays)) index = MultiIndex.from_tuples(tuples) - df = DataFrame(randn(4, 6), columns=index) + df = DataFrame(np.random.randn(4, 6), columns=index) result = df['a'] - expected = df['a', '', ''] - 
tm.assert_series_equal(result, expected, check_names=False) - assert result.name == 'a' + expected = df['a', '', ''].rename('a') + tm.assert_series_equal(result, expected) result = df['routine1', 'result1'] expected = df['routine1', 'result1', ''] - tm.assert_series_equal(result, expected, check_names=False) - assert result.name == ('routine1', 'result1') + expected = expected.rename(('routine1', 'result1')) + tm.assert_series_equal(result, expected) def test_mixed_depth_get_unicode_placeholders_py2(self): - # Note this is only different to test_mixed_depth_get() on Python 2 + # Pull request #17099. This is only different to + # test_mixed_depth_get() on Python 2 arrays = [[u('a'), u('top'), u('top'), u('routine1'), u('routine1'), u('routine2')], [u(''), u('OD'), u('OD'), @@ -1704,17 +1704,16 @@ def test_mixed_depth_get_unicode_placeholders_py2(self): tuples = sorted(zip(*arrays)) index = MultiIndex.from_tuples(tuples) - df = DataFrame(randn(4, 6), columns=index) + df = DataFrame(np.random.randn(4, 6), columns=index) result = df['a'] - expected = df['a', '', ''] - tm.assert_series_equal(result, expected, check_names=False) - assert result.name == 'a' + expected = df['a', '', ''].rename('a') + tm.assert_series_equal(result, expected) result = df['routine1', 'result1'] expected = df['routine1', 'result1', ''] - tm.assert_series_equal(result, expected, check_names=False) - assert result.name == ('routine1', 'result1') + expected = expected.rename(('routine1', 'result1')) + tm.assert_series_equal(result, expected) def test_mixed_depth_insert(self): arrays = [['a', 'top', 'top', 'routine1', 'routine1', 'routine2'], From e7c9f97bb52cd9eb47602313981677d28756f9d1 Mon Sep 17 00:00:00 2001 From: Chris Billington Date: Sat, 29 Jul 2017 13:22:33 -0400 Subject: [PATCH 4/5] Dedup test, added descriptive comment to _getitem_multilevel(). 
--- pandas/core/frame.py | 7 +++++++ pandas/tests/test_multilevel.py | 31 ++++++++++--------------------- 2 files changed, 17 insertions(+), 21 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 586d4318dd3df..d5f1a4845cd42 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -2110,6 +2110,13 @@ def _getitem_multilevel(self, key): result = self._constructor(new_values, index=self.index, columns=result_columns) result = result.__finalize__(self) + + # If there is only one column being returned, and its name is + # either an empty string, or a tuple with an empty string as its + # first element, then treat the empty string as a placeholder + # and return the column as if the user had provided that empty + # string in the key. If the result is a Series, exclude the + # implied empty string from its name. if len(result.columns) == 1: top = result.columns[0] if isinstance(top, tuple): diff --git a/pandas/tests/test_multilevel.py b/pandas/tests/test_multilevel.py index 4c8976cc9324c..dca8fdef3f29f 100644 --- a/pandas/tests/test_multilevel.py +++ b/pandas/tests/test_multilevel.py @@ -1675,11 +1675,18 @@ def test_int_series_slicing(self): expected = self.ymd.reindex(s.index[5:]) tm.assert_frame_equal(result, expected) - def test_mixed_depth_get(self): + def test_mixed_depth_get(self, unicode_strings_py2=False): + # If unicode_strings_py2 is True, then the column labels in dataframe + # construction will use unicode strings in Python 2. In Python 3 they + # are unicode strings regardless. 
+ arrays = [['a', 'top', 'top', 'routine1', 'routine1', 'routine2'], ['', 'OD', 'OD', 'result1', 'result2', 'result1'], ['', 'wx', 'wy', '', '', '']] + if unicode_strings_py2: + arrays = [[u(s) for s in arr] for arr in arrays] + tuples = sorted(zip(*arrays)) index = MultiIndex.from_tuples(tuples) df = DataFrame(np.random.randn(4, 6), columns=index) @@ -1694,26 +1701,8 @@ def test_mixed_depth_get(self): tm.assert_series_equal(result, expected) def test_mixed_depth_get_unicode_placeholders_py2(self): - # Pull request #17099. This is only different to - # test_mixed_depth_get() on Python 2 - arrays = [[u('a'), u('top'), u('top'), - u('routine1'), u('routine1'), u('routine2')], - [u(''), u('OD'), u('OD'), - u('result1'), u('result2'), u('result1')], - [u(''), u('wx'), u('wy'), u(''), u(''), u('')]] - - tuples = sorted(zip(*arrays)) - index = MultiIndex.from_tuples(tuples) - df = DataFrame(np.random.randn(4, 6), columns=index) - - result = df['a'] - expected = df['a', '', ''].rename('a') - tm.assert_series_equal(result, expected) - - result = df['routine1', 'result1'] - expected = df['routine1', 'result1', ''] - expected = expected.rename(('routine1', 'result1')) - tm.assert_series_equal(result, expected) + # Pull request #17099. 
+ self.test_mixed_depth_get(unicode_strings_py2=True) def test_mixed_depth_insert(self): arrays = [['a', 'top', 'top', 'routine1', 'routine1', 'routine2'], From f1edf68bb99c7d7c9afeb3a76e8ecd97e6be4199 Mon Sep 17 00:00:00 2001 From: Chris Billington Date: Thu, 10 Aug 2017 00:54:31 -0400 Subject: [PATCH 5/5] parametrize test, simplified whatsnew note --- doc/source/whatsnew/v0.21.0.txt | 2 +- pandas/tests/test_multilevel.py | 15 ++++++--------- 2 files changed, 7 insertions(+), 10 deletions(-) diff --git a/doc/source/whatsnew/v0.21.0.txt b/doc/source/whatsnew/v0.21.0.txt index ec3dad0c02b28..f601c4e8a321b 100644 --- a/doc/source/whatsnew/v0.21.0.txt +++ b/doc/source/whatsnew/v0.21.0.txt @@ -323,7 +323,7 @@ Indexing - Bug in reindexing on an empty ``CategoricalIndex`` (:issue:`16770`) - Fixes ``DataFrame.loc`` for setting with alignment and tz-aware ``DatetimeIndex`` (:issue:`16889`) - Avoids ``IndexError`` when passing an Index or Series to ``.iloc`` with older numpy (:issue:`17193`) -- Fixes getting a column by name when unicode empty strings are used as placeholders in multilevel columns in Python 2 (:issue:`17099`) +- Allow unicode empty strings as placeholders in multilevel columns in Python 2 (:issue:`17099`) I/O ^^^ diff --git a/pandas/tests/test_multilevel.py b/pandas/tests/test_multilevel.py index dca8fdef3f29f..a765e2c4ca1bf 100644 --- a/pandas/tests/test_multilevel.py +++ b/pandas/tests/test_multilevel.py @@ -1675,16 +1675,17 @@ def test_int_series_slicing(self): expected = self.ymd.reindex(s.index[5:]) tm.assert_frame_equal(result, expected) - def test_mixed_depth_get(self, unicode_strings_py2=False): - # If unicode_strings_py2 is True, then the column labels in dataframe - # construction will use unicode strings in Python 2. In Python 3 they - # are unicode strings regardless. 
+ @pytest.mark.parametrize('unicode_strings', [True, False]) + def test_mixed_depth_get(self, unicode_strings): + # If unicode_strings is True, the column labels in dataframe + # construction will use unicode strings in Python 2 (pull request + # #17099). arrays = [['a', 'top', 'top', 'routine1', 'routine1', 'routine2'], ['', 'OD', 'OD', 'result1', 'result2', 'result1'], ['', 'wx', 'wy', '', '', '']] - if unicode_strings_py2: + if unicode_strings: arrays = [[u(s) for s in arr] for arr in arrays] tuples = sorted(zip(*arrays)) @@ -1700,10 +1701,6 @@ def test_mixed_depth_get(self, unicode_strings_py2=False): expected = expected.rename(('routine1', 'result1')) tm.assert_series_equal(result, expected) - def test_mixed_depth_get_unicode_placeholders_py2(self): - # Pull request #17099. - self.test_mixed_depth_get(unicode_strings_py2=True) - def test_mixed_depth_insert(self): arrays = [['a', 'top', 'top', 'routine1', 'routine1', 'routine2'], ['', 'OD', 'OD', 'result1', 'result2', 'result1'],