From 421dcf423f3979e82e538dbd1dd9008bef36ddef Mon Sep 17 00:00:00 2001 From: Chris Billington Date: Thu, 10 Aug 2017 06:36:50 -0400 Subject: [PATCH] Bugfix for multilevel columns with empty strings in Python 2 (#17099) --- doc/source/whatsnew/v0.21.0.txt | 1 + pandas/core/frame.py | 12 ++++++++++-- pandas/tests/test_multilevel.py | 21 ++++++++++++++------- 3 files changed, 25 insertions(+), 9 deletions(-) diff --git a/doc/source/whatsnew/v0.21.0.txt b/doc/source/whatsnew/v0.21.0.txt index ac0e960a348b6..2f2089b4f8ad7 100644 --- a/doc/source/whatsnew/v0.21.0.txt +++ b/doc/source/whatsnew/v0.21.0.txt @@ -324,6 +324,7 @@ Indexing - Bug in reindexing on an empty ``CategoricalIndex`` (:issue:`16770`) - Fixes ``DataFrame.loc`` for setting with alignment and tz-aware ``DatetimeIndex`` (:issue:`16889`) - Avoids ``IndexError`` when passing an Index or Series to ``.iloc`` with older numpy (:issue:`17193`) +- Allow unicode empty strings as placeholders in multilevel columns in Python 2 (:issue:`17099`) I/O ^^^ diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 027a427555253..94cce1b4d05b5 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -2134,10 +2134,18 @@ def _getitem_multilevel(self, key): result = self._constructor(new_values, index=self.index, columns=result_columns) result = result.__finalize__(self) + + # If there is only one column being returned, and its name is + # either an empty string, or a tuple with an empty string as its + # first element, then treat the empty string as a placeholder + # and return the column as if the user had provided that empty + # string in the key. If the result is a Series, exclude the + # implied empty string from its name. if len(result.columns) == 1: top = result.columns[0] - if ((type(top) == str and top == '') or - (type(top) == tuple and top[0] == '')): + if isinstance(top, tuple): + top = top[0] + if top == '': result = result[''] if isinstance(result, Series): result = self._constructor_sliced(result, diff --git a/pandas/tests/test_multilevel.py b/pandas/tests/test_multilevel.py index 0b2dc9ba70f03..a765e2c4ca1bf 100644 --- a/pandas/tests/test_multilevel.py +++ b/pandas/tests/test_multilevel.py @@ -1675,24 +1675,31 @@ def test_int_series_slicing(self): expected = self.ymd.reindex(s.index[5:]) tm.assert_frame_equal(result, expected) - def test_mixed_depth_get(self): + @pytest.mark.parametrize('unicode_strings', [True, False]) + def test_mixed_depth_get(self, unicode_strings): + # If unicode_strings is True, the column labels in dataframe + # construction will use unicode strings in Python 2 (pull request + # #17099). + arrays = [['a', 'top', 'top', 'routine1', 'routine1', 'routine2'], ['', 'OD', 'OD', 'result1', 'result2', 'result1'], ['', 'wx', 'wy', '', '', '']] + if unicode_strings: + arrays = [[u(s) for s in arr] for arr in arrays] + tuples = sorted(zip(*arrays)) index = MultiIndex.from_tuples(tuples) - df = DataFrame(randn(4, 6), columns=index) + df = DataFrame(np.random.randn(4, 6), columns=index) result = df['a'] - expected = df['a', '', ''] - tm.assert_series_equal(result, expected, check_names=False) - assert result.name == 'a' + expected = df['a', '', ''].rename('a') + tm.assert_series_equal(result, expected) result = df['routine1', 'result1'] expected = df['routine1', 'result1', ''] - tm.assert_series_equal(result, expected, check_names=False) - assert result.name == ('routine1', 'result1') + expected = expected.rename(('routine1', 'result1')) + tm.assert_series_equal(result, expected) def test_mixed_depth_insert(self): arrays = [['a', 'top', 'top', 'routine1', 'routine1', 'routine2'],