From 06b52ab9917e96208f1a98841a245fd20e60dbb7 Mon Sep 17 00:00:00 2001 From: Scorpil Date: Fri, 27 Jul 2018 17:37:46 +0200 Subject: [PATCH 1/5] TST: get_dummies UnicodeEncodeError tests --- pandas/tests/reshape/test_reshape.py | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/pandas/tests/reshape/test_reshape.py b/pandas/tests/reshape/test_reshape.py index 295801f3e8def..1dbe8f2ef7440 100644 --- a/pandas/tests/reshape/test_reshape.py +++ b/pandas/tests/reshape/test_reshape.py @@ -302,6 +302,26 @@ def test_dataframe_dummies_with_categorical(self, df, sparse, dtype): expected.sort_index(axis=1) assert_frame_equal(result, expected) + def test_dataframe_dummies_unicode(self): + df = pd.DataFrame(({u'ä': ['a']})) + result = get_dummies(df) + expected = pd.DataFrame({u'ä_a': [1]}, dtype=np.uint8) + assert_frame_equal(result, expected) + + df = pd.DataFrame({'x': [u'ä']}) + result = pd.get_dummies(df) + expected = pd.DataFrame({u'x_ä': [1]}, dtype=np.uint8) + assert_frame_equal(result, expected) + + df = pd.DataFrame({'x': ['a']}) + result = pd.get_dummies(df, prefix=u'ä') + expected = pd.DataFrame({u'ä_a': [1]}, dtype=np.uint8) + assert_frame_equal(result, expected) + + result = pd.get_dummies(df, prefix_sep=u'ä') + expected = pd.DataFrame({u'xäa': [1]}, dtype=np.uint8) + assert_frame_equal(result, expected) + def test_basic_drop_first(self, sparse): # GH12402 Add a new parameter `drop_first` to avoid collinearity # Basic case From a26b3c5cfa053829bc54546f09b46b3045672463 Mon Sep 17 00:00:00 2001 From: Scorpil Date: Mon, 30 Jul 2018 17:52:34 +0200 Subject: [PATCH 2/5] BUG: fix Unicode error in get_dummies for Python 2 --- pandas/core/reshape/reshape.py | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/pandas/core/reshape/reshape.py b/pandas/core/reshape/reshape.py index f9ab813855f47..6765c1bea4052 100644 --- a/pandas/core/reshape/reshape.py +++ b/pandas/core/reshape/reshape.py @@ -1,6 +1,6 @@ # pylint: 
disable=E1101,E1103 # pylint: disable=W0703,W0622,W0613,W0201 -from pandas.compat import range, text_type, zip +from pandas.compat import range, text_type, zip, u from pandas import compat from functools import partial import itertools @@ -923,11 +923,17 @@ def get_empty_Frame(data, sparse): number_of_cols = len(levels) + py2_prefix_sep_is_unicode = isinstance(prefix_sep, text_type) if prefix is not None: - dummy_strs = [u'{prefix}{sep}{level}' if isinstance(v, text_type) - else '{prefix}{sep}{level}' for v in levels] - dummy_cols = [dummy_str.format(prefix=prefix, sep=prefix_sep, level=v) - for dummy_str, v in zip(dummy_strs, levels)] + py2_prefix_is_unicode = isinstance(prefix, text_type) + dummy_cols = [] + for level in levels: + fstr = '{prefix}{sep}{level}' + if py2_prefix_sep_is_unicode or py2_prefix_is_unicode or \ + isinstance(level, text_type): + fstr = u(fstr) + dummy_cols.append(fstr.format( + prefix=prefix, sep=prefix_sep, level=level)) else: dummy_cols = levels From 20589d97391dd49c8ac3dd28a57924ae262c4744 Mon Sep 17 00:00:00 2001 From: Scorpil Date: Mon, 30 Jul 2018 19:51:13 +0200 Subject: [PATCH 3/5] DOC: whatsnew entry --- doc/source/whatsnew/v0.24.0.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/doc/source/whatsnew/v0.24.0.txt b/doc/source/whatsnew/v0.24.0.txt index 8a92db4c66fb5..b7692f51c4ddf 100644 --- a/doc/source/whatsnew/v0.24.0.txt +++ b/doc/source/whatsnew/v0.24.0.txt @@ -660,6 +660,7 @@ Reshaping - Bug in :meth:`Series.mask` and :meth:`DataFrame.mask` with ``list`` conditionals (:issue:`21891`) - Bug in :meth:`DataFrame.replace` raises RecursionError when converting OutOfBounds ``datetime64[ns, tz]`` (:issue:`20380`) - :func:`pandas.core.groupby.GroupBy.rank` now raises a ``ValueError`` when an invalid value is passed for argument ``na_option`` (:issue:`22124`) +- Bug in :func:`get_dummies` when column names, values, ``prefix`` or ``prefix_sep`` contain non-ASCII (Unicode) characters in Python 2 (:issue:`22084`) - Build Changes From 15f2946fe4be5a4615411a059fb1c81e7ef0af52 Mon Sep 17 00:00:00 2001
From: Scorpil Date: Thu, 2 Aug 2018 14:37:49 +0200 Subject: [PATCH 4/5] CLN: parametrize test, codestyle update --- pandas/core/reshape/reshape.py | 28 ++++++++++++----------- pandas/tests/reshape/test_reshape.py | 34 +++++++++++++--------------- 2 files changed, 31 insertions(+), 31 deletions(-) diff --git a/pandas/core/reshape/reshape.py b/pandas/core/reshape/reshape.py index 6765c1bea4052..fa888c5967e7d 100644 --- a/pandas/core/reshape/reshape.py +++ b/pandas/core/reshape/reshape.py @@ -1,6 +1,6 @@ # pylint: disable=E1101,E1103 # pylint: disable=W0703,W0622,W0613,W0201 -from pandas.compat import range, text_type, zip, u +from pandas.compat import range, text_type, zip, u, PY2 from pandas import compat from functools import partial import itertools @@ -923,19 +923,21 @@ def get_empty_Frame(data, sparse): number_of_cols = len(levels) - py2_prefix_sep_is_unicode = isinstance(prefix_sep, text_type) - if prefix is not None: - py2_prefix_is_unicode = isinstance(prefix, text_type) - dummy_cols = [] - for level in levels: - fstr = '{prefix}{sep}{level}' - if py2_prefix_sep_is_unicode or py2_prefix_is_unicode or \ - isinstance(level, text_type): - fstr = u(fstr) - dummy_cols.append(fstr.format( - prefix=prefix, sep=prefix_sep, level=level)) - else: + if prefix is None: dummy_cols = levels + else: + def _make_col_name(prefix, prefix_sep, level): + fstr = '{prefix}{prefix_sep}{level}' + if PY2 and (isinstance(prefix, text_type) or + isinstance(prefix_sep, text_type) or + isinstance(level, text_type)): + fstr = u(fstr) + return fstr.format(prefix=prefix, + prefix_sep=prefix_sep, + level=level) + + dummy_cols = [_make_col_name(prefix, prefix_sep, level) + for level in levels] if isinstance(data, Series): index = data.index diff --git a/pandas/tests/reshape/test_reshape.py b/pandas/tests/reshape/test_reshape.py index 1dbe8f2ef7440..3f4ccd7693a8f 100644 --- a/pandas/tests/reshape/test_reshape.py +++ b/pandas/tests/reshape/test_reshape.py @@ -302,24 +302,22 @@ def 
test_dataframe_dummies_with_categorical(self, df, sparse, dtype): expected.sort_index(axis=1) assert_frame_equal(result, expected) - def test_dataframe_dummies_unicode(self): - df = pd.DataFrame(({u'ä': ['a']})) - result = get_dummies(df) - expected = pd.DataFrame({u'ä_a': [1]}, dtype=np.uint8) - assert_frame_equal(result, expected) - - df = pd.DataFrame({'x': [u'ä']}) - result = pd.get_dummies(df) - expected = pd.DataFrame({u'x_ä': [1]}, dtype=np.uint8) - assert_frame_equal(result, expected) - - df = pd.DataFrame({'x': ['a']}) - result = pd.get_dummies(df, prefix=u'ä') - expected = pd.DataFrame({u'ä_a': [1]}, dtype=np.uint8) - assert_frame_equal(result, expected) - - result = pd.get_dummies(df, prefix_sep=u'ä') - expected = pd.DataFrame({u'xäa': [1]}, dtype=np.uint8) + @pytest.mark.parametrize('get_dummies_kwargs,expected', [ + ({'data': pd.DataFrame(({u'ä': ['a']}))}, + pd.DataFrame({u'ä_a': [1]}, dtype=np.uint8)), + + ({'data': pd.DataFrame({'x': [u'ä']})}, + pd.DataFrame({u'x_ä': [1]}, dtype=np.uint8)), + + ({'data': pd.DataFrame({'x': [u'a']}), 'prefix':u'ä'}, + pd.DataFrame({u'ä_a': [1]}, dtype=np.uint8)), + + ({'data': pd.DataFrame({'x': [u'a']}), 'prefix_sep':u'ä'}, + pd.DataFrame({u'xäa': [1]}, dtype=np.uint8))]) + def test_dataframe_dummies_unicode(self, get_dummies_kwargs, expected): + # GH22084 pd.get_dummies incorrectly encodes unicode characters + # in dataframe column names + result = get_dummies(**get_dummies_kwargs) assert_frame_equal(result, expected) def test_basic_drop_first(self, sparse): From 662cac346af2559b76f94f52d7fd96db48c14716 Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Thu, 2 Aug 2018 13:20:26 -0400 Subject: [PATCH 5/5] comment --- pandas/core/reshape/reshape.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/pandas/core/reshape/reshape.py b/pandas/core/reshape/reshape.py index fa888c5967e7d..bd5ce4897e9da 100644 --- a/pandas/core/reshape/reshape.py +++ b/pandas/core/reshape/reshape.py @@ -926,6 +926,8 @@ def 
get_empty_Frame(data, sparse): if prefix is None: dummy_cols = levels else: + + # PY2: use a unicode format string if any part is unicode, gh-22084 def _make_col_name(prefix, prefix_sep, level): fstr = '{prefix}{prefix_sep}{level}' if PY2 and (isinstance(prefix, text_type) or