diff --git a/pandas/core/reshape/reshape.py b/pandas/core/reshape/reshape.py index 6765c1bea40524..fa888c5967e7da 100644 --- a/pandas/core/reshape/reshape.py +++ b/pandas/core/reshape/reshape.py @@ -1,6 +1,6 @@ # pylint: disable=E1101,E1103 # pylint: disable=W0703,W0622,W0613,W0201 -from pandas.compat import range, text_type, zip, u +from pandas.compat import range, text_type, zip, u, PY2 from pandas import compat from functools import partial import itertools @@ -923,19 +923,21 @@ def get_empty_Frame(data, sparse): number_of_cols = len(levels) - py2_prefix_sep_is_unicode = isinstance(prefix_sep, text_type) - if prefix is not None: - py2_prefix_is_unicode = isinstance(prefix, text_type) - dummy_cols = [] - for level in levels: - fstr = '{prefix}{sep}{level}' - if py2_prefix_sep_is_unicode or py2_prefix_is_unicode or \ - isinstance(level, text_type): - fstr = u(fstr) - dummy_cols.append(fstr.format( - prefix=prefix, sep=prefix_sep, level=level)) - else: + if prefix is None: dummy_cols = levels + else: + def _make_col_name(prefix, prefix_sep, level): + fstr = '{prefix}{prefix_sep}{level}' + if PY2 and (isinstance(prefix, text_type) or + isinstance(prefix_sep, text_type) or + isinstance(level, text_type)): + fstr = u(fstr) + return fstr.format(prefix=prefix, + prefix_sep=prefix_sep, + level=level) + + dummy_cols = [_make_col_name(prefix, prefix_sep, level) + for level in levels] if isinstance(data, Series): index = data.index diff --git a/pandas/tests/reshape/test_reshape.py b/pandas/tests/reshape/test_reshape.py index 1dbe8f2ef7440a..3f4ccd7693a8f6 100644 --- a/pandas/tests/reshape/test_reshape.py +++ b/pandas/tests/reshape/test_reshape.py @@ -302,24 +302,22 @@ def test_dataframe_dummies_with_categorical(self, df, sparse, dtype): expected.sort_index(axis=1) assert_frame_equal(result, expected) - def test_dataframe_dummies_unicode(self): - df = pd.DataFrame(({u'ä': ['a']})) - result = get_dummies(df) - expected = pd.DataFrame({u'ä_a': [1]}, dtype=np.uint8) - assert_frame_equal(result, expected) - - df = pd.DataFrame({'x': [u'ä']}) - result = pd.get_dummies(df) - expected = pd.DataFrame({u'x_ä': [1]}, dtype=np.uint8) - assert_frame_equal(result, expected) - - df = pd.DataFrame({'x': ['a']}) - result = pd.get_dummies(df, prefix=u'ä') - expected = pd.DataFrame({u'ä_a': [1]}, dtype=np.uint8) - assert_frame_equal(result, expected) - - result = pd.get_dummies(df, prefix_sep=u'ä') - expected = pd.DataFrame({u'xäa': [1]}, dtype=np.uint8) + @pytest.mark.parametrize('get_dummies_kwargs,expected', [ + ({'data': pd.DataFrame(({u'ä': ['a']}))}, + pd.DataFrame({u'ä_a': [1]}, dtype=np.uint8)), + + ({'data': pd.DataFrame({'x': [u'ä']})}, + pd.DataFrame({u'x_ä': [1]}, dtype=np.uint8)), + + ({'data': pd.DataFrame({'x': [u'a']}), 'prefix':u'ä'}, + pd.DataFrame({u'ä_a': [1]}, dtype=np.uint8)), + + ({'data': pd.DataFrame({'x': [u'a']}), 'prefix_sep':u'ä'}, + pd.DataFrame({u'xäa': [1]}, dtype=np.uint8))]) + def test_dataframe_dummies_unicode(self, get_dummies_kwargs, expected): + # GH22084 pd.get_dummies incorrectly encodes unicode characters + # in dataframe column names + result = get_dummies(**get_dummies_kwargs) assert_frame_equal(result, expected) def test_basic_drop_first(self, sparse):