diff --git a/doc/source/whatsnew/v0.24.0.txt b/doc/source/whatsnew/v0.24.0.txt
index 8a92db4c66fb5..b7692f51c4ddf 100644
--- a/doc/source/whatsnew/v0.24.0.txt
+++ b/doc/source/whatsnew/v0.24.0.txt
@@ -660,6 +660,7 @@ Reshaping
 - Bug in :meth:`Series.mask` and :meth:`DataFrame.mask` with ``list`` conditionals (:issue:`21891`)
 - Bug in :meth:`DataFrame.replace` raises RecursionError when converting OutOfBounds ``datetime64[ns, tz]`` (:issue:`20380`)
 - :func:`pandas.core.groupby.GroupBy.rank` now raises a ``ValueError`` when an invalid value is passed for argument ``na_option`` (:issue:`22124`)
+- Bug in :func:`get_dummies` with Unicode attributes in Python 2 (:issue:`22084`)
 -
 
 Build Changes
diff --git a/pandas/core/reshape/reshape.py b/pandas/core/reshape/reshape.py
index f9ab813855f47..bd5ce4897e9da 100644
--- a/pandas/core/reshape/reshape.py
+++ b/pandas/core/reshape/reshape.py
@@ -1,6 +1,6 @@
 # pylint: disable=E1101,E1103
 # pylint: disable=W0703,W0622,W0613,W0201
-from pandas.compat import range, text_type, zip
+from pandas.compat import range, text_type, zip, u, PY2
 from pandas import compat
 from functools import partial
 import itertools
@@ -923,13 +923,23 @@ def get_empty_Frame(data, sparse):
 
     number_of_cols = len(levels)
 
-    if prefix is not None:
-        dummy_strs = [u'{prefix}{sep}{level}' if isinstance(v, text_type)
-                      else '{prefix}{sep}{level}' for v in levels]
-        dummy_cols = [dummy_str.format(prefix=prefix, sep=prefix_sep, level=v)
-                      for dummy_str, v in zip(dummy_strs, levels)]
-    else:
+    if prefix is None:
         dummy_cols = levels
+    else:
+
+        # PY2 embedded unicode, gh-22084
+        def _make_col_name(prefix, prefix_sep, level):
+            fstr = '{prefix}{prefix_sep}{level}'
+            if PY2 and (isinstance(prefix, text_type) or
+                        isinstance(prefix_sep, text_type) or
+                        isinstance(level, text_type)):
+                fstr = u(fstr)
+            return fstr.format(prefix=prefix,
+                               prefix_sep=prefix_sep,
+                               level=level)
+
+        dummy_cols = [_make_col_name(prefix, prefix_sep, level)
+                      for level in levels]
 
     if isinstance(data, Series):
         index = data.index
diff --git a/pandas/tests/reshape/test_reshape.py b/pandas/tests/reshape/test_reshape.py
index 295801f3e8def..3f4ccd7693a8f 100644
--- a/pandas/tests/reshape/test_reshape.py
+++ b/pandas/tests/reshape/test_reshape.py
@@ -302,6 +302,24 @@ def test_dataframe_dummies_with_categorical(self, df, sparse, dtype):
         expected.sort_index(axis=1)
         assert_frame_equal(result, expected)
 
+    @pytest.mark.parametrize('get_dummies_kwargs,expected', [
+        ({'data': pd.DataFrame(({u'ä': ['a']}))},
+         pd.DataFrame({u'ä_a': [1]}, dtype=np.uint8)),
+
+        ({'data': pd.DataFrame({'x': [u'ä']})},
+         pd.DataFrame({u'x_ä': [1]}, dtype=np.uint8)),
+
+        ({'data': pd.DataFrame({'x': [u'a']}), 'prefix':u'ä'},
+         pd.DataFrame({u'ä_a': [1]}, dtype=np.uint8)),
+
+        ({'data': pd.DataFrame({'x': [u'a']}), 'prefix_sep':u'ä'},
+         pd.DataFrame({u'xäa': [1]}, dtype=np.uint8))])
+    def test_dataframe_dummies_unicode(self, get_dummies_kwargs, expected):
+        # GH22084 pd.get_dummies incorrectly encodes unicode characters
+        # in dataframe column names
+        result = get_dummies(**get_dummies_kwargs)
+        assert_frame_equal(result, expected)
+
     def test_basic_drop_first(self, sparse):
         # GH12402 Add a new parameter `drop_first` to avoid collinearity
         # Basic case