Skip to content

Commit

Permalink
BUG: Fix get dummies unicode error (pandas-dev#22131)
Browse files Browse the repository at this point in the history
  • Loading branch information
Scorpil authored and victor committed Sep 30, 2018
1 parent fc9de76 commit 3caed45
Show file tree
Hide file tree
Showing 3 changed files with 36 additions and 7 deletions.
1 change: 1 addition & 0 deletions doc/source/whatsnew/v0.24.0.txt
Original file line number Diff line number Diff line change
Expand Up @@ -660,6 +660,7 @@ Reshaping
- Bug in :meth:`Series.mask` and :meth:`DataFrame.mask` with ``list`` conditionals (:issue:`21891`)
- Bug in :meth:`DataFrame.replace` raises RecursionError when converting OutOfBounds ``datetime64[ns, tz]`` (:issue:`20380`)
- :func:`pandas.core.groupby.GroupBy.rank` now raises a ``ValueError`` when an invalid value is passed for argument ``na_option`` (:issue:`22124`)
- Bug in :func:`get_dummies` with Unicode attributes in Python 2 (:issue:`22084`)
-

Build Changes
Expand Down
24 changes: 17 additions & 7 deletions pandas/core/reshape/reshape.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
# pylint: disable=E1101,E1103
# pylint: disable=W0703,W0622,W0613,W0201
from pandas.compat import range, text_type, zip
from pandas.compat import range, text_type, zip, u, PY2
from pandas import compat
from functools import partial
import itertools
Expand Down Expand Up @@ -923,13 +923,23 @@ def get_empty_Frame(data, sparse):

number_of_cols = len(levels)

if prefix is not None:
dummy_strs = [u'{prefix}{sep}{level}' if isinstance(v, text_type)
else '{prefix}{sep}{level}' for v in levels]
dummy_cols = [dummy_str.format(prefix=prefix, sep=prefix_sep, level=v)
for dummy_str, v in zip(dummy_strs, levels)]
else:
if prefix is None:
dummy_cols = levels
else:

# PY2 embedded unicode, gh-22084
def _make_col_name(prefix, prefix_sep, level):
fstr = '{prefix}{prefix_sep}{level}'
if PY2 and (isinstance(prefix, text_type) or
isinstance(prefix_sep, text_type) or
isinstance(level, text_type)):
fstr = u(fstr)
return fstr.format(prefix=prefix,
prefix_sep=prefix_sep,
level=level)

dummy_cols = [_make_col_name(prefix, prefix_sep, level)
for level in levels]

if isinstance(data, Series):
index = data.index
Expand Down
18 changes: 18 additions & 0 deletions pandas/tests/reshape/test_reshape.py
Original file line number Diff line number Diff line change
Expand Up @@ -302,6 +302,24 @@ def test_dataframe_dummies_with_categorical(self, df, sparse, dtype):
expected.sort_index(axis=1)
assert_frame_equal(result, expected)

@pytest.mark.parametrize('get_dummies_kwargs,expected', [
({'data': pd.DataFrame(({u'ä': ['a']}))},
pd.DataFrame({u'ä_a': [1]}, dtype=np.uint8)),
({'data': pd.DataFrame({'x': [u'ä']})},
pd.DataFrame({u'x_ä': [1]}, dtype=np.uint8)),
({'data': pd.DataFrame({'x': [u'a']}), 'prefix':u'ä'},
pd.DataFrame({u'ä_a': [1]}, dtype=np.uint8)),
({'data': pd.DataFrame({'x': [u'a']}), 'prefix_sep':u'ä'},
pd.DataFrame({u'xäa': [1]}, dtype=np.uint8))])
def test_dataframe_dummies_unicode(self, get_dummies_kwargs, expected):
# GH22084 pd.get_dummies incorrectly encodes unicode characters
# in dataframe column names
result = get_dummies(**get_dummies_kwargs)
assert_frame_equal(result, expected)

def test_basic_drop_first(self, sparse):
# GH12402 Add a new parameter `drop_first` to avoid collinearity
# Basic case
Expand Down

0 comments on commit 3caed45

Please sign in to comment.