Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

BUG: Fix get dummies unicode error #22131

Merged
merged 5 commits into from
Aug 2, 2018
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions doc/source/whatsnew/v0.24.0.txt
Original file line number Diff line number Diff line change
Expand Up @@ -660,6 +660,7 @@ Reshaping
- Bug in :meth:`Series.mask` and :meth:`DataFrame.mask` with ``list`` conditionals (:issue:`21891`)
- Bug in :meth:`DataFrame.replace` raises RecursionError when converting OutOfBounds ``datetime64[ns, tz]`` (:issue:`20380`)
- :func:`pandas.core.groupby.GroupBy.rank` now raises a ``ValueError`` when an invalid value is passed for argument ``na_option`` (:issue:`22124`)
- Bug in :func:`get_dummies` with Unicode attributes in Python 2 (:issue:`22084`)
-

Build Changes
Expand Down
24 changes: 17 additions & 7 deletions pandas/core/reshape/reshape.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
# pylint: disable=E1101,E1103
# pylint: disable=W0703,W0622,W0613,W0201
from pandas.compat import range, text_type, zip
from pandas.compat import range, text_type, zip, u, PY2
from pandas import compat
from functools import partial
import itertools
Expand Down Expand Up @@ -923,13 +923,23 @@ def get_empty_Frame(data, sparse):

number_of_cols = len(levels)

if prefix is not None:
dummy_strs = [u'{prefix}{sep}{level}' if isinstance(v, text_type)
else '{prefix}{sep}{level}' for v in levels]
dummy_cols = [dummy_str.format(prefix=prefix, sep=prefix_sep, level=v)
for dummy_str, v in zip(dummy_strs, levels)]
else:
if prefix is None:
dummy_cols = levels
else:

# PY2 embedded unicode, gh-22084
def _make_col_name(prefix, prefix_sep, level):
fstr = '{prefix}{prefix_sep}{level}'
if PY2 and (isinstance(prefix, text_type) or
isinstance(prefix_sep, text_type) or
isinstance(level, text_type)):
fstr = u(fstr)
return fstr.format(prefix=prefix,
prefix_sep=prefix_sep,
level=level)

dummy_cols = [_make_col_name(prefix, prefix_sep, level)
for level in levels]

if isinstance(data, Series):
index = data.index
Expand Down
18 changes: 18 additions & 0 deletions pandas/tests/reshape/test_reshape.py
Original file line number Diff line number Diff line change
Expand Up @@ -302,6 +302,24 @@ def test_dataframe_dummies_with_categorical(self, df, sparse, dtype):
expected.sort_index(axis=1)
assert_frame_equal(result, expected)

@pytest.mark.parametrize('get_dummies_kwargs,expected', [
({'data': pd.DataFrame(({u'ä': ['a']}))},
pd.DataFrame({u'ä_a': [1]}, dtype=np.uint8)),

({'data': pd.DataFrame({'x': [u'ä']})},
pd.DataFrame({u'x_ä': [1]}, dtype=np.uint8)),

({'data': pd.DataFrame({'x': [u'a']}), 'prefix':u'ä'},
pd.DataFrame({u'ä_a': [1]}, dtype=np.uint8)),

({'data': pd.DataFrame({'x': [u'a']}), 'prefix_sep':u'ä'},
pd.DataFrame({u'xäa': [1]}, dtype=np.uint8))])
def test_dataframe_dummies_unicode(self, get_dummies_kwargs, expected):
# GH22084 pd.get_dummies incorrectly encodes unicode characters
# in dataframe column names
result = get_dummies(**get_dummies_kwargs)
assert_frame_equal(result, expected)

def test_basic_drop_first(self, sparse):
# GH12402 Add a new parameter `drop_first` to avoid collinearity
# Basic case
Expand Down