Skip to content

Commit

Permalink
Bug in pd.merge() when merge/join with multiple categorical columns
Browse files Browse the repository at this point in the history
  • Loading branch information
jreback committed Jun 28, 2017
1 parent 65a0e64 commit f8e4705
Show file tree
Hide file tree
Showing 3 changed files with 29 additions and 4 deletions.
1 change: 1 addition & 0 deletions doc/source/whatsnew/v0.20.3.txt
Original file line number Diff line number Diff line change
Expand Up @@ -78,6 +78,7 @@ Sparse
Reshaping
^^^^^^^^^

- Bug in ``pd.merge()`` when merging/joining on multiple categorical columns (:issue:`16767`)


Numeric
Expand Down
9 changes: 5 additions & 4 deletions pandas/core/reshape/merge.py
Original file line number Diff line number Diff line change
Expand Up @@ -1440,13 +1440,14 @@ def _factorize_keys(lk, rk, sort=True):
lk = lk.values
rk = rk.values

# if we exactly match in categories, allow us to use codes
# if we exactly match in categories, allow us to factorize on codes
if (is_categorical_dtype(lk) and
is_categorical_dtype(rk) and
lk.is_dtype_equal(rk)):
return lk.codes, rk.codes, len(lk.categories)

if is_int_or_datetime_dtype(lk) and is_int_or_datetime_dtype(rk):
klass = libhashtable.Int64Factorizer
lk = _ensure_int64(lk.codes)
rk = _ensure_int64(rk.codes)
elif is_int_or_datetime_dtype(lk) and is_int_or_datetime_dtype(rk):
klass = libhashtable.Int64Factorizer
lk = _ensure_int64(com._values_from_object(lk))
rk = _ensure_int64(com._values_from_object(rk))
Expand Down
23 changes: 23 additions & 0 deletions pandas/tests/reshape/test_merge.py
Original file line number Diff line number Diff line change
Expand Up @@ -1480,6 +1480,29 @@ def test_dtype_on_merged_different(self, change, how, left, right):
index=['X', 'Y', 'Z'])
assert_series_equal(result, expected)

def test_self_join_multiple_categories(self):
    # GH 16767
    # Merge on several categorical key columns at once.  Every row of the
    # frame built below is a distinct key combination, so an inner
    # self-merge on all columns has to give back the frame unchanged.
    repeats = 5
    singles = ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j']
    doubles = ['aa', 'bb', 'cc', 'dd', 'ee',
               'ff', 'gg', 'hh', 'ii', 'jj']

    # 'c' holds each label in a run of 2 * repeats consecutive rows
    col_c = []
    for label in ['m', 'n', 'u', 'p', 'o']:
        col_c.extend([label] * 2 * repeats)

    # 'd' holds each label in a run of repeats consecutive rows
    col_d = []
    for label in doubles:
        col_d.extend([label] * repeats)

    frame = pd.DataFrame({
        'a': singles * repeats,
        'b': ['t', 'w', 'x', 'y', 'z'] * 2 * repeats,
        'c': col_c,
        'd': col_d})

    # convert every column to the categorical dtype
    frame = frame.apply(lambda col: col.astype('category'))

    # join the frame with itself using all of its columns as keys
    keys = list(frame.columns)
    merged = pd.merge(frame, frame, on=keys)

    assert_frame_equal(merged, frame)


@pytest.fixture
def left_df():
Expand Down

0 comments on commit f8e4705

Please sign in to comment.