Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix TypeError when merging categorical dates #16986

Closed
wants to merge 10 commits into from
12 changes: 10 additions & 2 deletions pandas/core/reshape/merge.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@
is_numeric_dtype,
is_integer,
is_int_or_datetime_dtype,
is_datetimelike,
is_dtype_equal,
is_bool,
is_list_like,
Expand Down Expand Up @@ -877,7 +878,7 @@ def _get_merge_keys(self):
return left_keys, right_keys, join_names

def _maybe_coerce_merge_keys(self):
# we have valid mergee's but we may have to further
# we have valid mergees but we may have to further
# coerce these if they are originally incompatible types
#
# for example if these are categorical, but are not dtype_equal
Expand All @@ -894,6 +895,13 @@ def _maybe_coerce_merge_keys(self):
if is_categorical_dtype(lk) and is_categorical_dtype(rk):
if lk.is_dtype_equal(rk):
continue

# if we are dates with differing categories
# then allow them to proceed because
# coercing to object below results in integers.
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This needs a better solution; the current approach is too special-cased.

if is_datetimelike(lk.categories) and is_datetimelike(rk.categories):
continue

elif is_categorical_dtype(lk) or is_categorical_dtype(rk):
pass

Expand All @@ -904,7 +912,7 @@ def _maybe_coerce_merge_keys(self):
# kinds to proceed, eg. int64 and int8
# further if we are object, but we infer to
# the same, then proceed
if (is_numeric_dtype(lk) and is_numeric_dtype(rk)):
if is_numeric_dtype(lk) and is_numeric_dtype(rk):
if lk.dtype.kind == rk.dtype.kind:
continue

Expand Down
23 changes: 22 additions & 1 deletion pandas/tests/reshape/test_merge.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
# pylint: disable=E1103

import pytest
from datetime import datetime
from datetime import datetime, date
from numpy.random import randn
from numpy import nan
import numpy as np
Expand Down Expand Up @@ -1515,6 +1515,27 @@ def test_self_join_multiple_categories(self):

assert_frame_equal(result, df)

def test_dtype_on_categorical_dates(self):
# GH 16900
# dates should not be coerced to ints

df = pd.DataFrame(
[[date(2001, 1, 1), 1.1],
[date(2001, 1, 2), 1.3]],
columns=['date', 'num2']
)
df['date'] = df['date'].astype('category')

df2 = pd.DataFrame(
[[date(2001, 1, 1), 1.3],
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This needs testing on an inner merge as well.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Done

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Use `pytest.mark.parametrize` instead of duplicating code here.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If you can, please do this.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This has been updated as per your previous comment:

> construct the expected result and use tm.assert_frame_equal for both examples

Did you want it changed to use parametrize instead?

[date(2001, 1, 3), 1.4]],
columns=['date', 'num4']
)
df2['date'] = df2['date'].astype('category')

result = pd.merge(df, df2, how='outer', on=['date'])
assert result['date'].dtype == 'category'


@pytest.fixture
def left_df():
Expand Down