diff --git a/doc/source/whatsnew/v0.21.0.txt b/doc/source/whatsnew/v0.21.0.txt index 5146bd35dff30..3cffab477e430 100644 --- a/doc/source/whatsnew/v0.21.0.txt +++ b/doc/source/whatsnew/v0.21.0.txt @@ -225,6 +225,7 @@ Sparse Reshaping ^^^^^^^^^ - Joining/Merging with a non unique ``PeriodIndex`` raised a TypeError (:issue:`16871`) +- Merging with categorical date columns raised a TypeError (:issue:`16900`) - Bug when using :func:`isin` on a large object series and large comparison array (:issue:`16012`) - Fixes regression from 0.20, :func:`Series.aggregate` and :func:`DataFrame.aggregate` allow dictionaries as return values again (:issue:`16741`) diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py index beebe06e7477e..a0aa328ba90d3 100644 --- a/pandas/core/reshape/merge.py +++ b/pandas/core/reshape/merge.py @@ -877,7 +877,7 @@ def _get_merge_keys(self): return left_keys, right_keys, join_names def _maybe_coerce_merge_keys(self): - # we have valid mergee's but we may have to further + # we have valid mergees but we may have to further # coerce these if they are originally incompatible types # # for example if these are categorical, but are not dtype_equal @@ -889,12 +889,16 @@ def _maybe_coerce_merge_keys(self): if (len(lk) and not len(rk)) or (not len(lk) and len(rk)): continue + lk_is_cat = is_categorical_dtype(lk) + rk_is_cat = is_categorical_dtype(rk) + # if either left or right is a categorical # then the must match exactly in categories & ordered - if is_categorical_dtype(lk) and is_categorical_dtype(rk): + if lk_is_cat and rk_is_cat: if lk.is_dtype_equal(rk): continue - elif is_categorical_dtype(lk) or is_categorical_dtype(rk): + + elif lk_is_cat or rk_is_cat: pass elif is_dtype_equal(lk.dtype, rk.dtype): @@ -904,7 +908,7 @@ def _maybe_coerce_merge_keys(self): # kinds to proceed, eg. 
int64 and int8 # further if we are object, but we infer to # the same, then proceed - if (is_numeric_dtype(lk) and is_numeric_dtype(rk)): + if is_numeric_dtype(lk) and is_numeric_dtype(rk): if lk.dtype.kind == rk.dtype.kind: continue @@ -913,13 +917,20 @@ def _maybe_coerce_merge_keys(self): continue # Houston, we have a problem! - # let's coerce to object + # let's coerce to object if the dtypes aren't + # categorical, otherwise coerce to the dtype of + # the categories. If we coerced categories to + # object, then we would lose type information on + # some columns, and end up trying to merge + # incompatible dtypes. See GH 16900. if name in self.left.columns: + typ = lk.categories.dtype if lk_is_cat else object self.left = self.left.assign( - **{name: self.left[name].astype(object)}) + **{name: self.left[name].astype(typ)}) if name in self.right.columns: + typ = rk.categories.dtype if rk_is_cat else object self.right = self.right.assign( - **{name: self.right[name].astype(object)}) + **{name: self.right[name].astype(typ)}) def _validate_specification(self): # Hm, any way to make this logic less complicated?? 
diff --git a/pandas/tests/reshape/test_merge.py b/pandas/tests/reshape/test_merge.py index 919675188576e..765e8e28b43fd 100644 --- a/pandas/tests/reshape/test_merge.py +++ b/pandas/tests/reshape/test_merge.py @@ -1,7 +1,7 @@ # pylint: disable=E1103 import pytest -from datetime import datetime +from datetime import datetime, date from numpy.random import randn from numpy import nan import numpy as np @@ -1515,6 +1515,40 @@ def test_self_join_multiple_categories(self): assert_frame_equal(result, df) + def test_dtype_on_categorical_dates(self): + # GH 16900 + # dates should not be coerced to ints + + df = pd.DataFrame( + [[date(2001, 1, 1), 1.1], + [date(2001, 1, 2), 1.3]], + columns=['date', 'num2'] + ) + df['date'] = df['date'].astype('category') + + df2 = pd.DataFrame( + [[date(2001, 1, 1), 1.3], + [date(2001, 1, 3), 1.4]], + columns=['date', 'num4'] + ) + df2['date'] = df2['date'].astype('category') + + expected_outer = pd.DataFrame([ + [pd.Timestamp('2001-01-01'), 1.1, 1.3], + [pd.Timestamp('2001-01-02'), 1.3, np.nan], + [pd.Timestamp('2001-01-03'), np.nan, 1.4]], + columns=['date', 'num2', 'num4'] + ) + result_outer = pd.merge(df, df2, how='outer', on=['date']) + assert_frame_equal(result_outer, expected_outer) + + expected_inner = pd.DataFrame( + [[pd.Timestamp('2001-01-01'), 1.1, 1.3]], + columns=['date', 'num2', 'num4'] + ) + result_inner = pd.merge(df, df2, how='inner', on=['date']) + assert_frame_equal(result_inner, expected_inner) + @pytest.fixture def left_df():