From 8843c10485fcc402ba3016b9825ee0c76284c67f Mon Sep 17 00:00:00 2001 From: Dave Willmer Date: Sun, 16 Jul 2017 21:55:34 +0100 Subject: [PATCH 1/8] Fix TypeError when merging categorical dates --- pandas/core/reshape/merge.py | 12 ++++++++++-- pandas/tests/reshape/test_merge.py | 23 ++++++++++++++++++++++- 2 files changed, 32 insertions(+), 3 deletions(-) diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py index beebe06e7477e..97adf2aa39710 100644 --- a/pandas/core/reshape/merge.py +++ b/pandas/core/reshape/merge.py @@ -24,6 +24,7 @@ is_numeric_dtype, is_integer, is_int_or_datetime_dtype, + is_datetimelike, is_dtype_equal, is_bool, is_list_like, @@ -877,7 +878,7 @@ def _get_merge_keys(self): return left_keys, right_keys, join_names def _maybe_coerce_merge_keys(self): - # we have valid mergee's but we may have to further + # we have valid mergees but we may have to further # coerce these if they are originally incompatible types # # for example if these are categorical, but are not dtype_equal @@ -894,6 +895,13 @@ def _maybe_coerce_merge_keys(self): if is_categorical_dtype(lk) and is_categorical_dtype(rk): if lk.is_dtype_equal(rk): continue + + # if we are dates with differing categories + # then allow them to proceed because + # coercing to object below results in integers. + if is_datetimelike(lk.categories) and is_datetimelike(rk.categories): + continue + elif is_categorical_dtype(lk) or is_categorical_dtype(rk): pass @@ -904,7 +912,7 @@ def _maybe_coerce_merge_keys(self): # kinds to proceed, eg. int64 and int8 # further if we are object, but we infer to # the same, then proceed - if (is_numeric_dtype(lk) and is_numeric_dtype(rk)): + if is_numeric_dtype(lk) and is_numeric_dtype(rk): if lk.dtype.kind == rk.dtype.kind: continue diff --git a/pandas/tests/reshape/test_merge.py b/pandas/tests/reshape/test_merge.py index 919675188576e..d21a8419b384c 100644 --- a/pandas/tests/reshape/test_merge.py +++ b/pandas/tests/reshape/test_merge.py @@ -1,7 +1,7 @@ # pylint: disable=E1103 import pytest -from datetime import datetime +from datetime import datetime, date from numpy.random import randn from numpy import nan import numpy as np @@ -1515,6 +1515,27 @@ def test_self_join_multiple_categories(self): assert_frame_equal(result, df) + def test_dtype_on_categorical_dates(self): + # GH 16900 + # dates should not be coerced to ints + + df = pd.DataFrame( + [[date(2001, 1, 1), 1.1], + [date(2001, 1, 2), 1.3]], + columns=['date', 'num2'] + ) + df['date'] = df['date'].astype('category') + + df2 = pd.DataFrame( + [[date(2001, 1, 1), 1.3], + [date(2001, 1, 3), 1.4]], + columns=['date', 'num4'] + ) + df2['date'] = df2['date'].astype('category') + + result = pd.merge(df, df2, how='outer', on=['date']) + assert result['date'].dtype == 'category' + @pytest.fixture def left_df(): From 48e71632e598e0e4e3763fbe53339b0aeee84159 Mon Sep 17 00:00:00 2001 From: Dave Willmer Date: Sun, 16 Jul 2017 22:20:18 +0100 Subject: [PATCH 2/8] Test inner join --- pandas/tests/reshape/test_merge.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/pandas/tests/reshape/test_merge.py b/pandas/tests/reshape/test_merge.py index d21a8419b384c..93177c7e29ffb 100644 --- a/pandas/tests/reshape/test_merge.py +++ b/pandas/tests/reshape/test_merge.py @@ -1536,6 +1536,9 @@ def test_dtype_on_categorical_dates(self): result = pd.merge(df, df2, how='outer', on=['date']) assert result['date'].dtype == 'category' + result_inner = pd.merge(df, df2, how='inner', on=['date']) + assert result_inner['date'].dtype == 'category' + @pytest.fixture def left_df(): From 218da661a485b3c7c92030e5edaded2e876691e8 Mon Sep 17 00:00:00 2001 From: Dave Willmer Date: Mon, 17 Jul 2017 20:53:19 +0100 Subject: [PATCH 3/8] Generic solution to categorical problem --- pandas/core/reshape/merge.py | 12 ++++-------- pandas/tests/reshape/test_merge.py | 9 ++++++--- 2 files changed, 10 insertions(+), 11 deletions(-) diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py index 97adf2aa39710..1b6a948dee61f 100644 --- a/pandas/core/reshape/merge.py +++ b/pandas/core/reshape/merge.py @@ -896,12 +896,6 @@ def _maybe_coerce_merge_keys(self): if lk.is_dtype_equal(rk): continue - # if we are dates with differing categories - # then allow them to proceed because - # coercing to object below results in integers. - if is_datetimelike(lk.categories) and is_datetimelike(rk.categories): - continue - elif is_categorical_dtype(lk) or is_categorical_dtype(rk): pass @@ -923,11 +917,13 @@ def _maybe_coerce_merge_keys(self): # Houston, we have a problem! # let's coerce to object if name in self.left.columns: + typ = lk.categories.dtype if is_categorical_dtype(lk) else object self.left = self.left.assign( - **{name: self.left[name].astype(object)}) + **{name: self.left[name].astype(typ)}) if name in self.right.columns: + typ = rk.categories.dtype if is_categorical_dtype(rk) else object self.right = self.right.assign( - **{name: self.right[name].astype(object)}) + **{name: self.right[name].astype(typ)}) def _validate_specification(self): # Hm, any way to make this logic less complicated?? diff --git a/pandas/tests/reshape/test_merge.py b/pandas/tests/reshape/test_merge.py index 93177c7e29ffb..396ee6ec2375f 100644 --- a/pandas/tests/reshape/test_merge.py +++ b/pandas/tests/reshape/test_merge.py @@ -1515,7 +1515,7 @@ def test_self_join_multiple_categories(self): assert_frame_equal(result, df) - def test_dtype_on_categorical_dates(self): + def test_categorical_dates(self): # GH 16900 # dates should not be coerced to ints @@ -1534,10 +1534,13 @@ def test_dtype_on_categorical_dates(self): df2['date'] = df2['date'].astype('category') result = pd.merge(df, df2, how='outer', on=['date']) - assert result['date'].dtype == 'category' + assert result.shape == (3, 3) + assert result['date'].iloc[0] == pd.Timestamp('2001-01-01') + assert result['date'].iloc[-1] == pd.Timestamp('2001-01-03') result_inner = pd.merge(df, df2, how='inner', on=['date']) - assert result_inner['date'].dtype == 'category' + assert result_inner.shape == (1, 3) + assert result_inner['date'].iloc[-1] == pd.Timestamp('2001-01-01') @pytest.fixture From a81933d0b6b56e0fb0314ec649a466a1feaf582e Mon Sep 17 00:00:00 2001 From: Dave Willmer Date: Mon, 17 Jul 2017 21:04:43 +0100 Subject: [PATCH 4/8] Remove unused import --- pandas/core/reshape/merge.py | 1 - 1 file changed, 1 deletion(-) diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py index 1b6a948dee61f..4d83d1d78ee35 100644 --- a/pandas/core/reshape/merge.py +++ b/pandas/core/reshape/merge.py @@ -24,7 +24,6 @@ is_numeric_dtype, is_integer, is_int_or_datetime_dtype, - is_datetimelike, is_dtype_equal, is_bool, is_list_like, From b82d11740ea762861392751da85cc9a1b02d76ba Mon Sep 17 00:00:00 2001 From: Dave Willmer Date: Mon, 17 Jul 2017 21:36:20 +0100 Subject: [PATCH 5/8] Lint fixes --- pandas/core/reshape/merge.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py index 4d83d1d78ee35..7ca4a1165c782 100644 --- a/pandas/core/reshape/merge.py +++ b/pandas/core/reshape/merge.py @@ -916,11 +916,13 @@ def _maybe_coerce_merge_keys(self): # Houston, we have a problem! # let's coerce to object if name in self.left.columns: - typ = lk.categories.dtype if is_categorical_dtype(lk) else object + cat = is_categorical_dtype(lk) + typ = lk.categories.dtype if cat else object self.left = self.left.assign( **{name: self.left[name].astype(typ)}) if name in self.right.columns: - typ = rk.categories.dtype if is_categorical_dtype(rk) else object + cat = is_categorical_dtype(rk) + typ = rk.categories.dtype if cat else object self.right = self.right.assign( **{name: self.right[name].astype(typ)}) From 5e8e23b43dcfe6d2f21f8a7f9f64b48c4b337a4d Mon Sep 17 00:00:00 2001 From: Dave Willmer Date: Wed, 19 Jul 2017 23:25:12 +0100 Subject: [PATCH 6/8] Add whatsnew item --- doc/source/whatsnew/v0.21.0.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/doc/source/whatsnew/v0.21.0.txt b/doc/source/whatsnew/v0.21.0.txt index 7c52cf6f450b2..7f1007d25e9ce 100644 --- a/doc/source/whatsnew/v0.21.0.txt +++ b/doc/source/whatsnew/v0.21.0.txt @@ -186,6 +186,7 @@ Sparse Reshaping ^^^^^^^^^ - Joining/Merging with a non unique ``PeriodIndex`` raised a TypeError (:issue:`16871`) +- Merging with categorical date columns raised a TypeError (:issue:`16900`) Numeric From 04d540401802f8092d72458b792a249760da848b Mon Sep 17 00:00:00 2001 From: Dave Willmer Date: Wed, 19 Jul 2017 23:55:10 +0100 Subject: [PATCH 7/8] Update tests --- pandas/tests/reshape/test_merge.py | 21 ++++++++++++++------- 1 file changed, 14 insertions(+), 7 deletions(-) diff --git a/pandas/tests/reshape/test_merge.py b/pandas/tests/reshape/test_merge.py index 396ee6ec2375f..765e8e28b43fd 100644 --- a/pandas/tests/reshape/test_merge.py +++ b/pandas/tests/reshape/test_merge.py @@ -1515,7 +1515,7 @@ def test_self_join_multiple_categories(self): assert_frame_equal(result, df) - def test_categorical_dates(self): + def test_dtype_on_categorical_dates(self): # GH 16900 # dates should not be coerced to ints @@ -1533,14 +1533,21 @@ def test_categorical_dates(self): ) df2['date'] = df2['date'].astype('category') - result = pd.merge(df, df2, how='outer', on=['date']) - assert result.shape == (3, 3) - assert result['date'].iloc[0] == pd.Timestamp('2001-01-01') - assert result['date'].iloc[-1] == pd.Timestamp('2001-01-03') + expected_outer = pd.DataFrame([ + [pd.Timestamp('2001-01-01'), 1.1, 1.3], + [pd.Timestamp('2001-01-02'), 1.3, np.nan], + [pd.Timestamp('2001-01-03'), np.nan, 1.4]], + columns=['date', 'num2', 'num4'] + ) + result_outer = pd.merge(df, df2, how='outer', on=['date']) + assert_frame_equal(result_outer, expected_outer) + expected_inner = pd.DataFrame( + [[pd.Timestamp('2001-01-01'), 1.1, 1.3]], + columns=['date', 'num2', 'num4'] + ) result_inner = pd.merge(df, df2, how='inner', on=['date']) - assert result_inner.shape == (1, 3) - assert result_inner['date'].iloc[-1] == pd.Timestamp('2001-01-01') + assert_frame_equal(result_inner, expected_inner) @pytest.fixture From 1ea197744f83c6af99bc8ddace582c8eea3e877e Mon Sep 17 00:00:00 2001 From: Dave Willmer Date: Thu, 20 Jul 2017 21:29:06 +0100 Subject: [PATCH 8/8] Minor tweaks + comment --- pandas/core/reshape/merge.py | 20 +++++++++++++------- 1 file changed, 13 insertions(+), 7 deletions(-) diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py index 7ca4a1165c782..a0aa328ba90d3 100644 --- a/pandas/core/reshape/merge.py +++ b/pandas/core/reshape/merge.py @@ -889,13 +889,16 @@ def _maybe_coerce_merge_keys(self): if (len(lk) and not len(rk)) or (not len(lk) and len(rk)): continue + lk_is_cat = is_categorical_dtype(lk) + rk_is_cat = is_categorical_dtype(rk) + # if either left or right is a categorical # then the must match exactly in categories & ordered - if is_categorical_dtype(lk) and is_categorical_dtype(rk): + if lk_is_cat and rk_is_cat: if lk.is_dtype_equal(rk): continue - elif is_categorical_dtype(lk) or is_categorical_dtype(rk): + elif lk_is_cat or rk_is_cat: pass elif is_dtype_equal(lk.dtype, rk.dtype): @@ -914,15 +917,18 @@ def _maybe_coerce_merge_keys(self): continue # Houston, we have a problem! - # let's coerce to object + # let's coerce to object if the dtypes aren't + # categorical, otherwise coerce to the category + # dtype. If we coerced categories to object, + # then we would lose type information on some + # columns, and end up trying to merge + # incompatible dtypes. See GH 16900. if name in self.left.columns: - cat = is_categorical_dtype(lk) - typ = lk.categories.dtype if cat else object + typ = lk.categories.dtype if lk_is_cat else object self.left = self.left.assign( **{name: self.left[name].astype(typ)}) if name in self.right.columns: - cat = is_categorical_dtype(rk) - typ = rk.categories.dtype if cat else object + typ = rk.categories.dtype if rk_is_cat else object self.right = self.right.assign( **{name: self.right[name].astype(typ)})