BUG/API: .merge() and .join() on category dtype columns will now pres…

…erve category dtype closes #10409 Author: Jeff Reback <jeff@reback.net> Closes #15321 from jreback/merge_cat and squashes the following commits: 3671dad [Jeff Reback] DOC: merge docs a4b2ee6 [Jeff Reback] BUG/API: .merge() and .join() on category dtype columns will now preserve the category dtype when possible
pandas-dev · Mar 10, 2017 · 026e748 · jorisvandenbossche · Mar 13, 2017 · jreback
1 parent 5dee1f1
commit 026e748
Show file tree

Hide file tree

Showing 10 changed files with 364 additions and 71 deletions.
diff --git a/asv_bench/benchmarks/join_merge.py b/asv_bench/benchmarks/join_merge.py
@@ -6,7 +6,7 @@
     from pandas import ordered_merge as merge_ordered
 
 
-#----------------------------------------------------------------------
+# ----------------------------------------------------------------------
 # Append
 
 class Append(object):
@@ -35,7 +35,7 @@ def time_append_mixed(self):
         self.mdf1.append(self.mdf2)
 
 
-#----------------------------------------------------------------------
+# ----------------------------------------------------------------------
 # Concat
 
 class Concat(object):
@@ -120,7 +120,7 @@ def time_f_ordered_axis1(self):
         concat(self.frames_f, axis=1, ignore_index=True)
 
 
-#----------------------------------------------------------------------
+# ----------------------------------------------------------------------
 # Joins
 
 class Join(object):
@@ -202,7 +202,7 @@ def time_join_non_unique_equal(self):
         (self.fracofday * self.temp[self.fracofday.index])
 
 
-#----------------------------------------------------------------------
+# ----------------------------------------------------------------------
 # Merges
 
 class Merge(object):
@@ -257,7 +257,31 @@ def time_i8merge(self):
         merge(self.left, self.right, how='outer')
 
 
-#----------------------------------------------------------------------
+class MergeCategoricals(object):
+    goal_time = 0.2
+
+    def setup(self):
+        self.left_object = pd.DataFrame(
+            {'X': np.random.choice(range(0, 10), size=(10000,)),
+             'Y': np.random.choice(['one', 'two', 'three'], size=(10000,))})
+
+        self.right_object = pd.DataFrame(
+            {'X': np.random.choice(range(0, 10), size=(10000,)),
+             'Z': np.random.choice(['jjj', 'kkk', 'sss'], size=(10000,))})
+
+        self.left_cat = self.left_object.assign(
+            Y=self.left_object['Y'].astype('category'))
+        self.right_cat = self.right_object.assign(
+            Z=self.right_object['Z'].astype('category'))
+
+    def time_merge_object(self):
+        merge(self.left_object, self.right_object, on='X')
+
+    def time_merge_cat(self):
+        merge(self.left_cat, self.right_cat, on='X')
+
+
+# ----------------------------------------------------------------------
 # Ordered merge
 
 class MergeOrdered(object):
@@ -332,7 +356,7 @@ def time_multiby(self):
         merge_asof(self.df1e, self.df2e, on='time', by=['key', 'key2'])
 
 
-#----------------------------------------------------------------------
+# ----------------------------------------------------------------------
 # data alignment
 
 class Align(object):

diff --git a/doc/source/categorical.rst b/doc/source/categorical.rst
@@ -646,6 +646,9 @@ In this case the categories are not the same and so an error is raised:
 
 The same applies to ``df.append(df_different)``.
 
+See also the section on :ref:`merge dtypes<merging.dtypes>` for notes about preserving merge dtypes and performance.
+
+
 .. _categorical.union:
 
 Unioning

diff --git a/doc/source/merging.rst b/doc/source/merging.rst
@@ -746,6 +746,79 @@ The ``indicator`` argument will also accept string arguments, in which case the
    pd.merge(df1, df2, on='col1', how='outer', indicator='indicator_column')
 
 
+.. _merging.dtypes:
+
+Merge Dtypes
+~~~~~~~~~~~~
+
+.. versionadded:: 0.19.0
+
+Merging will preserve the dtype of the join keys.
+
+.. ipython:: python
+
+   left = pd.DataFrame({'key': [1], 'v1': [10]})
+   left
+   right = pd.DataFrame({'key': [1, 2], 'v1': [20, 30]})
+   right
+
+We are able to preserve the dtype of the join keys.
+
+.. ipython:: python
+
+   pd.merge(left, right, how='outer')
+   pd.merge(left, right, how='outer').dtypes
+
+Of course, if missing values are introduced, the resulting dtype
+will be upcast.
+
+.. ipython:: python
+
+   pd.merge(left, right, how='outer', on='key')
+   pd.merge(left, right, how='outer', on='key').dtypes
+
+.. versionadded:: 0.20.0
+
+Merging will preserve ``category`` dtypes of the mergands.
+
+The left frame.
+
+.. ipython:: python
+
+   X = pd.Series(np.random.choice(['foo', 'bar'], size=(10,)))
+   X = X.astype('category', categories=['foo', 'bar'])
+
+   left = DataFrame({'X': X,
+                     'Y': np.random.choice(['one', 'two', 'three'], size=(10,))})
+   left
+   left.dtypes
+
+The right frame.
+
+.. ipython:: python
+
+   right = DataFrame({'X': Series(['foo', 'bar']).astype('category', categories=['foo', 'bar']),
+                      'Z': [1, 2]})
+   right
+   right.dtypes
+
+The merged result.
+
+.. ipython:: python
+
+   result = pd.merge(left, right, how='outer')
+   result
+   result.dtypes
+
+.. note::
+
+   The category dtypes must be *exactly* the same, meaning the same categories and the ordered attribute.
+   Otherwise the result will coerce to ``object`` dtype.
+
+.. note::
+
+   Merging on ``category`` dtypes that are the same can be quite performant compared to ``object`` dtype merging.
+
 .. _merging.join.index:
 
 Joining on index

diff --git a/doc/source/whatsnew/v0.20.0.txt b/doc/source/whatsnew/v0.20.0.txt
@@ -692,7 +692,7 @@ Other API Changes
 - Reorganization of timeseries development tests (:issue:`14854`)
 - Specific support for ``copy.copy()`` and ``copy.deepcopy()`` functions on NDFrame objects (:issue:`15444`)
 - ``Series.sort_values()`` accepts a one element list of bool for consistency with the behavior of ``DataFrame.sort_values()`` (:issue:`15604`)
-- ``DataFrame.iterkv()`` has been removed in favor of ``DataFrame.iteritems()`` (:issue:`10711`)
+- ``.merge()`` and ``.join()`` on ``category`` dtype columns will now preserve the category dtype when possible (:issue:`10409`)
 
 .. _whatsnew_0200.deprecations:
 
@@ -733,6 +733,7 @@ Removal of prior version deprecations/changes
 - ``Series.is_time_series`` is dropped in favor of ``Series.index.is_all_dates`` (:issue:`15098`)
 - The deprecated ``irow``, ``icol``, ``iget`` and ``iget_value`` methods are removed
   in favor of ``iloc`` and ``iat`` as explained :ref:`here <whatsnew_0170.deprecations>` (:issue:`10711`).
+- The deprecated ``DataFrame.iterkv()`` has been removed in favor of ``DataFrame.iteritems()`` (:issue:`10711`)
 
 
 .. _whatsnew_0200.performance:
@@ -749,6 +750,7 @@ Performance Improvements
 - When reading buffer object in ``read_sas()`` method without specified format, filepath string is inferred rather than buffer object. (:issue:`14947`)
 - Improved performance of ``.rank()`` for categorical data (:issue:`15498`)
 - Improved performance when using ``.unstack()`` (:issue:`15503`)
+- Improved performance of merge/join on ``category`` columns (:issue:`10409`)
 
 
 .. _whatsnew_0200.bug_fixes:

diff --git a/pandas/core/internals.py b/pandas/core/internals.py
@@ -5227,6 +5227,8 @@ def get_reindexed_values(self, empty_dtype, upcasted_na):
                 # External code requested filling/upcasting, bool values must
                 # be upcasted to object to avoid being upcasted to numeric.
                 values = self.block.astype(np.object_).values
+            elif self.block.is_categorical:
+                values = self.block.values
             else:
                 # No dtype upcasting is done here, it will be performed during
                 # concatenation itself.

diff --git a/pandas/tests/test_categorical.py b/pandas/tests/test_categorical.py
@@ -4097,9 +4097,12 @@ def test_merge(self):
         expected = df.copy()
 
         # object-cat
+        # note that we propagate the category
+        # because we don't have any matching rows
         cright = right.copy()
         cright['d'] = cright['d'].astype('category')
         result = pd.merge(left, cright, how='left', left_on='b', right_on='c')
+        expected['d'] = expected['d'].astype('category', categories=['null'])
         tm.assert_frame_equal(result, expected)
 
         # cat-object