Stop concat from attempting to sort mismatched columns by default (#20613)

* Stop concat from attempting to sort mismatched columns by default. Preserve column order upon concatenation to obey least astonishment principle. Allow old behavior to be enabled by adding a boolean switch to concat and DataFrame.append, mismatch_sort, which is by default disabled. Closes #4588
pandas-dev · May 1, 2018 · c4da79b · c4da79b
1 parent 93e7123
commit c4da79b
Show file tree

Hide file tree

Showing 19 changed files with 400 additions and 91 deletions.
diff --git a/doc/source/merging.rst b/doc/source/merging.rst
@@ -153,10 +153,10 @@ Set logic on the other axes
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~
 
 When gluing together multiple DataFrames, you have a choice of how to handle
-the other axes (other than the one being concatenated). This can be done in 
+the other axes (other than the one being concatenated). This can be done in
 the following three ways:
 
-- Take the (sorted) union of them all, ``join='outer'``. This is the default
+- Take the union of them all, ``join='outer'``. This is the default
   option as it results in zero information loss.
 - Take the intersection, ``join='inner'``.
 - Use a specific index, as passed to the ``join_axes`` argument.
@@ -167,10 +167,10 @@ behavior:
 .. ipython:: python
 
    df4 = pd.DataFrame({'B': ['B2', 'B3', 'B6', 'B7'],
-                    'D': ['D2', 'D3', 'D6', 'D7'],
-                    'F': ['F2', 'F3', 'F6', 'F7']},
-                   index=[2, 3, 6, 7])
-   result = pd.concat([df1, df4], axis=1)
+                       'D': ['D2', 'D3', 'D6', 'D7'],
+                       'F': ['F2', 'F3', 'F6', 'F7']},
+                      index=[2, 3, 6, 7])
+   result = pd.concat([df1, df4], axis=1, sort=False)
 
 
 .. ipython:: python
@@ -181,8 +181,16 @@ behavior:
           labels=['df1', 'df4'], vertical=False);
    plt.close('all');
 
-Note that the row indexes have been unioned and sorted. Here is the same thing
-with ``join='inner'``:
+.. warning::
+
+   .. versionchanged:: 0.23.0
+
+   The default behavior with ``join='outer'`` is to sort the other axis
+   (rows in this case). In a future version of pandas, the default will
+   be to not sort. We specified ``sort=False`` to opt in to the new
+   behavior now.
+
+Here is the same thing with ``join='inner'``:
 
 .. ipython:: python
 

diff --git a/doc/source/whatsnew/v0.23.0.txt b/doc/source/whatsnew/v0.23.0.txt
@@ -694,6 +694,36 @@ Returning a ``Series`` allows one to control the exact return structure and colu
 
     df.apply(lambda x: Series([1, 2, 3], index=['D', 'E', 'F']), axis=1)
 
+.. _whatsnew_0230.api_breaking.concat:
+
+Concatenation will no longer sort
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+In a future version of pandas :func:`pandas.concat` will no longer sort the non-concatenation axis when it is not already aligned.
+The current behavior is the same as the previous (sorting), but now a warning is issued when ``sort`` is not specified and the non-concatenation axis is not aligned (:issue:`4588`).
+
+.. ipython:: python
+   :okwarning:
+
+   df1 = pd.DataFrame({"a": [1, 2], "b": [1, 2]}, columns=['b', 'a'])
+   df2 = pd.DataFrame({"a": [4, 5]})
+
+   pd.concat([df1, df2])
+
+To keep the previous behavior (sorting) and silence the warning, pass ``sort=True``
+
+.. ipython:: python
+
+   pd.concat([df1, df2], sort=True)
+
+To accept the future behavior (no sorting), pass ``sort=False``
+
+.. ipython:: python
+
+   pd.concat([df1, df2], sort=False)
+
+Note that this change also applies to :meth:`DataFrame.append`, which has also received a ``sort`` keyword for controlling this behavior.
+
 
 .. _whatsnew_0230.api_breaking.build_changes:
 

diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx
@@ -157,7 +157,7 @@ def fast_unique_multiple(list arrays):
 
 @cython.wraparound(False)
 @cython.boundscheck(False)
-def fast_unique_multiple_list(list lists):
+def fast_unique_multiple_list(list lists, bint sort=True):
     cdef:
         list buf
         Py_ssize_t k = len(lists)
@@ -174,10 +174,11 @@ def fast_unique_multiple_list(list lists):
             if val not in table:
                 table[val] = stub
                 uniques.append(val)
-    try:
-        uniques.sort()
-    except Exception:
-        pass
+    if sort:
+        try:
+            uniques.sort()
+        except Exception:
+            pass
 
     return uniques
 

diff --git a/pandas/core/base.py b/pandas/core/base.py
@@ -507,7 +507,7 @@ def is_any_frame():
                            for r in compat.itervalues(result))
 
             if isinstance(result, list):
-                return concat(result, keys=keys, axis=1), True
+                return concat(result, keys=keys, axis=1, sort=True), True
 
             elif is_any_frame():
                 # we have a dict of DataFrames

diff --git a/pandas/core/frame.py b/pandas/core/frame.py
@@ -6073,7 +6073,8 @@ def infer(x):
     # ----------------------------------------------------------------------
     # Merging / joining methods
 
-    def append(self, other, ignore_index=False, verify_integrity=False):
+    def append(self, other, ignore_index=False,
+               verify_integrity=False, sort=None):
         """
         Append rows of `other` to the end of this frame, returning a new
         object. Columns not in this frame are added as new columns.
@@ -6086,6 +6087,14 @@ def append(self, other, ignore_index=False, verify_integrity=False):
             If True, do not use the index labels.
         verify_integrity : boolean, default False
             If True, raise ValueError on creating index with duplicates.
+        sort : boolean, default None
+            Sort columns if the columns of `self` and `other` are not aligned.
+            The default sorting is deprecated and will change to not-sorting
+            in a future version of pandas. Explicitly pass ``sort=True`` to
+            silence the warning and sort. Explicitly pass ``sort=False`` to
+            silence the warning and not sort.
+
+            .. versionadded:: 0.23.0
 
         Returns
         -------
@@ -6197,7 +6206,8 @@ def append(self, other, ignore_index=False, verify_integrity=False):
         else:
             to_concat = [self, other]
         return concat(to_concat, ignore_index=ignore_index,
-                      verify_integrity=verify_integrity)
+                      verify_integrity=verify_integrity,
+                      sort=sort)
 
     def join(self, other, on=None, how='left', lsuffix='', rsuffix='',
              sort=False):
@@ -7516,7 +7526,7 @@ def _list_of_series_to_arrays(data, columns, coerce_float=False, dtype=None):
     from pandas.core.index import _get_objs_combined_axis
 
     if columns is None:
-        columns = _get_objs_combined_axis(data)
+        columns = _get_objs_combined_axis(data, sort=False)
 
     indexer_cache = {}
 

diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py
@@ -1101,7 +1101,8 @@ def reset_identity(values):
                 group_names = self.grouper.names
 
                 result = concat(values, axis=self.axis, keys=group_keys,
-                                levels=group_levels, names=group_names)
+                                levels=group_levels, names=group_names,
+                                sort=False)
             else:
 
                 # GH5610, returns a MI, with the first level being a

diff --git a/pandas/core/indexes/api.py b/pandas/core/indexes/api.py
@@ -1,3 +1,6 @@
+import textwrap
+import warnings
+
 from pandas.core.indexes.base import (Index,
                                       _new_Index,
                                       _ensure_index,
@@ -17,6 +20,16 @@
 from pandas._libs import lib
 from pandas._libs.tslib import NaT
 
+_sort_msg = textwrap.dedent("""\
+Sorting because non-concatenation axis is not aligned. A future version
+of pandas will change to not sort by default.
+
+To accept the future behavior, pass 'sort=False'.
+
+To retain the current behavior and silence the warning, pass 'sort=True'.
+""")
+
+
 # TODO: there are many places that rely on these private methods existing in
 # pandas.core.index
 __all__ = ['Index', 'MultiIndex', 'NumericIndex', 'Float64Index', 'Int64Index',
@@ -31,33 +44,40 @@
            '_all_indexes_same']
 
 
-def _get_objs_combined_axis(objs, intersect=False, axis=0):
+def _get_objs_combined_axis(objs, intersect=False, axis=0, sort=True):
     # Extract combined index: return intersection or union (depending on the
     # value of "intersect") of indexes on given axis, or None if all objects
     # lack indexes (e.g. they are numpy arrays)
     obs_idxes = [obj._get_axis(axis) for obj in objs
                  if hasattr(obj, '_get_axis')]
     if obs_idxes:
-        return _get_combined_index(obs_idxes, intersect=intersect)
+        return _get_combined_index(obs_idxes, intersect=intersect, sort=sort)
 
 
-def _get_combined_index(indexes, intersect=False):
+def _get_combined_index(indexes, intersect=False, sort=False):
     # TODO: handle index names!
     indexes = com._get_distinct_objs(indexes)
     if len(indexes) == 0:
-        return Index([])
-    if len(indexes) == 1:
-        return indexes[0]
-    if intersect:
+        index = Index([])
+    elif len(indexes) == 1:
+        index = indexes[0]
+    elif intersect:
         index = indexes[0]
         for other in indexes[1:]:
             index = index.intersection(other)
-        return index
-    union = _union_indexes(indexes)
-    return _ensure_index(union)
+    else:
+        index = _union_indexes(indexes, sort=sort)
+        index = _ensure_index(index)
+
+    if sort:
+        try:
+            index = index.sort_values()
+        except TypeError:
+            pass
+    return index
 
 
-def _union_indexes(indexes):
+def _union_indexes(indexes, sort=True):
     if len(indexes) == 0:
         raise AssertionError('Must have at least 1 Index to union')
     if len(indexes) == 1:
@@ -74,7 +94,8 @@ def conv(i):
                 i = i.tolist()
             return i
 
-        return Index(lib.fast_unique_multiple_list([conv(i) for i in inds]))
+        return Index(
+            lib.fast_unique_multiple_list([conv(i) for i in inds], sort=sort))
 
     if kind == 'special':
         result = indexes[0]
@@ -89,13 +110,19 @@ def conv(i):
         index = indexes[0]
         for other in indexes[1:]:
             if not index.equals(other):
+
+                if sort is None:
+                    # TODO: remove once pd.concat sort default changes
+                    warnings.warn(_sort_msg, FutureWarning, stacklevel=8)
+                    sort = True
+
                 return _unique_indices(indexes)
 
         name = _get_consensus_names(indexes)[0]
         if name != index.name:
             index = index._shallow_copy(name=name)
         return index
-    else:
+    else:  # kind='list'
         return _unique_indices(indexes)
 
 

diff --git a/pandas/core/panel.py b/pandas/core/panel.py
@@ -1499,8 +1499,11 @@ def _extract_axis(self, data, axis=0, intersect=False):
                 raw_lengths.append(v.shape[axis])
 
         if have_frames:
+            # we want the "old" behavior here, of sorting only if
+            # 1. we're doing a union (intersect=False)
+            # 2. the indices are not aligned.
             index = _get_objs_combined_axis(data.values(), axis=axis,
-                                            intersect=intersect)
+                                            intersect=intersect, sort=None)
 
         if have_raw_arrays:
             lengths = list(set(raw_lengths))

diff --git a/pandas/core/reshape/concat.py b/pandas/core/reshape/concat.py
@@ -20,7 +20,7 @@
 
 def concat(objs, axis=0, join='outer', join_axes=None, ignore_index=False,
            keys=None, levels=None, names=None, verify_integrity=False,
-           copy=True):
+           sort=None, copy=True):
     """
     Concatenate pandas objects along a particular axis with optional set logic
     along the other axes.
@@ -60,6 +60,19 @@ def concat(objs, axis=0, join='outer', join_axes=None, ignore_index=False,
     verify_integrity : boolean, default False
         Check whether the new concatenated axis contains duplicates. This can
         be very expensive relative to the actual data concatenation
+    sort : boolean, default None
+        Sort non-concatenation axis if it is not already aligned when `join`
+        is 'outer'. The current default of sorting is deprecated and will
+        change to not-sorting in a future version of pandas.
+
+        Explicitly pass ``sort=True`` to silence the warning and sort.
+        Explicitly pass ``sort=False`` to silence the warning and not sort.
+
+        This has no effect when ``join='inner'``, which already preserves
+        the order of the non-concatenation axis.
+
+        .. versionadded:: 0.23.0
+
     copy : boolean, default True
         If False, do not copy data unnecessarily
 
@@ -209,7 +222,7 @@ def concat(objs, axis=0, join='outer', join_axes=None, ignore_index=False,
                        ignore_index=ignore_index, join=join,
                        keys=keys, levels=levels, names=names,
                        verify_integrity=verify_integrity,
-                       copy=copy)
+                       copy=copy, sort=sort)
     return op.get_result()
 
 
@@ -220,7 +233,8 @@ class _Concatenator(object):
 
     def __init__(self, objs, axis=0, join='outer', join_axes=None,
                  keys=None, levels=None, names=None,
-                 ignore_index=False, verify_integrity=False, copy=True):
+                 ignore_index=False, verify_integrity=False, copy=True,
+                 sort=False):
         if isinstance(objs, (NDFrame, compat.string_types)):
             raise TypeError('first argument must be an iterable of pandas '
                             'objects, you passed an object of type '
@@ -355,6 +369,7 @@ def __init__(self, objs, axis=0, join='outer', join_axes=None,
         self.keys = keys
         self.names = names or getattr(keys, 'names', None)
         self.levels = levels
+        self.sort = sort
 
         self.ignore_index = ignore_index
         self.verify_integrity = verify_integrity
@@ -447,7 +462,8 @@ def _get_comb_axis(self, i):
         data_axis = self.objs[0]._get_block_manager_axis(i)
         try:
             return _get_objs_combined_axis(self.objs, axis=data_axis,
-                                           intersect=self.intersect)
+                                           intersect=self.intersect,
+                                           sort=self.sort)
         except IndexError:
             types = [type(x).__name__ for x in self.objs]
             raise TypeError("Cannot concatenate list of {types}"

diff --git a/pandas/core/reshape/pivot.py b/pandas/core/reshape/pivot.py
@@ -449,7 +449,8 @@ def crosstab(index, columns, values=None, rownames=None, colnames=None,
     rownames = _get_names(index, rownames, prefix='row')
     colnames = _get_names(columns, colnames, prefix='col')
 
-    common_idx = _get_objs_combined_axis(index + columns, intersect=True)
+    common_idx = _get_objs_combined_axis(index + columns, intersect=True,
+                                         sort=False)
 
     data = {}
     data.update(zip(rownames, index))

diff --git a/pandas/tests/frame/test_combine_concat.py b/pandas/tests/frame/test_combine_concat.py
@@ -96,7 +96,7 @@ def test_append_series_dict(self):
 
         result = df.append(series[::-1][:3], ignore_index=True)
         expected = df.append(DataFrame({0: series[::-1][:3]}).T,
-                             ignore_index=True)
+                             ignore_index=True, sort=True)
         assert_frame_equal(result, expected.loc[:, result.columns])
 
         # can append when name set
@@ -119,8 +119,8 @@ def test_append_list_of_series_dicts(self):
         # different columns
         dicts = [{'foo': 1, 'bar': 2, 'baz': 3, 'peekaboo': 4},
                  {'foo': 5, 'bar': 6, 'baz': 7, 'peekaboo': 8}]
-        result = df.append(dicts, ignore_index=True)
-        expected = df.append(DataFrame(dicts), ignore_index=True)
+        result = df.append(dicts, ignore_index=True, sort=True)
+        expected = df.append(DataFrame(dicts), ignore_index=True, sort=True)
         assert_frame_equal(result, expected)
 
     def test_append_empty_dataframe(self):