BUG: DataFrame.sort_index broken if not both lexsorted and monotonic …

…in levels closes #15622 closes #15687 closes #14015 closes #13431 Author: Jeff Reback <jeff@reback.net> Closes #15694 from jreback/sort3 and squashes the following commits: bd17d2b [Jeff Reback] rename sort_index_montonic -> _sort_index_monotonic 31097fc [Jeff Reback] add doc-strings, rename sort_monotonic -> sort_levels_monotonic 48249ab [Jeff Reback] add doc example 527c3a6 [Jeff Reback] simpler algo for remove_used_levels 520c9c1 [Jeff Reback] versionadded tags f2ddc9c [Jeff Reback] replace _reconstruct with: sort_monotonic, and remove_unused_levels (public) 3c4ca22 [Jeff Reback] add degenerate test case 269cb3b [Jeff Reback] small doc updates b234bdb [Jeff Reback] support for removing unused levels (internally) 7be8941 [Jeff Reback] incorrectly raising KeyError rather than UnsortedIndexError, caught by doc-example 47c67d6 [Jeff Reback] BUG: construct MultiIndex identically from levels/labels when concatting
pandas-dev · Apr 7, 2017 · f478e4f · f478e4f
1 parent 3b53202
commit f478e4f
Show file tree

Hide file tree

Showing 15 changed files with 593 additions and 57 deletions.
diff --git a/asv_bench/benchmarks/timeseries.py b/asv_bench/benchmarks/timeseries.py
@@ -292,7 +292,10 @@ def setup(self):
         self.rng3 = date_range(start='1/1/2000', periods=1500000, freq='S')
         self.ts3 = Series(1, index=self.rng3)
 
-    def time_sort_index(self):
+    def time_sort_index_monotonic(self):
+        self.ts2.sort_index()
+
+    def time_sort_index_non_monotonic(self):
         self.ts.sort_index()
 
     def time_timeseries_slice_minutely(self):

diff --git a/doc/source/advanced.rst b/doc/source/advanced.rst
@@ -136,7 +136,7 @@ can find yourself working with hierarchically-indexed data without creating a
 may wish to generate your own ``MultiIndex`` when preparing the data set.
 
 Note that how the index is displayed by be controlled using the
-``multi_sparse`` option in ``pandas.set_printoptions``:
+``multi_sparse`` option in ``pandas.set_options()``:
 
 .. ipython:: python
 
@@ -175,35 +175,40 @@ completely analogous way to selecting a column in a regular DataFrame:
 See :ref:`Cross-section with hierarchical index <advanced.xs>` for how to select
 on a deeper level.
 
-.. note::
+.. _advanced.shown_levels:
+
+Defined Levels
+~~~~~~~~~~~~~~
+
+The repr of a ``MultiIndex`` shows ALL the defined levels of an index, even
+if the they are not actually used. When slicing an index, you may notice this.
+For example:
 
-   The repr of a ``MultiIndex`` shows ALL the defined levels of an index, even
-   if the they are not actually used. When slicing an index, you may notice this.
-   For example:
+.. ipython:: python
 
-   .. ipython:: python
+   # original multi-index
+   df.columns
 
-      # original multi-index
-      df.columns
+   # sliced
+   df[['foo','qux']].columns
 
-      # sliced
-      df[['foo','qux']].columns
+This is done to avoid a recomputation of the levels in order to make slicing
+highly performant. If you want to see the actual used levels.
 
-   This is done to avoid a recomputation of the levels in order to make slicing
-   highly performant. If you want to see the actual used levels.
+.. ipython:: python
 
-   .. ipython:: python
+   df[['foo','qux']].columns.values
 
-      df[['foo','qux']].columns.values
+   # for a specific level
+   df[['foo','qux']].columns.get_level_values(0)
 
-      # for a specific level
-      df[['foo','qux']].columns.get_level_values(0)
+To reconstruct the multiindex with only the used levels
 
-   To reconstruct the multiindex with only the used levels
+.. versionadded:: 0.20.0
 
-   .. ipython:: python
+.. ipython:: python
 
-      pd.MultiIndex.from_tuples(df[['foo','qux']].columns.values)
+   df[['foo','qux']].columns.remove_unused_levels()
 
 Data alignment and using ``reindex``
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
@@ -288,7 +293,7 @@ As usual, **both sides** of the slicers are included as this is label indexing.
 
    .. code-block:: python
 
-      df.loc[(slice('A1','A3'),.....),:]
+      df.loc[(slice('A1','A3'),.....), :]
 
    rather than this:
 
@@ -317,51 +322,51 @@ Basic multi-index slicing using slices, lists, and labels.
 
 .. ipython:: python
 
-   dfmi.loc[(slice('A1','A3'),slice(None), ['C1','C3']),:]
+   dfmi.loc[(slice('A1','A3'), slice(None), ['C1', 'C3']), :]
 
 You can use a ``pd.IndexSlice`` to have a more natural syntax using ``:`` rather than using ``slice(None)``
 
 .. ipython:: python
 
    idx = pd.IndexSlice
-   dfmi.loc[idx[:,:,['C1','C3']],idx[:,'foo']]
+   dfmi.loc[idx[:, :, ['C1', 'C3']], idx[:, 'foo']]
 
 It is possible to perform quite complicated selections using this method on multiple
 axes at the same time.
 
 .. ipython:: python
 
-   dfmi.loc['A1',(slice(None),'foo')]
-   dfmi.loc[idx[:,:,['C1','C3']],idx[:,'foo']]
+   dfmi.loc['A1', (slice(None), 'foo')]
+   dfmi.loc[idx[:, :, ['C1', 'C3']], idx[:, 'foo']]
 
 Using a boolean indexer you can provide selection related to the *values*.
 
 .. ipython:: python
 
-   mask = dfmi[('a','foo')]>200
-   dfmi.loc[idx[mask,:,['C1','C3']],idx[:,'foo']]
+   mask = dfmi[('a', 'foo')] > 200
+   dfmi.loc[idx[mask, :, ['C1', 'C3']], idx[:, 'foo']]
 
 You can also specify the ``axis`` argument to ``.loc`` to interpret the passed
 slicers on a single axis.
 
 .. ipython:: python
 
-   dfmi.loc(axis=0)[:,:,['C1','C3']]
+   dfmi.loc(axis=0)[:, :, ['C1', 'C3']]
 
 Furthermore you can *set* the values using these methods
 
 .. ipython:: python
 
    df2 = dfmi.copy()
-   df2.loc(axis=0)[:,:,['C1','C3']] = -10
+   df2.loc(axis=0)[:, :, ['C1', 'C3']] = -10
    df2
 
 You can use a right-hand-side of an alignable object as well.
 
 .. ipython:: python
 
    df2 = dfmi.copy()
-   df2.loc[idx[:,:,['C1','C3']],:] = df2*1000
+   df2.loc[idx[:, :, ['C1', 'C3']], :] = df2 * 1000
    df2
 
 .. _advanced.xs:

diff --git a/doc/source/api.rst b/doc/source/api.rst
@@ -1432,6 +1432,7 @@ MultiIndex Components
    MultiIndex.droplevel
    MultiIndex.swaplevel
    MultiIndex.reorder_levels
+   MultiIndex.remove_unused_levels
 
 .. _api.datetimeindex:
 

diff --git a/doc/source/whatsnew/v0.20.0.txt b/doc/source/whatsnew/v0.20.0.txt
@@ -366,6 +366,8 @@ Other Enhancements
 - ``pandas.io.json.json_normalize()`` with an empty ``list`` will return an empty ``DataFrame`` (:issue:`15534`)
 - ``pandas.io.json.json_normalize()`` has gained a ``sep`` option that accepts ``str`` to separate joined fields; the default is ".", which is backward compatible. (:issue:`14883`)
 - ``pd.read_csv()`` will now raise a ``csv.Error`` error whenever an end-of-file character is encountered in the middle of a data row (:issue:`15913`)
+- A new function has been added to a ``MultiIndex`` to facilitate :ref:`Removing Unused Levels <advanced.shown_levels>`. (:issue:`15694`)
+- :func:`MultiIndex.remove_unused_levels` has been added to facilitate :ref:`removing unused levels <advanced.shown_levels>`. (:issue:`15694`)
 
 
 .. _ISO 8601 duration: https://en.wikipedia.org/wiki/ISO_8601#Durations
@@ -714,6 +716,72 @@ If indicated, a deprecation warning will be issued if you reference that module.
     "pandas._hash", "pandas.tools.libhash", ""
     "pandas._window", "pandas.core.libwindow", ""
 
+.. _whatsnew_0200.api_breaking.sort_index:
+
+DataFrame.sort_index changes
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+In certain cases, calling ``.sort_index()`` on a MultiIndexed DataFrame would return the *same* DataFrame without seeming to sort.
+This would happen with a ``lexsorted``, but non-monotonic levels. (:issue:`15622`, :issue:`15687`, :issue:`14015`, :issue:`13431`)
+
+This is UNCHANGED between versions, but showing for illustration purposes:
+
+.. ipython:: python
+
+    df = DataFrame(np.arange(6), columns=['value'], index=MultiIndex.from_product([list('BA'), range(3)]))
+    df
+
+.. ipython:: python
+
+    df.index.is_lexsorted()
+    df.index.is_monotonic
+
+Sorting works as expected
+
+.. ipython:: python
+
+    df.sort_index()
+
+.. ipython:: python
+
+    df.sort_index().index.is_lexsorted()
+    df.sort_index().index.is_monotonic
+
+However, this example, which has a non-monotonic 2nd level,
+doesn't behave as desired.
+
+.. ipython:: python
+   df = pd.DataFrame(
+           {'value': [1, 2, 3, 4]},
+            index=pd.MultiIndex(levels=[['a', 'b'], ['bb', 'aa']],
+                                labels=[[0, 0, 1, 1], [0, 1, 0, 1]]))
+
+Previous Behavior:
+
+.. ipython:: python
+
+   In [11]: df.sort_index()
+   Out[11]:
+         value
+   a bb      1
+     aa      2
+   b bb      3
+     aa      4
+
+   In [14]: df.sort_index().index.is_lexsorted()
+   Out[14]: True
+
+   In [15]: df.sort_index().index.is_monotonic
+   Out[15]: False
+
+New Behavior:
+
+.. ipython:: python
+
+   df.sort_index()
+   df.sort_index().index.is_lexsorted()
+   df.sort_index().index.is_monotonic
+
 
 .. _whatsnew_0200.api_breaking.groupby_describe:
 
@@ -965,7 +1033,7 @@ Performance Improvements
 - Improve performance of ``pd.core.groupby.GroupBy.apply`` when the applied
   function used the ``.name`` attribute of the group DataFrame (:issue:`15062`).
 - Improved performance of ``iloc`` indexing with a list or array (:issue:`15504`).
-
+- Improved performance of ``Series.sort_index()`` with a monotonic index (:issue:`15694`)
 
 .. _whatsnew_0200.bug_fixes:
 

diff --git a/pandas/core/frame.py b/pandas/core/frame.py
@@ -3322,6 +3322,10 @@ def trans(v):
     def sort_index(self, axis=0, level=None, ascending=True, inplace=False,
                    kind='quicksort', na_position='last', sort_remaining=True,
                    by=None):
+
+        # TODO: this can be combined with Series.sort_index impl as
+        # almost identical
+
         inplace = validate_bool_kwarg(inplace, 'inplace')
         # 10726
         if by is not None:
@@ -3335,8 +3339,7 @@ def sort_index(self, axis=0, level=None, ascending=True, inplace=False,
         axis = self._get_axis_number(axis)
         labels = self._get_axis(axis)
 
-        # sort by the index
-        if level is not None:
+        if level:
 
             new_axis, indexer = labels.sortlevel(level, ascending=ascending,
                                                  sort_remaining=sort_remaining)
@@ -3346,17 +3349,14 @@ def sort_index(self, axis=0, level=None, ascending=True, inplace=False,
 
             # make sure that the axis is lexsorted to start
             # if not we need to reconstruct to get the correct indexer
-            if not labels.is_lexsorted():
-                labels = MultiIndex.from_tuples(labels.values)
-
+            labels = labels._sort_levels_monotonic()
             indexer = lexsort_indexer(labels.labels, orders=ascending,
                                       na_position=na_position)
         else:
             from pandas.core.sorting import nargsort
 
-            # GH11080 - Check monotonic-ness before sort an index
-            # if monotonic (already sorted), return None or copy() according
-            # to 'inplace'
+            # Check monotonic-ness before sort an index
+            # GH11080
             if ((ascending and labels.is_monotonic_increasing) or
                     (not ascending and labels.is_monotonic_decreasing)):
                 if inplace:
@@ -3367,8 +3367,9 @@ def sort_index(self, axis=0, level=None, ascending=True, inplace=False,
             indexer = nargsort(labels, kind=kind, ascending=ascending,
                                na_position=na_position)
 
+        baxis = self._get_block_manager_axis(axis)
         new_data = self._data.take(indexer,
-                                   axis=self._get_block_manager_axis(axis),
+                                   axis=baxis,
                                    convert=False, verify=False)
 
         if inplace:

diff --git a/pandas/core/groupby.py b/pandas/core/groupby.py
@@ -1882,6 +1882,13 @@ def get_group_levels(self):
         'ohlc': lambda *args: ['open', 'high', 'low', 'close']
     }
 
+    def _is_builtin_func(self, arg):
+        """
+        if we define an builtin function for this argument, return it,
+        otherwise return the arg
+        """
+        return SelectionMixin._builtin_table.get(arg, arg)
+
     def _get_cython_function(self, kind, how, values, is_numeric):
 
         dtype_str = values.dtype.name
@@ -2107,7 +2114,7 @@ def _aggregate_series_fast(self, obj, func):
         # avoids object / Series creation overhead
         dummy = obj._get_values(slice(None, 0)).to_dense()
         indexer = get_group_index_sorter(group_index, ngroups)
-        obj = obj.take(indexer, convert=False)
+        obj = obj.take(indexer, convert=False).to_dense()
         group_index = algorithms.take_nd(
             group_index, indexer, allow_fill=False)
         grouper = lib.SeriesGrouper(obj, func, group_index, ngroups,

diff --git a/pandas/core/reshape.py b/pandas/core/reshape.py
@@ -22,8 +22,8 @@
 from pandas.sparse.libsparse import IntIndex
 
 from pandas.core.categorical import Categorical, _factorize_from_iterable
-from pandas.core.sorting import (get_group_index, compress_group_index,
-                                 decons_obs_group_ids)
+from pandas.core.sorting import (get_group_index, get_compressed_ids,
+                                 compress_group_index, decons_obs_group_ids)
 
 import pandas.core.algorithms as algos
 from pandas._libs import algos as _algos, reshape as _reshape
@@ -496,11 +496,6 @@ def _unstack_frame(obj, level, fill_value=None):
         return unstacker.get_result()
 
 
-def get_compressed_ids(labels, sizes):
-    ids = get_group_index(labels, sizes, sort=True, xnull=False)
-    return compress_group_index(ids, sort=True)
-
-
 def stack(frame, level=-1, dropna=True):
     """
     Convert DataFrame to Series with multi-level Index. Columns become the

diff --git a/pandas/core/series.py b/pandas/core/series.py
@@ -1751,17 +1751,31 @@ def _try_kind_sort(arr):
     def sort_index(self, axis=0, level=None, ascending=True, inplace=False,
                    kind='quicksort', na_position='last', sort_remaining=True):
 
+        # TODO: this can be combined with DataFrame.sort_index impl as
+        # almost identical
         inplace = validate_bool_kwarg(inplace, 'inplace')
         axis = self._get_axis_number(axis)
         index = self.index
-        if level is not None:
+
+        if level:
             new_index, indexer = index.sortlevel(level, ascending=ascending,
                                                  sort_remaining=sort_remaining)
         elif isinstance(index, MultiIndex):
             from pandas.core.sorting import lexsort_indexer
-            indexer = lexsort_indexer(index.labels, orders=ascending)
+            labels = index._sort_levels_monotonic()
+            indexer = lexsort_indexer(labels.labels, orders=ascending)
         else:
             from pandas.core.sorting import nargsort
+
+            # Check monotonic-ness before sort an index
+            # GH11080
+            if ((ascending and index.is_monotonic_increasing) or
+                    (not ascending and index.is_monotonic_decreasing)):
+                if inplace:
+                    return
+                else:
+                    return self.copy()
+
             indexer = nargsort(index, kind=kind, ascending=ascending,
                                na_position=na_position)