From 719327784f37a07a744759f215fcb35374fec2b1 Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Tue, 24 Jan 2017 17:44:31 -0500 Subject: [PATCH] ENH: add MultiIndex.to_dataframe ENH: allow hashing of MultiIndex closes #12397 --- doc/source/api.rst | 1 + doc/source/whatsnew/v0.20.0.txt | 2 +- pandas/indexes/multi.py | 24 +++++++++++++++++++ pandas/tests/indexes/test_multi.py | 37 ++++++++++++++++++++++++++++++ pandas/tools/hashing.py | 8 ++++++- pandas/tools/tests/test_hashing.py | 16 +++++-------- 6 files changed, 76 insertions(+), 12 deletions(-) diff --git a/doc/source/api.rst b/doc/source/api.rst index 04a85bf63a6f88..7215cc3d3574fb 100644 --- a/doc/source/api.rst +++ b/doc/source/api.rst @@ -1460,6 +1460,7 @@ MultiIndex Components MultiIndex.set_levels MultiIndex.set_labels MultiIndex.to_hierarchical + MultiIndex.to_dataframe MultiIndex.is_lexsorted MultiIndex.droplevel MultiIndex.swaplevel diff --git a/doc/source/whatsnew/v0.20.0.txt b/doc/source/whatsnew/v0.20.0.txt index 6aaed803c5352d..2b7f1f0664bf2f 100644 --- a/doc/source/whatsnew/v0.20.0.txt +++ b/doc/source/whatsnew/v0.20.0.txt @@ -131,7 +131,7 @@ Other enhancements - New ``UnsortedIndexError`` (subclass of ``KeyError``) raised when indexing/slicing into an unsorted MultiIndex (:issue:`11897`). This allows differentiation between errors due to lack of sorting or an incorrect key. 
See :ref:`here <advanced.unsorted>` - +- ``MultiIndex`` has gained a ``.to_dataframe()`` method to convert to a ``DataFrame`` (:issue:`12397`) - ``pd.cut`` and ``pd.qcut`` now support datetime64 and timedelta64 dtypes (:issue:`14714`, :issue:`14798`) - ``pd.qcut`` has gained the ``duplicates='raise'|'drop'`` option to control whether to raise on duplicated edges (:issue:`7751`) - ``Series`` provides a ``to_excel`` method to output Excel files (:issue:`8825`) diff --git a/pandas/indexes/multi.py b/pandas/indexes/multi.py index 2afafaeb544d15..99fcf789221388 100644 --- a/pandas/indexes/multi.py +++ b/pandas/indexes/multi.py @@ -827,6 +827,30 @@ def _to_safe_for_reshape(self): """ convert to object if we are a categorical """ return self.set_levels([i._to_safe_for_reshape() for i in self.levels]) + def to_dataframe(self, index=True): + """ + Create a DataFrame with the columns the levels of the MultiIndex + + .. versionadded:: 0.20.0 + + Parameters + ---------- + index : boolean, default True + return this MultiIndex as the index + + Returns + ------- + DataFrame + """ + + from pandas import DataFrame + result = DataFrame({(name or level): self.get_level_values(level) + for name, level in + zip(self.names, range(len(self.levels)))}) + if index: + result.index = self + return result + def to_hierarchical(self, n_repeat, n_shuffle=1): """ Return a MultiIndex reshaped to conform to the diff --git a/pandas/tests/indexes/test_multi.py b/pandas/tests/indexes/test_multi.py index 343078aeafaf00..b72dc84b337dc3 100644 --- a/pandas/tests/indexes/test_multi.py +++ b/pandas/tests/indexes/test_multi.py @@ -1348,6 +1348,43 @@ def test_format_sparse_config(self): warnings.filters = warn_filters + def test_to_dataframe(self): + tuples = [(1, 'one'), (1, 'two'), (2, 'one'), (2, 'two')] + + index = MultiIndex.from_tuples(tuples) + result = index.to_dataframe(index=False) + expected = DataFrame(tuples) + tm.assert_frame_equal(result, expected) + + result = index.to_dataframe() + expected.index = index + 
tm.assert_frame_equal(result, expected) + + tuples = [(1, 'one'), (1, 'two'), (2, 'one'), (2, 'two')] + index = MultiIndex.from_tuples(tuples, names=['first', 'second']) + result = index.to_dataframe(index=False) + expected = DataFrame(tuples) + expected.columns = ['first', 'second'] + tm.assert_frame_equal(result, expected) + + result = index.to_dataframe() + expected.index = index + tm.assert_frame_equal(result, expected) + + index = MultiIndex.from_product([range(5), + pd.date_range('20130101', periods=3)]) + result = index.to_dataframe(index=False) + expected = DataFrame( + {0: np.repeat(np.arange(5, dtype='int64'), 3), + 1: np.tile(pd.date_range('20130101', periods=3), 5)}) + tm.assert_frame_equal(result, expected) + + index = MultiIndex.from_product([range(5), + pd.date_range('20130101', periods=3)]) + result = index.to_dataframe() + expected.index = index + tm.assert_frame_equal(result, expected) + def test_to_hierarchical(self): index = MultiIndex.from_tuples([(1, 'one'), (1, 'two'), (2, 'one'), ( 2, 'two')]) diff --git a/pandas/tools/hashing.py b/pandas/tools/hashing.py index 6d2186fdab34c5..7e7b7dc76bb74c 100644 --- a/pandas/tools/hashing.py +++ b/pandas/tools/hashing.py @@ -3,7 +3,7 @@ """ import numpy as np -from pandas import _hash, Series, factorize, Categorical, Index +from pandas import _hash, Series, factorize, Categorical, Index, MultiIndex from pandas.lib import is_bool_array from pandas.types.generic import ABCIndexClass, ABCSeries, ABCDataFrame from pandas.types.common import (is_categorical_dtype, is_numeric_dtype, @@ -45,6 +45,12 @@ def adder(h, hashed_to_add): h = np.multiply(h, np.uint(3), h) return np.add(h, hashed_to_add, h) + if isinstance(obj, MultiIndex): + # efficiently turn us into a DataFrame and hash + return hash_pandas_object(obj.to_dataframe(index=False), + index=False, encoding=encoding, + hash_key=hash_key, categorize=categorize) + if isinstance(obj, ABCIndexClass): h = hash_array(obj.values, encoding, hash_key, 
categorize).astype('uint64') diff --git a/pandas/tools/tests/test_hashing.py b/pandas/tools/tests/test_hashing.py index 7913706f5658b1..f3205f6ef8d5e7 100644 --- a/pandas/tools/tests/test_hashing.py +++ b/pandas/tools/tests/test_hashing.py @@ -1,7 +1,7 @@ import numpy as np import pandas as pd -from pandas import DataFrame, Series, Index +from pandas import DataFrame, Series, Index, MultiIndex from pandas.tools.hashing import hash_array, hash_pandas_object import pandas.util.testing as tm @@ -72,7 +72,11 @@ def test_hash_pandas_object(self): tm.makeMixedDataFrame(), tm.makeTimeDataFrame(), tm.makeTimeSeries(), - tm.makeTimedeltaIndex()]: + tm.makeTimedeltaIndex(), + MultiIndex.from_product( + [range(5), + ['foo', 'bar', 'baz'], + pd.date_range('20130101', periods=2)])]: self.check_equal(obj) self.check_not_equal_with_index(obj) @@ -140,14 +144,6 @@ def f(): hash_pandas_object(obj) self.assertRaises(TypeError, f) - # MultiIndex are represented as tuples - obj = Series([1, 2, 3], index=pd.MultiIndex.from_tuples( - [('a', 1), ('a', 2), ('b', 1)])) - - def f(): - hash_pandas_object(obj) - self.assertRaises(TypeError, f) - def test_alread_encoded(self): # if already encoded then ok