Skip to content

Commit

Permalink
ENH: add MultiIndex.to_dataframe
Browse files Browse the repository at this point in the history
ENH: allow hashing of MultiIndex

closes #12397
  • Loading branch information
jreback committed Jan 24, 2017
1 parent 9309eba commit 7193277
Show file tree
Hide file tree
Showing 6 changed files with 76 additions and 12 deletions.
1 change: 1 addition & 0 deletions doc/source/api.rst
Original file line number Diff line number Diff line change
Expand Up @@ -1460,6 +1460,7 @@ MultiIndex Components
MultiIndex.set_levels
MultiIndex.set_labels
MultiIndex.to_hierarchical
MultiIndex.to_dataframe
MultiIndex.is_lexsorted
MultiIndex.droplevel
MultiIndex.swaplevel
Expand Down
2 changes: 1 addition & 1 deletion doc/source/whatsnew/v0.20.0.txt
Original file line number Diff line number Diff line change
Expand Up @@ -131,7 +131,7 @@ Other enhancements
- New ``UnsortedIndexError`` (subclass of ``KeyError``) raised when indexing/slicing into an
unsorted MultiIndex (:issue:`11897`). This allows differentiation between errors due to lack
of sorting or an incorrect key. See :ref:`here <advanced.unsorted>`

- ``MultiIndex`` has gained a ``.to_dataframe()`` method to convert to a ``DataFrame`` (:issue:`12397`)
- ``pd.cut`` and ``pd.qcut`` now support datetime64 and timedelta64 dtypes (:issue:`14714`, :issue:`14798`)
- ``pd.qcut`` has gained the ``duplicates='raise'|'drop'`` option to control whether to raise on duplicated edges (:issue:`7751`)
- ``Series`` provides a ``to_excel`` method to output Excel files (:issue:`8825`)
Expand Down
24 changes: 24 additions & 0 deletions pandas/indexes/multi.py
Original file line number Diff line number Diff line change
Expand Up @@ -827,6 +827,30 @@ def _to_safe_for_reshape(self):
""" convert to object if we are a categorical """
return self.set_levels([i._to_safe_for_reshape() for i in self.levels])

def to_dataframe(self, index=True):
    """
    Create a DataFrame whose columns are the levels of the MultiIndex.

    .. versionadded:: 0.20.0

    Parameters
    ----------
    index : boolean, default True
        If True, set this MultiIndex as the index of the returned
        DataFrame.

    Returns
    -------
    DataFrame
        One column per level, holding that level's values. A column is
        labelled with the level name, or with the level's position when
        the name is None.
    """

    # local import, as in the original (presumably avoids a circular
    # import between the index and frame modules)
    from pandas import DataFrame

    # Only a missing (None) name falls back to the level position. A
    # truthiness test would also discard valid falsy names such as ''
    # or 0, and the substituted position could collide with another key.
    result = DataFrame({(level if name is None else name):
                        self.get_level_values(level)
                        for level, name in enumerate(self.names)})
    if index:
        result.index = self
    return result

def to_hierarchical(self, n_repeat, n_shuffle=1):
"""
Return a MultiIndex reshaped to conform to the
Expand Down
37 changes: 37 additions & 0 deletions pandas/tests/indexes/test_multi.py
Original file line number Diff line number Diff line change
Expand Up @@ -1348,6 +1348,43 @@ def test_format_sparse_config(self):

warnings.filters = warn_filters

def test_to_dataframe(self):
    """Exercise MultiIndex.to_dataframe with and without the index kept."""
    pairs = [(1, 'one'), (1, 'two'), (2, 'one'), (2, 'two')]

    # unnamed levels: columns are expected to be the level positions
    mi = MultiIndex.from_tuples(pairs)
    want = DataFrame(pairs)
    got = mi.to_dataframe(index=False)
    tm.assert_frame_equal(got, want)

    want.index = mi
    got = mi.to_dataframe()
    tm.assert_frame_equal(got, want)

    # named levels: columns are expected to carry the level names
    pairs = [(1, 'one'), (1, 'two'), (2, 'one'), (2, 'two')]
    mi = MultiIndex.from_tuples(pairs, names=['first', 'second'])
    want = DataFrame(pairs)
    want.columns = ['first', 'second']
    got = mi.to_dataframe(index=False)
    tm.assert_frame_equal(got, want)

    want.index = mi
    got = mi.to_dataframe()
    tm.assert_frame_equal(got, want)

    # product of an integer range and a datetime range
    mi = MultiIndex.from_product(
        [range(5), pd.date_range('20130101', periods=3)])
    want = DataFrame({
        0: np.repeat(np.arange(5, dtype='int64'), 3),
        1: np.tile(pd.date_range('20130101', periods=3), 5),
    })
    got = mi.to_dataframe(index=False)
    tm.assert_frame_equal(got, want)

    # same product again, this time keeping the MultiIndex as the index
    mi = MultiIndex.from_product(
        [range(5), pd.date_range('20130101', periods=3)])
    want.index = mi
    got = mi.to_dataframe()
    tm.assert_frame_equal(got, want)

def test_to_hierarchical(self):
index = MultiIndex.from_tuples([(1, 'one'), (1, 'two'), (2, 'one'), (
2, 'two')])
Expand Down
8 changes: 7 additions & 1 deletion pandas/tools/hashing.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
"""

import numpy as np
from pandas import _hash, Series, factorize, Categorical, Index
from pandas import _hash, Series, factorize, Categorical, Index, MultiIndex
from pandas.lib import is_bool_array
from pandas.types.generic import ABCIndexClass, ABCSeries, ABCDataFrame
from pandas.types.common import (is_categorical_dtype, is_numeric_dtype,
Expand Down Expand Up @@ -45,6 +45,12 @@ def adder(h, hashed_to_add):
h = np.multiply(h, np.uint(3), h)
return np.add(h, hashed_to_add, h)

if isinstance(obj, MultiIndex):
# efficiently turn us into a DataFrame and hash
return hash_pandas_object(obj.to_dataframe(index=False),
index=False, encoding=encoding,
hash_key=hash_key, categorize=categorize)

if isinstance(obj, ABCIndexClass):
h = hash_array(obj.values, encoding, hash_key,
categorize).astype('uint64')
Expand Down
16 changes: 6 additions & 10 deletions pandas/tools/tests/test_hashing.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
import numpy as np
import pandas as pd

from pandas import DataFrame, Series, Index
from pandas import DataFrame, Series, Index, MultiIndex
from pandas.tools.hashing import hash_array, hash_pandas_object
import pandas.util.testing as tm

Expand Down Expand Up @@ -72,7 +72,11 @@ def test_hash_pandas_object(self):
tm.makeMixedDataFrame(),
tm.makeTimeDataFrame(),
tm.makeTimeSeries(),
tm.makeTimedeltaIndex()]:
tm.makeTimedeltaIndex(),
MultiIndex.from_product(
[range(5),
['foo', 'bar', 'baz'],
pd.date_range('20130101', periods=2)])]:
self.check_equal(obj)
self.check_not_equal_with_index(obj)

Expand Down Expand Up @@ -140,14 +144,6 @@ def f():
hash_pandas_object(obj)
self.assertRaises(TypeError, f)

# MultiIndex are represented as tuples
obj = Series([1, 2, 3], index=pd.MultiIndex.from_tuples(
[('a', 1), ('a', 2), ('b', 1)]))

def f():
hash_pandas_object(obj)
self.assertRaises(TypeError, f)

def test_alread_encoded(self):
# if already encoded then ok

Expand Down

0 comments on commit 7193277

Please sign in to comment.