Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

return empty MultiIndex for symmetrical difference on equal MultiIndexes #16486

Merged
merged 12 commits into from
May 31, 2017
66 changes: 33 additions & 33 deletions pandas/core/indexes/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,6 @@
from pandas.compat.numpy import function as nv
from pandas import compat


from pandas.core.dtypes.generic import ABCSeries, ABCMultiIndex, ABCPeriodIndex
from pandas.core.dtypes.missing import isnull, array_equivalent
from pandas.core.dtypes.common import (
Expand Down Expand Up @@ -53,7 +52,6 @@
from pandas.core.strings import StringAccessorMixin
from pandas.core.config import get_option


# simplify
default_pprint = lambda x, max_seq_items=None: \
pprint_thing(x, escape_chars=('\t', '\r', '\n'), quote_strings=True,
Expand Down Expand Up @@ -182,8 +180,8 @@ def __new__(cls, data=None, dtype=None, copy=False, name=None,
elif isinstance(data, (np.ndarray, Index, ABCSeries)):

if (is_datetime64_any_dtype(data) or
(dtype is not None and is_datetime64_any_dtype(dtype)) or
'tz' in kwargs):
(dtype is not None and is_datetime64_any_dtype(dtype)) or
'tz' in kwargs):
from pandas.core.indexes.datetimes import DatetimeIndex
result = DatetimeIndex(data, copy=copy, name=name,
dtype=dtype, **kwargs)
Expand All @@ -193,7 +191,7 @@ def __new__(cls, data=None, dtype=None, copy=False, name=None,
return result

elif (is_timedelta64_dtype(data) or
(dtype is not None and is_timedelta64_dtype(dtype))):
(dtype is not None and is_timedelta64_dtype(dtype))):
from pandas.core.indexes.timedeltas import TimedeltaIndex
result = TimedeltaIndex(data, copy=copy, name=name, **kwargs)
if dtype is not None and _o_dtype == dtype:
Expand Down Expand Up @@ -297,7 +295,7 @@ def __new__(cls, data=None, dtype=None, copy=False, name=None,
elif inferred != 'string':
if inferred.startswith('datetime'):
if (lib.is_datetime_with_singletz_array(subarr) or
'tz' in kwargs):
'tz' in kwargs):
# only when subarr has the same tz
from pandas.core.indexes.datetimes import (
DatetimeIndex)
Expand Down Expand Up @@ -903,7 +901,7 @@ def best_len(values):
# line, then don't justify
if (is_truncated or
not (len(', '.join(head)) < display_width and
len(', '.join(tail)) < display_width)):
len(', '.join(tail)) < display_width)):
max_len = max(best_len(head), best_len(tail))
head = [x.rjust(max_len) for x in head]
tail = [x.rjust(max_len) for x in tail]
Expand Down Expand Up @@ -1053,7 +1051,7 @@ def nlevels(self):
return 1

def _get_names(self):
return FrozenList((self.name, ))
return FrozenList((self.name,))

def _set_names(self, values, level=None):
if len(values) != 1:
Expand Down Expand Up @@ -1257,7 +1255,7 @@ def _convert_scalar_indexer(self, key, kind=None):
if kind == 'iloc':
return self._validate_indexer('positional', key, kind)

if len(self) and not isinstance(self, ABCMultiIndex,):
if len(self) and not isinstance(self, ABCMultiIndex, ):

# we can raise here if we are definitive that this
# is positional indexing (eg. .ix on with a float)
Expand Down Expand Up @@ -1465,8 +1463,8 @@ def _invalid_indexer(self, form, key):
""" consistent invalid indexer message """
raise TypeError("cannot do {form} indexing on {klass} with these "
"indexers [{key}] of {kind}".format(
form=form, klass=type(self), key=key,
kind=type(key)))
form=form, klass=type(self), key=key,
kind=type(key)))

def get_duplicates(self):
from collections import defaultdict
Expand Down Expand Up @@ -1500,7 +1498,7 @@ def _validate_index_level(self, level):
if isinstance(level, int):
if level < 0 and level != -1:
raise IndexError("Too many levels: Index has only 1 level,"
" %d is not a valid level number" % (level, ))
" %d is not a valid level number" % (level,))
elif level > 0:
raise IndexError("Too many levels:"
" Index has only 1 level, not %d" %
Expand Down Expand Up @@ -2320,11 +2318,15 @@ def symmetric_difference(self, other, result_name=None):
except TypeError:
pass

attribs = self._get_attributes_dict()
attribs['name'] = result_name
if 'freq' in attribs:
attribs['freq'] = None
return self._shallow_copy_with_infer(the_diff, **attribs)
if self.nlevels > 1 and len(the_diff) == 0:
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This could use a comment explaining why we need the special case.

return type(self)([[] for _ in range(self.nlevels)],
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

use self._shallow_copy

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

That wouldn't work, self._shallow_copy fails if the_diff is empty. That's why I am returning an empty MI instead.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

In [2]: pd.MultiIndex([[], []], [[],[]])
Out[2]: 
MultiIndex(levels=[[], []],
           labels=[[], []])

In [3]: pd.MultiIndex([[], []], [[],[]])._shallow_copy()
Out[3]: 
MultiIndex(levels=[[], []],
           labels=[[], []])

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Sorry I was confusing it with self._shallow_copy_with_infer. But still, if I call self._shallow_copy with the_diff being empty from_tuples gives me TypeError: Cannot infer number of levels from empty list

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@jreback what do you think about a special case in MulltiIndex._shallow_copy when values is length 0 (that's what is passed from symmetric_difference)?

    @Appender(_index_shared_docs['_shallow_copy'])
    def _shallow_copy(self, values=None, **kwargs):
        if values is not None:
            if 'name' in kwargs:
                kwargs['names'] = kwargs.pop('name', None)
            # discards freq
            kwargs.pop('freq', None)
            # this if block is new
            if len(values) == 0:
                return MultiIndex(levels=[[] for _ in range(self.nlevels)],
                                  labels=[[] for _ in range(self.nlabels)])
            return MultiIndex.from_tuples(values, **kwargs)
        return self.view()

this would "work" but I don't know if "array of length 0" means same structure, but empty. Maybe it's ok.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

this change should be done in _shallow_copy_with_infer, you need to construct a MI equiv of self[0:0]

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

this change should be done in _shallow_copy_with_infer, you need to construct a MI equiv of self[0:0]

This leaves around some levels unfortunately:

In [16]: idx = pd.MultiIndex.from_product([['a', 'b'], ['A', 'B']])

In [17]: idx[0:0]
Out[17]:
MultiIndex(levels=[['a', 'b'], ['A', 'B']],
           labels=[[], []])

In [18]: idx.difference(idx) | idx.difference(idx)
Out[18]:
MultiIndex(levels=[[], []],
           labels=[[], []])

I believe we want Out[18]

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

ok, then I guess you can special case values=None in _shallow_copy to take a different path of construction (IOW don't use the MultiIndex.from_tuples). The crucial point is to propogate the meta-data.

[[] for _ in range(self.nlevels)])
else:
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

You can remove this else condition, and just dedent everything like it was before.

attribs = self._get_attributes_dict()
attribs['name'] = result_name
if 'freq' in attribs:
attribs['freq'] = None
return self._shallow_copy_with_infer(the_diff, **attribs)

sym_diff = deprecate('sym_diff', symmetric_difference)

Expand Down Expand Up @@ -3369,7 +3371,7 @@ def _maybe_cast_slice_bound(self, label, side, kind):
# reject them
if is_float(label):
if not (kind in ['ix'] and (self.holds_integer() or
self.is_floating())):
self.is_floating())):
self._invalid_indexer('slice', label)

# we are trying to find integer bounds on a non-integer based index
Expand All @@ -3387,7 +3389,7 @@ def _searchsorted_monotonic(self, label, side='left'):
# everything for it to work (element ordering, search side and
# resulting value).
pos = self[::-1].searchsorted(label, side='right' if side == 'left'
else 'right')
else 'right')
return len(self) - pos

raise ValueError('index must be monotonic increasing or decreasing')
Expand Down Expand Up @@ -3418,7 +3420,7 @@ def get_slice_bound(self, label, side, kind):
if side not in ('left', 'right'):
raise ValueError("Invalid value for side kwarg,"
" must be either 'left' or 'right': %s" %
(side, ))
(side,))

original_label = label

Expand Down Expand Up @@ -3665,7 +3667,7 @@ def _evaluate_compare(self, other):
return self._evaluate_compare(other, op)

if (is_object_dtype(self) and
self.nlevels == 1):
self.nlevels == 1):

# don't pass MultiIndex
with np.errstate(all='ignore'):
Expand Down Expand Up @@ -3741,9 +3743,9 @@ def _validate_for_numeric_unaryop(self, op, opstr):
if not self._is_numeric_dtype:
raise TypeError("cannot evaluate a numeric op "
"{opstr} for type: {typ}".format(
opstr=opstr,
typ=type(self))
)
opstr=opstr,
typ=type(self))
)

def _validate_for_numeric_binop(self, other, op, opstr):
"""
Expand All @@ -3759,17 +3761,17 @@ def _validate_for_numeric_binop(self, other, op, opstr):
if not self._is_numeric_dtype:
raise TypeError("cannot evaluate a numeric op {opstr} "
"for type: {typ}".format(
opstr=opstr,
typ=type(self))
)
opstr=opstr,
typ=type(self))
)

if isinstance(other, Index):
if not other._is_numeric_dtype:
raise TypeError("cannot evaluate a numeric op "
"{opstr} with type: {typ}".format(
opstr=type(self),
typ=type(other))
)
opstr=type(self),
typ=type(other))
)
elif isinstance(other, np.ndarray) and not other.ndim:
other = other.item()

Expand Down Expand Up @@ -3866,9 +3868,7 @@ def _add_numeric_methods_unary(cls):
""" add in numeric unary methods """

def _make_evaluate_unary(op, opstr):

def _evaluate_numeric_unary(self):

self._validate_for_numeric_unaryop(op, opstr)
attrs = self._get_attributes_dict()
attrs = self._maybe_update_attributes(attrs)
Expand Down Expand Up @@ -3909,7 +3909,7 @@ def _make_logical_function(name, desc, f):
def logical_func(self, *args, **kwargs):
result = f(self.values)
if (isinstance(result, (np.ndarray, ABCSeries, Index)) and
result.ndim == 0):
result.ndim == 0):
# return NumPy type
return result.dtype.type(result.item())
else: # pragma: no cover
Expand Down
9 changes: 7 additions & 2 deletions pandas/tests/indexes/test_base.py
Original file line number Diff line number Diff line change
Expand Up @@ -188,7 +188,6 @@ def test_constructor_ndarray_like(self):
# it should be possible to convert any object that satisfies the numpy
# ndarray interface directly into an Index
class ArrayLike(object):

def __init__(self, array):
self.array = array

Expand Down Expand Up @@ -246,7 +245,6 @@ def test_index_ctor_infer_nan_nat(self):
[np.timedelta64('nat'), np.nan],
[pd.NaT, np.timedelta64('nat')],
[np.timedelta64('nat'), pd.NaT]]:

tm.assert_index_equal(Index(data), exp)
tm.assert_index_equal(Index(np.array(data, dtype=object)), exp)

Expand Down Expand Up @@ -909,6 +907,13 @@ def test_symmetric_difference(self):
expected = MultiIndex.from_tuples([('bar', 2), ('baz', 3), ('bar', 3)])
assert tm.equalContents(result, expected)

# on equal multiIndexs
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can you move this to it's own dedicated test, and add a comment with a link to the github issue?

idx1 = MultiIndex.from_tuples(self.tuples)
idx2 = MultiIndex.from_tuples(self.tuples)
result = idx1.symmetric_difference(idx2)
expected = MultiIndex(levels=[[], []], labels=[[], []])
assert tm.equalContents(result, expected)

# nans:
# GH 13514 change: {nan} - {nan} == {}
# (GH 6444, sorting of nans, is no longer an issue)
Expand Down