-
-
Notifications
You must be signed in to change notification settings - Fork 18k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
return empty MultiIndex for symmetrical difference on equal MultiIndexes #16486
Changes from 1 commit
4903b20
743aa47
0200ca0
efb5f86
7eeb626
0dee788
104d8e6
f49bcfc
85b4b55
f2d24a3
4cd39a9
6c67a4a
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -12,7 +12,6 @@ | |
from pandas.compat.numpy import function as nv | ||
from pandas import compat | ||
|
||
|
||
from pandas.core.dtypes.generic import ABCSeries, ABCMultiIndex, ABCPeriodIndex | ||
from pandas.core.dtypes.missing import isnull, array_equivalent | ||
from pandas.core.dtypes.common import ( | ||
|
@@ -53,7 +52,6 @@ | |
from pandas.core.strings import StringAccessorMixin | ||
from pandas.core.config import get_option | ||
|
||
|
||
# simplify | ||
default_pprint = lambda x, max_seq_items=None: \ | ||
pprint_thing(x, escape_chars=('\t', '\r', '\n'), quote_strings=True, | ||
|
@@ -182,8 +180,8 @@ def __new__(cls, data=None, dtype=None, copy=False, name=None, | |
elif isinstance(data, (np.ndarray, Index, ABCSeries)): | ||
|
||
if (is_datetime64_any_dtype(data) or | ||
(dtype is not None and is_datetime64_any_dtype(dtype)) or | ||
'tz' in kwargs): | ||
(dtype is not None and is_datetime64_any_dtype(dtype)) or | ||
'tz' in kwargs): | ||
from pandas.core.indexes.datetimes import DatetimeIndex | ||
result = DatetimeIndex(data, copy=copy, name=name, | ||
dtype=dtype, **kwargs) | ||
|
@@ -193,7 +191,7 @@ def __new__(cls, data=None, dtype=None, copy=False, name=None, | |
return result | ||
|
||
elif (is_timedelta64_dtype(data) or | ||
(dtype is not None and is_timedelta64_dtype(dtype))): | ||
(dtype is not None and is_timedelta64_dtype(dtype))): | ||
from pandas.core.indexes.timedeltas import TimedeltaIndex | ||
result = TimedeltaIndex(data, copy=copy, name=name, **kwargs) | ||
if dtype is not None and _o_dtype == dtype: | ||
|
@@ -297,7 +295,7 @@ def __new__(cls, data=None, dtype=None, copy=False, name=None, | |
elif inferred != 'string': | ||
if inferred.startswith('datetime'): | ||
if (lib.is_datetime_with_singletz_array(subarr) or | ||
'tz' in kwargs): | ||
'tz' in kwargs): | ||
# only when subarr has the same tz | ||
from pandas.core.indexes.datetimes import ( | ||
DatetimeIndex) | ||
|
@@ -903,7 +901,7 @@ def best_len(values): | |
# line, then don't justify | ||
if (is_truncated or | ||
not (len(', '.join(head)) < display_width and | ||
len(', '.join(tail)) < display_width)): | ||
len(', '.join(tail)) < display_width)): | ||
max_len = max(best_len(head), best_len(tail)) | ||
head = [x.rjust(max_len) for x in head] | ||
tail = [x.rjust(max_len) for x in tail] | ||
|
@@ -1053,7 +1051,7 @@ def nlevels(self): | |
return 1 | ||
|
||
def _get_names(self): | ||
return FrozenList((self.name, )) | ||
return FrozenList((self.name,)) | ||
|
||
def _set_names(self, values, level=None): | ||
if len(values) != 1: | ||
|
@@ -1257,7 +1255,7 @@ def _convert_scalar_indexer(self, key, kind=None): | |
if kind == 'iloc': | ||
return self._validate_indexer('positional', key, kind) | ||
|
||
if len(self) and not isinstance(self, ABCMultiIndex,): | ||
if len(self) and not isinstance(self, ABCMultiIndex, ): | ||
|
||
# we can raise here if we are definitive that this | ||
# is positional indexing (eg. .ix on with a float) | ||
|
@@ -1465,8 +1463,8 @@ def _invalid_indexer(self, form, key): | |
""" consistent invalid indexer message """ | ||
raise TypeError("cannot do {form} indexing on {klass} with these " | ||
"indexers [{key}] of {kind}".format( | ||
form=form, klass=type(self), key=key, | ||
kind=type(key))) | ||
form=form, klass=type(self), key=key, | ||
kind=type(key))) | ||
|
||
def get_duplicates(self): | ||
from collections import defaultdict | ||
|
@@ -1500,7 +1498,7 @@ def _validate_index_level(self, level): | |
if isinstance(level, int): | ||
if level < 0 and level != -1: | ||
raise IndexError("Too many levels: Index has only 1 level," | ||
" %d is not a valid level number" % (level, )) | ||
" %d is not a valid level number" % (level,)) | ||
elif level > 0: | ||
raise IndexError("Too many levels:" | ||
" Index has only 1 level, not %d" % | ||
|
@@ -2320,11 +2318,15 @@ def symmetric_difference(self, other, result_name=None): | |
except TypeError: | ||
pass | ||
|
||
attribs = self._get_attributes_dict() | ||
attribs['name'] = result_name | ||
if 'freq' in attribs: | ||
attribs['freq'] = None | ||
return self._shallow_copy_with_infer(the_diff, **attribs) | ||
if self.nlevels > 1 and len(the_diff) == 0: | ||
return type(self)([[] for _ in range(self.nlevels)], | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. use There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. That wouldn't work, self._shallow_copy fails if the_diff is empty. That's why I am returning an empty MI instead. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Sorry I was confusing it with There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. @jreback what do you think about a special case in @Appender(_index_shared_docs['_shallow_copy'])
def _shallow_copy(self, values=None, **kwargs):
if values is not None:
if 'name' in kwargs:
kwargs['names'] = kwargs.pop('name', None)
# discards freq
kwargs.pop('freq', None)
# this if block is new
if len(values) == 0:
return MultiIndex(levels=[[] for _ in range(self.nlevels)],
labels=[[] for _ in range(self.nlabels)])
return MultiIndex.from_tuples(values, **kwargs)
return self.view() this would "work" but I don't know if "array of length 0" means same structure, but empty. Maybe it's ok. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. this change should be done in There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
This leaves around some levels unfortunately: In [16]: idx = pd.MultiIndex.from_product([['a', 'b'], ['A', 'B']])
In [17]: idx[0:0]
Out[17]:
MultiIndex(levels=[['a', 'b'], ['A', 'B']],
labels=[[], []])
In [18]: idx.difference(idx) | idx.difference(idx)
Out[18]:
MultiIndex(levels=[[], []],
labels=[[], []]) I believe we want There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. ok, then I guess you can special case |
||
[[] for _ in range(self.nlevels)]) | ||
else: | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. You can remove this |
||
attribs = self._get_attributes_dict() | ||
attribs['name'] = result_name | ||
if 'freq' in attribs: | ||
attribs['freq'] = None | ||
return self._shallow_copy_with_infer(the_diff, **attribs) | ||
|
||
sym_diff = deprecate('sym_diff', symmetric_difference) | ||
|
||
|
@@ -3369,7 +3371,7 @@ def _maybe_cast_slice_bound(self, label, side, kind): | |
# reject them | ||
if is_float(label): | ||
if not (kind in ['ix'] and (self.holds_integer() or | ||
self.is_floating())): | ||
self.is_floating())): | ||
self._invalid_indexer('slice', label) | ||
|
||
# we are trying to find integer bounds on a non-integer based index | ||
|
@@ -3387,7 +3389,7 @@ def _searchsorted_monotonic(self, label, side='left'): | |
# everything for it to work (element ordering, search side and | ||
# resulting value). | ||
pos = self[::-1].searchsorted(label, side='right' if side == 'left' | ||
else 'right') | ||
else 'right') | ||
return len(self) - pos | ||
|
||
raise ValueError('index must be monotonic increasing or decreasing') | ||
|
@@ -3418,7 +3420,7 @@ def get_slice_bound(self, label, side, kind): | |
if side not in ('left', 'right'): | ||
raise ValueError("Invalid value for side kwarg," | ||
" must be either 'left' or 'right': %s" % | ||
(side, )) | ||
(side,)) | ||
|
||
original_label = label | ||
|
||
|
@@ -3665,7 +3667,7 @@ def _evaluate_compare(self, other): | |
return self._evaluate_compare(other, op) | ||
|
||
if (is_object_dtype(self) and | ||
self.nlevels == 1): | ||
self.nlevels == 1): | ||
|
||
# don't pass MultiIndex | ||
with np.errstate(all='ignore'): | ||
|
@@ -3741,9 +3743,9 @@ def _validate_for_numeric_unaryop(self, op, opstr): | |
if not self._is_numeric_dtype: | ||
raise TypeError("cannot evaluate a numeric op " | ||
"{opstr} for type: {typ}".format( | ||
opstr=opstr, | ||
typ=type(self)) | ||
) | ||
opstr=opstr, | ||
typ=type(self)) | ||
) | ||
|
||
def _validate_for_numeric_binop(self, other, op, opstr): | ||
""" | ||
|
@@ -3759,17 +3761,17 @@ def _validate_for_numeric_binop(self, other, op, opstr): | |
if not self._is_numeric_dtype: | ||
raise TypeError("cannot evaluate a numeric op {opstr} " | ||
"for type: {typ}".format( | ||
opstr=opstr, | ||
typ=type(self)) | ||
) | ||
opstr=opstr, | ||
typ=type(self)) | ||
) | ||
|
||
if isinstance(other, Index): | ||
if not other._is_numeric_dtype: | ||
raise TypeError("cannot evaluate a numeric op " | ||
"{opstr} with type: {typ}".format( | ||
opstr=type(self), | ||
typ=type(other)) | ||
) | ||
opstr=type(self), | ||
typ=type(other)) | ||
) | ||
elif isinstance(other, np.ndarray) and not other.ndim: | ||
other = other.item() | ||
|
||
|
@@ -3866,9 +3868,7 @@ def _add_numeric_methods_unary(cls): | |
""" add in numeric unary methods """ | ||
|
||
def _make_evaluate_unary(op, opstr): | ||
|
||
def _evaluate_numeric_unary(self): | ||
|
||
self._validate_for_numeric_unaryop(op, opstr) | ||
attrs = self._get_attributes_dict() | ||
attrs = self._maybe_update_attributes(attrs) | ||
|
@@ -3909,7 +3909,7 @@ def _make_logical_function(name, desc, f): | |
def logical_func(self, *args, **kwargs): | ||
result = f(self.values) | ||
if (isinstance(result, (np.ndarray, ABCSeries, Index)) and | ||
result.ndim == 0): | ||
result.ndim == 0): | ||
# return NumPy type | ||
return result.dtype.type(result.item()) | ||
else: # pragma: no cover | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -188,7 +188,6 @@ def test_constructor_ndarray_like(self): | |
# it should be possible to convert any object that satisfies the numpy | ||
# ndarray interface directly into an Index | ||
class ArrayLike(object): | ||
|
||
def __init__(self, array): | ||
self.array = array | ||
|
||
|
@@ -246,7 +245,6 @@ def test_index_ctor_infer_nan_nat(self): | |
[np.timedelta64('nat'), np.nan], | ||
[pd.NaT, np.timedelta64('nat')], | ||
[np.timedelta64('nat'), pd.NaT]]: | ||
|
||
tm.assert_index_equal(Index(data), exp) | ||
tm.assert_index_equal(Index(np.array(data, dtype=object)), exp) | ||
|
||
|
@@ -909,6 +907,13 @@ def test_symmetric_difference(self): | |
expected = MultiIndex.from_tuples([('bar', 2), ('baz', 3), ('bar', 3)]) | ||
assert tm.equalContents(result, expected) | ||
|
||
# on equal multiIndexs | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Can you move this to it's own dedicated test, and add a comment with a link to the github issue? |
||
idx1 = MultiIndex.from_tuples(self.tuples) | ||
idx2 = MultiIndex.from_tuples(self.tuples) | ||
result = idx1.symmetric_difference(idx2) | ||
expected = MultiIndex(levels=[[], []], labels=[[], []]) | ||
assert tm.equalContents(result, expected) | ||
|
||
# nans: | ||
# GH 13514 change: {nan} - {nan} == {} | ||
# (GH 6444, sorting of nans, is no longer an issue) | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This could use a comment explaining why we need the special case.