From 65498c08005f8fe03015ce8448e8ce3780b2add4 Mon Sep 17 00:00:00 2001 From: h-vetinari <33685575+h-vetinari@users.noreply.github.com> Date: Sun, 3 Feb 2019 21:14:21 +0100 Subject: [PATCH] Backport PR #25085: Revert set_index inspection/error handling for 0.24.1 --- doc/source/whatsnew/v0.24.1.rst | 1 + pandas/core/frame.py | 55 ++++-------- pandas/tests/frame/test_alter_axes.py | 116 ++++++++++++++++++++++++-- 3 files changed, 130 insertions(+), 42 deletions(-) diff --git a/doc/source/whatsnew/v0.24.1.rst b/doc/source/whatsnew/v0.24.1.rst index c66a990cd168c..be0a2eb682e87 100644 --- a/doc/source/whatsnew/v0.24.1.rst +++ b/doc/source/whatsnew/v0.24.1.rst @@ -58,6 +58,7 @@ Fixed Regressions - Fixed regression in :func:`merge` when merging an empty ``DataFrame`` with multiple timezone-aware columns on one of the timezone-aware columns (:issue:`25014`). - Fixed regression in :meth:`Series.rename_axis` and :meth:`DataFrame.rename_axis` where passing ``None`` failed to remove the axis name (:issue:`25034`) - Fixed regression in :func:`to_timedelta` with `box=False` incorrectly returning a ``datetime64`` object instead of a ``timedelta64`` object (:issue:`24961`) +- Fixed regression where custom hashable types could not be used as column keys in :meth:`DataFrame.set_index` (:issue:`24969`) .. _whatsnew_0241.bug_fixes: diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 5b462b949abf9..8fbfc8a06c1d0 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -71,7 +71,7 @@ is_iterator, is_sequence, is_named_tuple) -from pandas.core.dtypes.generic import ABCSeries, ABCIndexClass, ABCMultiIndex +from pandas.core.dtypes.generic import ABCSeries, ABCIndexClass from pandas.core.dtypes.missing import isna, notna from pandas.core import algorithms @@ -4137,33 +4137,8 @@ def set_index(self, keys, drop=True, append=False, inplace=False, 4 16 10 2014 31 """ inplace = validate_bool_kwarg(inplace, 'inplace') - - err_msg = ('The parameter "keys" may be a column key, one-dimensional ' - 'array, or a list containing only valid column keys and ' - 'one-dimensional arrays.') - - if (is_scalar(keys) or isinstance(keys, tuple) - or isinstance(keys, (ABCIndexClass, ABCSeries, np.ndarray))): - # make sure we have a container of keys/arrays we can iterate over - # tuples can appear as valid column keys! + if not isinstance(keys, list): keys = [keys] - elif not isinstance(keys, list): - raise ValueError(err_msg) - - missing = [] - for col in keys: - if (is_scalar(col) or isinstance(col, tuple)): - # if col is a valid column key, everything is fine - # tuples are always considered keys, never as list-likes - if col not in self: - missing.append(col) - elif (not isinstance(col, (ABCIndexClass, ABCSeries, - np.ndarray, list)) - or getattr(col, 'ndim', 1) > 1): - raise ValueError(err_msg) - - if missing: - raise KeyError('{}'.format(missing)) if inplace: frame = self @@ -4174,7 +4149,7 @@ def set_index(self, keys, drop=True, append=False, inplace=False, names = [] if append: names = [x for x in self.index.names] - if isinstance(self.index, ABCMultiIndex): + if isinstance(self.index, MultiIndex): for i in range(self.index.nlevels): arrays.append(self.index._get_level_values(i)) else: @@ -4182,23 +4157,29 @@ def set_index(self, keys, drop=True, append=False, inplace=False, to_remove = [] for col in keys: - if isinstance(col, ABCMultiIndex): - for n in range(col.nlevels): + if isinstance(col, MultiIndex): + # append all but the last column so we don't have to modify + # the end of this loop + for n in range(col.nlevels - 1): arrays.append(col._get_level_values(n)) + + level = col._get_level_values(col.nlevels - 1) names.extend(col.names) - elif isinstance(col, (ABCIndexClass, ABCSeries)): - # if Index then not MultiIndex (treated above) - arrays.append(col) + elif isinstance(col, Series): + level = col._values + names.append(col.name) + elif isinstance(col, Index): + level = col names.append(col.name) - elif isinstance(col, (list, np.ndarray)): - arrays.append(col) + elif isinstance(col, (list, np.ndarray, Index)): + level = col names.append(None) - # from here, col can only be a column label else: - arrays.append(frame[col]._values) + level = frame[col]._values names.append(col) if drop: to_remove.append(col) + arrays.append(level) index = ensure_index_from_sequences(arrays, names) diff --git a/pandas/tests/frame/test_alter_axes.py b/pandas/tests/frame/test_alter_axes.py index 2d1afa2281d44..cc3687f856b4e 100644 --- a/pandas/tests/frame/test_alter_axes.py +++ b/pandas/tests/frame/test_alter_axes.py @@ -253,23 +253,129 @@ def test_set_index_raise_keys(self, frame_of_index_cols, drop, append): df.set_index(['A', df['A'], tuple(df['A'])], drop=drop, append=append) + @pytest.mark.xfail(reason='broken due to revert, see GH 25085') @pytest.mark.parametrize('append', [True, False]) @pytest.mark.parametrize('drop', [True, False]) - @pytest.mark.parametrize('box', [set, iter]) + @pytest.mark.parametrize('box', [set, iter, lambda x: (y for y in x)], + ids=['set', 'iter', 'generator']) def test_set_index_raise_on_type(self, frame_of_index_cols, box, drop, append): df = frame_of_index_cols msg = 'The parameter "keys" may be a column key, .*' - # forbidden type, e.g. set/tuple/iter - with pytest.raises(ValueError, match=msg): + # forbidden type, e.g. set/iter/generator + with pytest.raises(TypeError, match=msg): df.set_index(box(df['A']), drop=drop, append=append) - # forbidden type in list, e.g. set/tuple/iter - with pytest.raises(ValueError, match=msg): + # forbidden type in list, e.g. set/iter/generator + with pytest.raises(TypeError, match=msg): df.set_index(['A', df['A'], box(df['A'])], drop=drop, append=append) + def test_set_index_custom_label_type(self): + # GH 24969 + + class Thing(object): + def __init__(self, name, color): + self.name = name + self.color = color + + def __str__(self): + return "" % (self.name,) + + # necessary for pretty KeyError + __repr__ = __str__ + + thing1 = Thing('One', 'red') + thing2 = Thing('Two', 'blue') + df = DataFrame({thing1: [0, 1], thing2: [2, 3]}) + expected = DataFrame({thing1: [0, 1]}, + index=Index([2, 3], name=thing2)) + + # use custom label directly + result = df.set_index(thing2) + tm.assert_frame_equal(result, expected) + + # custom label wrapped in list + result = df.set_index([thing2]) + tm.assert_frame_equal(result, expected) + + # missing key + thing3 = Thing('Three', 'pink') + msg = "" + with pytest.raises(KeyError, match=msg): + # missing label directly + df.set_index(thing3) + + with pytest.raises(KeyError, match=msg): + # missing label in list + df.set_index([thing3]) + + def test_set_index_custom_label_hashable_iterable(self): + # GH 24969 + + # actual example discussed in GH 24984 was e.g. for shapely.geometry + # objects (e.g. a collection of Points) that can be both hashable and + # iterable; using frozenset as a stand-in for testing here + + class Thing(frozenset): + # need to stabilize repr for KeyError (due to random order in sets) + def __repr__(self): + tmp = sorted(list(self)) + # double curly brace prints one brace in format string + return "frozenset({{{}}})".format(', '.join(map(repr, tmp))) + + thing1 = Thing(['One', 'red']) + thing2 = Thing(['Two', 'blue']) + df = DataFrame({thing1: [0, 1], thing2: [2, 3]}) + expected = DataFrame({thing1: [0, 1]}, + index=Index([2, 3], name=thing2)) + + # use custom label directly + result = df.set_index(thing2) + tm.assert_frame_equal(result, expected) + + # custom label wrapped in list + result = df.set_index([thing2]) + tm.assert_frame_equal(result, expected) + + # missing key + thing3 = Thing(['Three', 'pink']) + msg = '.*' # due to revert, see GH 25085 + with pytest.raises(KeyError, match=msg): + # missing label directly + df.set_index(thing3) + + with pytest.raises(KeyError, match=msg): + # missing label in list + df.set_index([thing3]) + + def test_set_index_custom_label_type_raises(self): + # GH 24969 + + # purposefully inherit from something unhashable + class Thing(set): + def __init__(self, name, color): + self.name = name + self.color = color + + def __str__(self): + return "" % (self.name,) + + thing1 = Thing('One', 'red') + thing2 = Thing('Two', 'blue') + df = DataFrame([[0, 2], [1, 3]], columns=[thing1, thing2]) + + msg = 'unhashable type.*' + + with pytest.raises(TypeError, match=msg): + # use custom label directly + df.set_index(thing2) + + with pytest.raises(TypeError, match=msg): + # custom label wrapped in list + df.set_index([thing2]) + def test_construction_with_categorical_index(self): ci = tm.makeCategoricalIndex(10) ci.name = 'B'