pandas-dev · TomAugspurger · Feb 3, 2019 · Jan 26, 2019 · Jan 26, 2019 · Jan 28, 2019
diff --git a/doc/source/whatsnew/v0.24.0.rst b/doc/source/whatsnew/v0.24.0.rst
@@ -1195,8 +1195,6 @@ Other API Changes
 - :class:`pandas.io.formats.style.Styler` supports a ``number-format`` property when using :meth:`~pandas.io.formats.style.Styler.to_excel` (:issue:`22015`)
 - :meth:`DataFrame.corr` and :meth:`Series.corr` now raise a ``ValueError`` along with a helpful error message instead of a ``KeyError`` when supplied with an invalid method (:issue:`22298`)
 - :meth:`shift` will now always return a copy, instead of the previous behaviour of returning self when shifting by 0 (:issue:`22397`)
-- :meth:`DataFrame.set_index` now gives a better (and less frequent) KeyError, raises a ``ValueError`` for incorrect types,
-  and will not fail on duplicate column names with ``drop=True``. (:issue:`22484`)
 - Slicing a single row of a DataFrame with multiple ExtensionArrays of the same type now preserves the dtype, rather than coercing to object (:issue:`22784`)
 - :class:`DateOffset` attribute `_cacheable` and method `_should_cache` have been removed (:issue:`23118`)
 - :meth:`Series.searchsorted`, when supplied a scalar value to search for, now returns a scalar instead of an array (:issue:`23801`).

diff --git a/doc/source/whatsnew/v0.24.1.rst b/doc/source/whatsnew/v0.24.1.rst
@@ -58,6 +58,7 @@ Fixed Regressions
 - Fixed regression in :func:`merge` when merging an empty ``DataFrame`` with multiple timezone-aware columns on one of the timezone-aware columns (:issue:`25014`).
 - Fixed regression in :meth:`Series.rename_axis` and :meth:`DataFrame.rename_axis` where passing ``None`` failed to remove the axis name (:issue:`25034`)
 - Fixed regression in :func:`to_timedelta` with `box=False` incorrectly returning a ``datetime64`` object instead of a ``timedelta64`` object (:issue:`24961`)
+- Fixed regression where custom hashable types could not be used as column keys in :meth:`DataFrame.set_index` (:issue:`24969`)
 
 .. _whatsnew_0241.bug_fixes:
 

diff --git a/pandas/core/frame.py b/pandas/core/frame.py
@@ -71,7 +71,7 @@
     is_iterator,
     is_sequence,
     is_named_tuple)
-from pandas.core.dtypes.generic import ABCSeries, ABCIndexClass, ABCMultiIndex
+from pandas.core.dtypes.generic import ABCSeries, ABCIndexClass
 from pandas.core.dtypes.missing import isna, notna
 
 from pandas.core import algorithms
@@ -4138,33 +4138,8 @@ def set_index(self, keys, drop=True, append=False, inplace=False,
         4 16     10  2014    31
         """
         inplace = validate_bool_kwarg(inplace, 'inplace')
-
-        err_msg = ('The parameter "keys" may be a column key, one-dimensional '
-                   'array, or a list containing only valid column keys and '
-                   'one-dimensional arrays.')
-
-        if (is_scalar(keys) or isinstance(keys, tuple)
-                or isinstance(keys, (ABCIndexClass, ABCSeries, np.ndarray))):
-            # make sure we have a container of keys/arrays we can iterate over
-            # tuples can appear as valid column keys!
+        if not isinstance(keys, list):
             keys = [keys]
-        elif not isinstance(keys, list):
-            raise ValueError(err_msg)
-
-        missing = []
-        for col in keys:
-            if (is_scalar(col) or isinstance(col, tuple)):
-                # if col is a valid column key, everything is fine
-                # tuples are always considered keys, never as list-likes
-                if col not in self:
-                    missing.append(col)
-            elif (not isinstance(col, (ABCIndexClass, ABCSeries,
-                                       np.ndarray, list))
-                  or getattr(col, 'ndim', 1) > 1):
-                raise ValueError(err_msg)
-
-        if missing:
-            raise KeyError('{}'.format(missing))
 
         if inplace:
             frame = self
@@ -4175,31 +4150,37 @@ def set_index(self, keys, drop=True, append=False, inplace=False,
         names = []
         if append:
             names = [x for x in self.index.names]
-            if isinstance(self.index, ABCMultiIndex):
+            if isinstance(self.index, MultiIndex):
                 for i in range(self.index.nlevels):
                     arrays.append(self.index._get_level_values(i))
             else:
                 arrays.append(self.index)
 
         to_remove = []
         for col in keys:
-            if isinstance(col, ABCMultiIndex):
-                for n in range(col.nlevels):
+            if isinstance(col, MultiIndex):
+                # append all but the last column so we don't have to modify
+                # the end of this loop
+                for n in range(col.nlevels - 1):
                     arrays.append(col._get_level_values(n))
+
+                level = col._get_level_values(col.nlevels - 1)
                 names.extend(col.names)
-            elif isinstance(col, (ABCIndexClass, ABCSeries)):
-                # if Index then not MultiIndex (treated above)
-                arrays.append(col)
+            elif isinstance(col, Series):
+                level = col._values
+                names.append(col.name)
+            elif isinstance(col, Index):
+                level = col
                 names.append(col.name)
-            elif isinstance(col, (list, np.ndarray)):
-                arrays.append(col)
+            elif isinstance(col, (list, np.ndarray, Index)):
+                level = col
                 names.append(None)
-            # from here, col can only be a column label
             else:
-                arrays.append(frame[col]._values)
+                level = frame[col]._values
                 names.append(col)
                 if drop:
                     to_remove.append(col)
+            arrays.append(level)
 
         index = ensure_index_from_sequences(arrays, names)
 
@@ -4208,8 +4189,7 @@ def set_index(self, keys, drop=True, append=False, inplace=False,
             raise ValueError('Index has duplicate keys: {dup}'.format(
                 dup=duplicates))
 
-        # use set to handle duplicate column names gracefully in case of drop
-        for c in set(to_remove):
+        for c in to_remove:
             del frame[c]
 
         # clear up memory usage

diff --git a/pandas/tests/frame/test_alter_axes.py b/pandas/tests/frame/test_alter_axes.py
@@ -193,6 +193,11 @@ def test_set_index_pass_arrays_duplicate(self, frame_of_index_cols, drop,
         df.index.name = index_name
 
         keys = [box1(df['A']), box2(df['A'])]
+
+        # == gives ambiguous Boolean for Series
+        if drop and keys[0] is 'A' and keys[1] is 'A':
+            pytest.xfail(reason='broken due to reversion, see GH 25085')
+
         result = df.set_index(keys, drop=drop, append=append)
 
         # need to adapt first drop for case that both keys are 'A' --
@@ -253,23 +258,129 @@ def test_set_index_raise_keys(self, frame_of_index_cols, drop, append):
             df.set_index(['A', df['A'], tuple(df['A'])],
                          drop=drop, append=append)
 
+    @pytest.mark.xfail(reason='broken due to revert, see GH 25085')
     @pytest.mark.parametrize('append', [True, False])
     @pytest.mark.parametrize('drop', [True, False])
-    @pytest.mark.parametrize('box', [set, iter])
+    @pytest.mark.parametrize('box', [set, iter, lambda x: (y for y in x)],
+                             ids=['set', 'iter', 'generator'])
     def test_set_index_raise_on_type(self, frame_of_index_cols, box,
                                      drop, append):
         df = frame_of_index_cols
 
         msg = 'The parameter "keys" may be a column key, .*'
-        # forbidden type, e.g. set/tuple/iter
-        with pytest.raises(ValueError, match=msg):
+        # forbidden type, e.g. set/iter/generator
+        with pytest.raises(TypeError, match=msg):
             df.set_index(box(df['A']), drop=drop, append=append)
 
-        # forbidden type in list, e.g. set/tuple/iter
-        with pytest.raises(ValueError, match=msg):
+        # forbidden type in list, e.g. set/iter/generator
+        with pytest.raises(TypeError, match=msg):
             df.set_index(['A', df['A'], box(df['A'])],
                          drop=drop, append=append)
 
+    def test_set_index_custom_label_type(self):
+        # GH 24969
+
+        class Thing(object):
+            def __init__(self, name, color):
+                self.name = name
+                self.color = color
+
+            def __str__(self):
+                return "<Thing %r>" % (self.name,)
+
+            # necessary for pretty KeyError
+            __repr__ = __str__
+
+        thing1 = Thing('One', 'red')
+        thing2 = Thing('Two', 'blue')
+        df = DataFrame({thing1: [0, 1], thing2: [2, 3]})
+        expected = DataFrame({thing1: [0, 1]},
+                             index=Index([2, 3], name=thing2))
+
+        # use custom label directly
+        result = df.set_index(thing2)
+        tm.assert_frame_equal(result, expected)
+
+        # custom label wrapped in list
+        result = df.set_index([thing2])
+        tm.assert_frame_equal(result, expected)
+
+        # missing key
+        thing3 = Thing('Three', 'pink')
+        msg = "<Thing 'Three'>"
+        with pytest.raises(KeyError, match=msg):
+            # missing label directly
+            df.set_index(thing3)
+
+        with pytest.raises(KeyError, match=msg):
+            # missing label in list
+            df.set_index([thing3])
+
+    def test_set_index_custom_label_hashable_iterable(self):
+        # GH 24969
+
+        # actual example discussed in GH 24984 was e.g. for shapely.geometry
+        # objects (e.g. a collection of Points) that can be both hashable and
+        # iterable; using frozenset as a stand-in for testing here
+
+        class Thing(frozenset):
+            # need to stabilize repr for KeyError (due to random order in sets)
+            def __repr__(self):
+                tmp = sorted(list(self))
+                # double curly brace prints one brace in format string
+                return "frozenset({{{}}})".format(', '.join(map(repr, tmp)))
+
+        thing1 = Thing(['One', 'red'])
+        thing2 = Thing(['Two', 'blue'])
+        df = DataFrame({thing1: [0, 1], thing2: [2, 3]})
+        expected = DataFrame({thing1: [0, 1]},
+                             index=Index([2, 3], name=thing2))
+
+        # use custom label directly
+        result = df.set_index(thing2)
+        tm.assert_frame_equal(result, expected)
+
+        # custom label wrapped in list
+        result = df.set_index([thing2])
+        tm.assert_frame_equal(result, expected)
+
+        # missing key
+        thing3 = Thing(['Three', 'pink'])
+        msg = '.*'  # due to revert, see GH 25085
+        with pytest.raises(KeyError, match=msg):
+            # missing label directly
+            df.set_index(thing3)
+
+        with pytest.raises(KeyError, match=msg):
+            # missing label in list
+            df.set_index([thing3])
+
+    def test_set_index_custom_label_type_raises(self):
+        # GH 24969
+
+        # purposefully inherit from something unhashable
+        class Thing(set):
+            def __init__(self, name, color):
+                self.name = name
+                self.color = color
+
+            def __str__(self):
+                return "<Thing %r>" % (self.name,)
+
+        thing1 = Thing('One', 'red')
+        thing2 = Thing('Two', 'blue')
+        df = DataFrame([[0, 2], [1, 3]], columns=[thing1, thing2])
+
+        msg = 'unhashable type.*'
+
+        with pytest.raises(TypeError, match=msg):
+            # use custom label directly
+            df.set_index(thing2)
+
+        with pytest.raises(TypeError, match=msg):
+            # custom label wrapped in list
+            df.set_index([thing2])
+
     def test_construction_with_categorical_index(self):
         ci = tm.makeCategoricalIndex(10)
         ci.name = 'B'