DOC: update DF.set_index (pandas-dev#24762)

h-vetinari · Jan 19, 2019 · e984947 · e984947
1 parent bd3c001
commit e984947
Show file tree

Hide file tree

Showing 3 changed files with 73 additions and 48 deletions.
diff --git a/doc/source/whatsnew/v0.24.0.rst b/doc/source/whatsnew/v0.24.0.rst
@@ -1147,8 +1147,8 @@ Other API Changes
 - :class:`pandas.io.formats.style.Styler` supports a ``number-format`` property when using :meth:`~pandas.io.formats.style.Styler.to_excel` (:issue:`22015`)
 - :meth:`DataFrame.corr` and :meth:`Series.corr` now raise a ``ValueError`` along with a helpful error message instead of a ``KeyError`` when supplied with an invalid method (:issue:`22298`)
 - :meth:`shift` will now always return a copy, instead of the previous behaviour of returning self when shifting by 0 (:issue:`22397`)
-- :meth:`DataFrame.set_index` now allows all one-dimensional list-likes, raises a ``TypeError`` for incorrect types,
-  has an improved ``KeyError`` message, and will not fail on duplicate column names with ``drop=True``. (:issue:`22484`)
+- :meth:`DataFrame.set_index` now gives a better (and less frequent) KeyError, raises a ``ValueError`` for incorrect types,
+  and will not fail on duplicate column names with ``drop=True``. (:issue:`22484`)
 - Slicing a single row of a DataFrame with multiple ExtensionArrays of the same type now preserves the dtype, rather than coercing to object (:issue:`22784`)
 - :class:`DateOffset` attribute `_cacheable` and method `_should_cache` have been removed (:issue:`23118`)
 - :meth:`Series.searchsorted`, when supplied a scalar value to search for, now returns a scalar instead of an array (:issue:`23801`).

diff --git a/pandas/core/frame.py b/pandas/core/frame.py
@@ -4042,12 +4042,16 @@ def set_index(self, keys, drop=True, append=False, inplace=False,
         Set the DataFrame index using existing columns.
 
         Set the DataFrame index (row labels) using one or more existing
-        columns. The index can replace the existing index or expand on it.
+        columns or arrays (of the correct length). The index can replace the
+        existing index or expand on it.
 
         Parameters
         ----------
-        keys : label or list of label
-            Name or names of the columns that will be used as the index.
+        keys : label or array-like or list of labels/arrays
+            This parameter can be either a single column key, a single array of
+            the same length as the calling DataFrame, or a list containing an
+            arbitrary combination of column keys and arrays. Here, "array"
+            encompasses :class:`Series`, :class:`Index` and ``np.ndarray``.
         drop : bool, default True
             Delete columns to be used as the new index.
         append : bool, default False
@@ -4092,7 +4096,7 @@ def set_index(self, keys, drop=True, append=False, inplace=False,
         7      2013    84
         10     2014    31
 
-        Create a multi-index using columns 'year' and 'month':
+        Create a MultiIndex using columns 'year' and 'month':
 
         >>> df.set_index(['year', 'month'])
                     sale
@@ -4102,35 +4106,51 @@ def set_index(self, keys, drop=True, append=False, inplace=False,
         2013  7     84
         2014  10    31
 
-        Create a multi-index using a set of values and a column:
+        Create a MultiIndex using an Index and a column:
 
-        >>> df.set_index([[1, 2, 3, 4], 'year'])
+        >>> df.set_index([pd.Index([1, 2, 3, 4]), 'year'])
                  month  sale
            year
         1  2012  1      55
         2  2014  4      40
         3  2013  7      84
         4  2014  10     31
+
+        Create a MultiIndex using two Series:
+
+        >>> s = pd.Series([1, 2, 3, 4])
+        >>> df.set_index([s, s**2])
+              month  year  sale
+        1 1       1  2012    55
+        2 4       4  2014    40
+        3 9       7  2013    84
+        4 16     10  2014    31
         """
         inplace = validate_bool_kwarg(inplace, 'inplace')
-        if not isinstance(keys, list):
+
+        err_msg = ('The parameter "keys" may be a column key, one-dimensional '
+                   'array, or a list containing only valid column keys and '
+                   'one-dimensional arrays.')
+
+        if (is_scalar(keys) or isinstance(keys, tuple)
+                or isinstance(keys, (ABCIndexClass, ABCSeries, np.ndarray))):
+            # make sure we have a container of keys/arrays we can iterate over
+            # tuples can appear as valid column keys!
             keys = [keys]
+        elif not isinstance(keys, list):
+            raise ValueError(err_msg)
 
         missing = []
         for col in keys:
-            if (is_scalar(col) or isinstance(col, tuple)) and col in self:
-                # tuples can be both column keys or list-likes
-                # if they are valid column keys, everything is fine
-                continue
-            elif is_scalar(col) and col not in self:
-                # tuples that are not column keys are considered list-like,
-                # not considered missing
-                missing.append(col)
-            elif (not is_list_like(col, allow_sets=False)
+            if (is_scalar(col) or isinstance(col, tuple)):
+                # if col is a valid column key, everything is fine
+                # tuples are always considered keys, never as list-likes
+                if col not in self:
+                    missing.append(col)
+            elif (not isinstance(col, (ABCIndexClass, ABCSeries,
+                                       np.ndarray, list))
                   or getattr(col, 'ndim', 1) > 1):
-                raise TypeError('The parameter "keys" may only contain a '
-                                'combination of valid column keys and '
-                                'one-dimensional list-likes')
+                raise ValueError(err_msg)
 
         if missing:
             raise KeyError('{}'.format(missing))
@@ -4163,12 +4183,6 @@ def set_index(self, keys, drop=True, append=False, inplace=False,
             elif isinstance(col, (list, np.ndarray)):
                 arrays.append(col)
                 names.append(None)
-            elif (is_list_like(col)
-                  and not (isinstance(col, tuple) and col in self)):
-                # all other list-likes (but avoid valid column keys)
-                col = list(col)  # ensure iterator do not get read twice etc.
-                arrays.append(col)
-                names.append(None)
             # from here, col can only be a column label
             else:
                 arrays.append(frame[col]._values)

diff --git a/pandas/tests/frame/test_alter_axes.py b/pandas/tests/frame/test_alter_axes.py
@@ -118,7 +118,7 @@ def test_set_index_after_mutation(self):
     # Add list-of-list constructor because list is ambiguous -> lambda
     # also test index name if append=True (name is duplicate here for B)
     @pytest.mark.parametrize('box', [Series, Index, np.array,
-                                     list, tuple, iter, lambda x: [list(x)],
+                                     list, lambda x: [list(x)],
                                      lambda x: MultiIndex.from_arrays([x])])
     @pytest.mark.parametrize('append, index_name', [(True, None),
                              (True, 'B'), (True, 'test'), (False, None)])
@@ -135,7 +135,7 @@ def test_set_index_pass_single_array(self, frame_of_index_cols,
             with pytest.raises(KeyError, match=msg):
                 df.set_index(key, drop=drop, append=append)
         else:
-            # np.array/tuple/iter/list-of-list "forget" the name of B
+            # np.array/list-of-list "forget" the name of B
             name_mi = getattr(key, 'names', None)
             name = [getattr(key, 'name', None)] if name_mi is None else name_mi
 
@@ -150,8 +150,7 @@ def test_set_index_pass_single_array(self, frame_of_index_cols,
 
     # MultiIndex constructor does not work directly on Series -> lambda
     # also test index name if append=True (name is duplicate here for A & B)
-    @pytest.mark.parametrize('box', [Series, Index, np.array,
-                                     list, tuple, iter,
+    @pytest.mark.parametrize('box', [Series, Index, np.array, list,
                                      lambda x: MultiIndex.from_arrays([x])])
     @pytest.mark.parametrize('append, index_name',
                              [(True, None), (True, 'A'), (True, 'B'),
@@ -163,7 +162,7 @@ def test_set_index_pass_arrays(self, frame_of_index_cols,
         df.index.name = index_name
 
         keys = ['A', box(df['B'])]
-        # np.array/list/tuple/iter "forget" the name of B
+        # np.array/list "forget" the name of B
         names = ['A', None if box in [np.array, list, tuple, iter] else 'B']
 
         result = df.set_index(keys, drop=drop, append=append)
@@ -179,12 +178,10 @@ def test_set_index_pass_arrays(self, frame_of_index_cols,
     # MultiIndex constructor does not work directly on Series -> lambda
     # We also emulate a "constructor" for the label -> lambda
     # also test index name if append=True (name is duplicate here for A)
-    @pytest.mark.parametrize('box2', [Series, Index, np.array,
-                                      list, tuple, iter,
+    @pytest.mark.parametrize('box2', [Series, Index, np.array, list,
                                       lambda x: MultiIndex.from_arrays([x]),
                                       lambda x: x.name])
-    @pytest.mark.parametrize('box1', [Series, Index, np.array,
-                                      list, tuple, iter,
+    @pytest.mark.parametrize('box1', [Series, Index, np.array, list,
                                       lambda x: MultiIndex.from_arrays([x]),
                                       lambda x: x.name])
     @pytest.mark.parametrize('append, index_name', [(True, None),
@@ -198,17 +195,14 @@ def test_set_index_pass_arrays_duplicate(self, frame_of_index_cols, drop,
         keys = [box1(df['A']), box2(df['A'])]
         result = df.set_index(keys, drop=drop, append=append)
 
-        # if either box was iter, the content has been consumed; re-read it
-        keys = [box1(df['A']), box2(df['A'])]
-
         # need to adapt first drop for case that both keys are 'A' --
         # cannot drop the same column twice;
         # use "is" because == would give ambiguous Boolean error for containers
         first_drop = False if (keys[0] is 'A' and keys[1] is 'A') else drop
 
         # to test against already-tested behaviour, we add sequentially,
         # hence second append always True; must wrap keys in list, otherwise
-        # box = list would be illegal
+        # box = list would be interpreted as keys
         expected = df.set_index([keys[0]], drop=first_drop, append=append)
         expected = expected.set_index([keys[1]], drop=drop, append=True)
         tm.assert_frame_equal(result, expected)
@@ -238,7 +232,7 @@ def test_set_index_verify_integrity(self, frame_of_index_cols):
 
     @pytest.mark.parametrize('append', [True, False])
     @pytest.mark.parametrize('drop', [True, False])
-    def test_set_index_raise(self, frame_of_index_cols, drop, append):
+    def test_set_index_raise_keys(self, frame_of_index_cols, drop, append):
         df = frame_of_index_cols
 
         with pytest.raises(KeyError, match="['foo', 'bar', 'baz']"):
@@ -249,14 +243,31 @@ def test_set_index_raise(self, frame_of_index_cols, drop, append):
         with pytest.raises(KeyError, match='X'):
             df.set_index([df['A'], df['B'], 'X'], drop=drop, append=append)
 
-        msg = 'The parameter "keys" may only contain a combination of.*'
-        # forbidden type, e.g. set
-        with pytest.raises(TypeError, match=msg):
-            df.set_index(set(df['A']), drop=drop, append=append)
+        msg = "[('foo', 'foo', 'foo', 'bar', 'bar')]"
+        # tuples always raise KeyError
+        with pytest.raises(KeyError, match=msg):
+            df.set_index(tuple(df['A']), drop=drop, append=append)
+
+        # also within a list
+        with pytest.raises(KeyError, match=msg):
+            df.set_index(['A', df['A'], tuple(df['A'])],
+                         drop=drop, append=append)
+
+    @pytest.mark.parametrize('append', [True, False])
+    @pytest.mark.parametrize('drop', [True, False])
+    @pytest.mark.parametrize('box', [set, iter])
+    def test_set_index_raise_on_type(self, frame_of_index_cols, box,
+                                     drop, append):
+        df = frame_of_index_cols
+
+        msg = 'The parameter "keys" may be a column key, .*'
+        # forbidden type, e.g. set/tuple/iter
+        with pytest.raises(ValueError, match=msg):
+            df.set_index(box(df['A']), drop=drop, append=append)
 
-        # forbidden type in list, e.g. set
-        with pytest.raises(TypeError, match=msg):
-            df.set_index(['A', df['A'], set(df['A'])],
+        # forbidden type in list, e.g. set/tuple/iter
+        with pytest.raises(ValueError, match=msg):
+            df.set_index(['A', df['A'], box(df['A'])],
                          drop=drop, append=append)
 
     def test_construction_with_categorical_index(self):