API: better warnings for df.set_index

pandas-dev · Sep 15, 2018 · 229e72d · 229e72d
1 parent 3745576
commit 229e72d
Show file tree

Hide file tree

Showing 4 changed files with 674 additions and 423 deletions.
diff --git a/doc/source/whatsnew/v0.24.0.txt b/doc/source/whatsnew/v0.24.0.txt
@@ -545,6 +545,7 @@ Other API Changes
 - :class:`pandas.io.formats.style.Styler` supports a ``number-format`` property when using :meth:`~pandas.io.formats.style.Styler.to_excel` (:issue:`22015`)
 - :meth:`DataFrame.corr` and :meth:`Series.corr` now raise a ``ValueError`` along with a helpful error message instead of a ``KeyError`` when supplied with an invalid method (:issue:`22298`)
 - :meth:`shift` will now always return a copy, instead of the previous behaviour of returning self when shifting by 0 (:issue:`22397`)
+- :meth:`DataFrame.set_index` now raises a ``TypeError`` for incorrect types, has an improved ``KeyError`` message, and will not fail on duplicate column names with ``drop=True``. (:issue:`22484`)
 
 .. _whatsnew_0240.deprecations:
 

diff --git a/pandas/core/frame.py b/pandas/core/frame.py
@@ -60,6 +60,7 @@
     is_sequence,
     is_named_tuple)
 from pandas.core.dtypes.concat import _get_sliced_frame_result_type
+from pandas.core.dtypes.generic import ABCSeries, ABCIndexClass, ABCMultiIndex
 from pandas.core.dtypes.missing import isna, notna
 
 
@@ -3892,6 +3893,22 @@ def set_index(self, keys, drop=True, append=False, inplace=False,
         if not isinstance(keys, list):
             keys = [keys]
 
+        missing = []
+        for x in keys:
+            if not (is_scalar(x) or isinstance(x, tuple)):
+                if not isinstance(x, (ABCSeries, ABCIndexClass, ABCMultiIndex,
+                                      list, np.ndarray)):
+                    raise TypeError('keys may only contain a combination of '
+                                    'the following: valid column keys, '
+                                    'Series, Index, MultiIndex, list or '
+                                    'np.ndarray')
+            else:
+                if x not in self:
+                    missing.append(x)
+
+        if missing:
+            raise KeyError('{}'.format(missing))
+
         if inplace:
             frame = self
         else:
@@ -3901,37 +3918,34 @@ def set_index(self, keys, drop=True, append=False, inplace=False,
         names = []
         if append:
             names = [x for x in self.index.names]
-            if isinstance(self.index, MultiIndex):
+            if isinstance(self.index, ABCMultiIndex):
                 for i in range(self.index.nlevels):
                     arrays.append(self.index._get_level_values(i))
             else:
                 arrays.append(self.index)
 
         to_remove = []
         for col in keys:
-            if isinstance(col, MultiIndex):
-                # append all but the last column so we don't have to modify
-                # the end of this loop
-                for n in range(col.nlevels - 1):
+            if isinstance(col, ABCMultiIndex):
+                for n in range(col.nlevels):
                     arrays.append(col._get_level_values(n))
-
-                level = col._get_level_values(col.nlevels - 1)
                 names.extend(col.names)
-            elif isinstance(col, Series):
-                level = col._values
+            elif isinstance(col, ABCIndexClass):
+                # Index but not MultiIndex (treated above)
+                arrays.append(col)
                 names.append(col.name)
-            elif isinstance(col, Index):
-                level = col
+            elif isinstance(col, ABCSeries):
+                arrays.append(col._values)
                 names.append(col.name)
-            elif isinstance(col, (list, np.ndarray, Index)):
-                level = col
+            elif isinstance(col, (list, np.ndarray)):
+                arrays.append(col)
                 names.append(None)
+            # from here, col can only be a column label
             else:
-                level = frame[col]._values
+                arrays.append(frame[col]._values)
                 names.append(col)
                 if drop:
                     to_remove.append(col)
-            arrays.append(level)
 
         index = ensure_index_from_sequences(arrays, names)
 
@@ -3940,7 +3954,8 @@ def set_index(self, keys, drop=True, append=False, inplace=False,
             raise ValueError('Index has duplicate keys: {dup}'.format(
                 dup=duplicates))
 
-        for c in to_remove:
+        # use set to handle duplicate column names gracefully in case of drop
+        for c in set(to_remove):
             del frame[c]
 
         # clear up memory usage

diff --git a/pandas/tests/frame/conftest.py b/pandas/tests/frame/conftest.py
@@ -0,0 +1,191 @@
+import pytest
+
+import numpy as np
+
+from pandas import compat
+import pandas.util.testing as tm
+from pandas import DataFrame, date_range, NaT
+
+
+@pytest.fixture
+def float_frame():
+    """
+    Fixture for DataFrame of floats with index of unique strings
+
+    Columns are ['A', 'B', 'C', 'D'].
+    """
+    return DataFrame(tm.getSeriesData())
+
+
+@pytest.fixture
+def float_frame2():
+    """
+    Fixture for DataFrame of floats with index of unique strings
+
+    Columns are ['D', 'C', 'B', 'A']
+    """
+    return DataFrame(tm.getSeriesData(), columns=['D', 'C', 'B', 'A'])
+
+
+@pytest.fixture
+def int_frame():
+    """
+    Fixture for DataFrame of ints with index of unique strings
+
+    Columns are ['A', 'B', 'C', 'D']
+    """
+    df = DataFrame({k: v.astype(int)
+                   for k, v in compat.iteritems(tm.getSeriesData())})
+    # force these all to int64 to avoid platform testing issues
+    return DataFrame({c: s for c, s in compat.iteritems(df)}, dtype=np.int64)
+
+
+@pytest.fixture
+def datetime_frame():
+    """
+    Fixture for DataFrame of floats with DatetimeIndex
+
+    Columns are ['A', 'B', 'C', 'D']
+    """
+    return DataFrame(tm.getTimeSeriesData())
+
+
+@pytest.fixture
+def float_string_frame():
+    """
+    Fixture for DataFrame of floats and strings with index of unique strings
+
+    Columns are ['A', 'B', 'C', 'D', 'foo'].
+    """
+    df = DataFrame(tm.getSeriesData())
+    df['foo'] = 'bar'
+    return df
+
+
+@pytest.fixture
+def mixed_float_frame():
+    """
+    Fixture for DataFrame of different float types with index of unique strings
+
+    Columns are ['A', 'B', 'C', 'D'].
+    """
+    df = DataFrame(tm.getSeriesData())
+    df.A = df.A.astype('float16')
+    df.B = df.B.astype('float32')
+    df.C = df.C.astype('float64')
+    return df
+
+
+@pytest.fixture
+def mixed_float_frame2():
+    """
+    Fixture for DataFrame of different float types with index of unique strings
+
+    Columns are ['A', 'B', 'C', 'D'].
+    """
+    df = DataFrame(tm.getSeriesData())
+    df.D = df.D.astype('float16')
+    df.C = df.C.astype('float32')
+    df.B = df.B.astype('float64')
+    return df
+
+
+@pytest.fixture
+def mixed_int_frame():
+    """
+    Fixture for DataFrame of different int types with index of unique strings
+
+    Columns are ['A', 'B', 'C', 'D'].
+    """
+    df = DataFrame({k: v.astype(int)
+                   for k, v in compat.iteritems(tm.getSeriesData())})
+    df.A = df.A.astype('uint8')
+    df.B = df.B.astype('int32')
+    df.C = df.C.astype('int64')
+    df.D = np.ones(len(df.D), dtype='uint64')
+    return df
+
+
+@pytest.fixture
+def mixed_type_frame():
+    """
+    Fixture for DataFrame of float/int/string columns with RangeIndex
+
+    Columns are ['a', 'b', 'c', 'float32', 'int32'].
+    """
+    return DataFrame({'a': 1., 'b': 2, 'c': 'foo',
+                      'float32': np.array([1.] * 10, dtype='float32'),
+                      'int32': np.array([1] * 10, dtype='int32')},
+                     index=np.arange(10))
+
+
+@pytest.fixture
+def timezone_frame():
+    """
+    Fixture for DataFrame of date_range Series with different time zones
+
+    Columns are ['A', 'B', 'C']; some entries are missing
+    """
+    df = DataFrame({'A': date_range('20130101', periods=3),
+                    'B': date_range('20130101', periods=3,
+                                    tz='US/Eastern'),
+                    'C': date_range('20130101', periods=3,
+                                    tz='CET')})
+    df.iloc[1, 1] = NaT
+    df.iloc[1, 2] = NaT
+    return df
+
+
+@pytest.fixture
+def empty_frame():
+    """
+    Fixture for empty DataFrame
+    """
+    return DataFrame({})
+
+
+@pytest.fixture
+def datetime_series():
+    """
+    Fixture for Series of floats with DatetimeIndex
+    """
+    return tm.makeTimeSeries(nper=30)
+
+
+@pytest.fixture
+def datetime_series_short():
+    """
+    Fixture for Series of floats with DatetimeIndex
+    """
+    return tm.makeTimeSeries(nper=30)[5:]
+
+
+@pytest.fixture
+def simple_frame():
+    """
+    Fixture for simple 3x3 DataFrame
+
+    Columns are ['one', 'two', 'three'], index is ['a', 'b', 'c'].
+    """
+    arr = np.array([[1., 2., 3.],
+                    [4., 5., 6.],
+                    [7., 8., 9.]])
+
+    return DataFrame(arr, columns=['one', 'two', 'three'],
+                     index=['a', 'b', 'c'])
+
+
+@pytest.fixture
+def frame_of_index_cols():
+    """
+    Fixture for DataFrame of columns that can be used for indexing
+
+    Columns are ['A', 'B', 'C', 'D', 'E']; 'A' & 'B' contain duplicates (but
+    are jointly unique), the rest are unique.
+    """
+    df = DataFrame({'A': ['foo', 'foo', 'foo', 'bar', 'bar'],
+                    'B': ['one', 'two', 'three', 'one', 'two'],
+                    'C': ['a', 'b', 'c', 'd', 'e'],
+                    'D': np.random.randn(5),
+                    'E': np.random.randn(5)})
+    return df