Skip to content

Commit

Permalink
API: better error-handling for df.set_index (pandas-dev#22486)
Browse files Browse the repository at this point in the history
  • Loading branch information
h-vetinari authored and tm9k1 committed Nov 19, 2018
1 parent 5ed86ce commit b0966b3
Show file tree
Hide file tree
Showing 4 changed files with 101 additions and 50 deletions.
2 changes: 2 additions & 0 deletions doc/source/whatsnew/v0.24.0.txt
Original file line number Diff line number Diff line change
Expand Up @@ -806,6 +806,8 @@ Other API Changes
- :class:`pandas.io.formats.style.Styler` supports a ``number-format`` property when using :meth:`~pandas.io.formats.style.Styler.to_excel` (:issue:`22015`)
- :meth:`DataFrame.corr` and :meth:`Series.corr` now raise a ``ValueError`` along with a helpful error message instead of a ``KeyError`` when supplied with an invalid method (:issue:`22298`)
- :meth:`shift` will now always return a copy, instead of the previous behaviour of returning self when shifting by 0 (:issue:`22397`)
- :meth:`DataFrame.set_index` now allows all one-dimensional list-likes, raises a ``TypeError`` for incorrect types,
has an improved ``KeyError`` message, and will not fail on duplicate column names with ``drop=True``. (:issue:`22484`)
- Slicing a single row of a DataFrame with multiple ExtensionArrays of the same type now preserves the dtype, rather than coercing to object (:issue:`22784`)
- :class:`DateOffset` attribute `_cacheable` and method `_should_cache` have been removed (:issue:`23118`)

Expand Down
55 changes: 38 additions & 17 deletions pandas/core/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -73,6 +73,7 @@
is_sequence,
is_named_tuple)
from pandas.core.dtypes.concat import _get_sliced_frame_result_type
from pandas.core.dtypes.generic import ABCSeries, ABCIndexClass, ABCMultiIndex
from pandas.core.dtypes.missing import isna, notna

from pandas.core import algorithms
Expand Down Expand Up @@ -3988,6 +3989,25 @@ def set_index(self, keys, drop=True, append=False, inplace=False,
if not isinstance(keys, list):
keys = [keys]

missing = []
for col in keys:
if (is_scalar(col) or isinstance(col, tuple)) and col in self:
# tuples can be both column keys or list-likes
# if they are valid column keys, everything is fine
continue
elif is_scalar(col) and col not in self:
# tuples that are not column keys are considered list-like,
# not considered missing
missing.append(col)
elif (not is_list_like(col, allow_sets=False)
or getattr(col, 'ndim', 1) > 1):
raise TypeError('The parameter "keys" may only contain a '
'combination of valid column keys and '
'one-dimensional list-likes')

if missing:
raise KeyError('{}'.format(missing))

if inplace:
frame = self
else:
Expand All @@ -3997,37 +4017,37 @@ def set_index(self, keys, drop=True, append=False, inplace=False,
names = []
if append:
names = [x for x in self.index.names]
if isinstance(self.index, MultiIndex):
if isinstance(self.index, ABCMultiIndex):
for i in range(self.index.nlevels):
arrays.append(self.index._get_level_values(i))
else:
arrays.append(self.index)

to_remove = []
for col in keys:
if isinstance(col, MultiIndex):
# append all but the last column so we don't have to modify
# the end of this loop
for n in range(col.nlevels - 1):
if isinstance(col, ABCMultiIndex):
for n in range(col.nlevels):
arrays.append(col._get_level_values(n))

level = col._get_level_values(col.nlevels - 1)
names.extend(col.names)
elif isinstance(col, Series):
level = col._values
names.append(col.name)
elif isinstance(col, Index):
level = col
elif isinstance(col, (ABCIndexClass, ABCSeries)):
# if Index then not MultiIndex (treated above)
arrays.append(col)
names.append(col.name)
elif isinstance(col, (list, np.ndarray, Index)):
level = col
elif isinstance(col, (list, np.ndarray)):
arrays.append(col)
names.append(None)
elif (is_list_like(col)
and not (isinstance(col, tuple) and col in self)):
# all other list-likes (but avoid valid column keys)
col = list(col) # ensure iterator do not get read twice etc.
arrays.append(col)
names.append(None)
# from here, col can only be a column label
else:
level = frame[col]._values
arrays.append(frame[col]._values)
names.append(col)
if drop:
to_remove.append(col)
arrays.append(level)

index = ensure_index_from_sequences(arrays, names)

Expand All @@ -4036,7 +4056,8 @@ def set_index(self, keys, drop=True, append=False, inplace=False,
raise ValueError('Index has duplicate keys: {dup}'.format(
dup=duplicates))

for c in to_remove:
# use set to handle duplicate column names gracefully in case of drop
for c in set(to_remove):
del frame[c]

# clear up memory usage
Expand Down
7 changes: 4 additions & 3 deletions pandas/tests/frame/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -211,12 +211,13 @@ def frame_of_index_cols():
"""
Fixture for DataFrame of columns that can be used for indexing
Columns are ['A', 'B', 'C', 'D', 'E']; 'A' & 'B' contain duplicates (but
are jointly unique), the rest are unique.
Columns are ['A', 'B', 'C', 'D', 'E', ('tuple', 'as', 'label')];
'A' & 'B' contain duplicates (but are jointly unique), the rest are unique.
"""
df = DataFrame({'A': ['foo', 'foo', 'foo', 'bar', 'bar'],
'B': ['one', 'two', 'three', 'one', 'two'],
'C': ['a', 'b', 'c', 'd', 'e'],
'D': np.random.randn(5),
'E': np.random.randn(5)})
'E': np.random.randn(5),
('tuple', 'as', 'label'): np.random.randn(5)})
return df
87 changes: 57 additions & 30 deletions pandas/tests/frame/test_alter_axes.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,8 @@ def test_set_index_cast(self):
tm.assert_frame_equal(df, df2)

# A has duplicate values, C does not
@pytest.mark.parametrize('keys', ['A', 'C', ['A', 'B']])
@pytest.mark.parametrize('keys', ['A', 'C', ['A', 'B'],
('tuple', 'as', 'label')])
@pytest.mark.parametrize('inplace', [True, False])
@pytest.mark.parametrize('drop', [True, False])
def test_set_index_drop_inplace(self, frame_of_index_cols,
Expand All @@ -72,7 +73,8 @@ def test_set_index_drop_inplace(self, frame_of_index_cols,
tm.assert_frame_equal(result, expected)

# A has duplicate values, C does not
@pytest.mark.parametrize('keys', ['A', 'C', ['A', 'B']])
@pytest.mark.parametrize('keys', ['A', 'C', ['A', 'B'],
('tuple', 'as', 'label')])
@pytest.mark.parametrize('drop', [True, False])
def test_set_index_append(self, frame_of_index_cols, drop, keys):
df = frame_of_index_cols
Expand All @@ -88,7 +90,8 @@ def test_set_index_append(self, frame_of_index_cols, drop, keys):
tm.assert_frame_equal(result, expected)

# A has duplicate values, C does not
@pytest.mark.parametrize('keys', ['A', 'C', ['A', 'B']])
@pytest.mark.parametrize('keys', ['A', 'C', ['A', 'B'],
('tuple', 'as', 'label')])
@pytest.mark.parametrize('drop', [True, False])
def test_set_index_append_to_multiindex(self, frame_of_index_cols,
drop, keys):
Expand All @@ -114,8 +117,10 @@ def test_set_index_after_mutation(self):
tm.assert_frame_equal(result, expected)

# MultiIndex constructor does not work directly on Series -> lambda
# Add list-of-list constructor because list is ambiguous -> lambda
# also test index name if append=True (name is duplicate here for B)
@pytest.mark.parametrize('box', [Series, Index, np.array,
list, tuple, iter, lambda x: [list(x)],
lambda x: MultiIndex.from_arrays([x])])
@pytest.mark.parametrize('append, index_name', [(True, None),
(True, 'B'), (True, 'test'), (False, None)])
Expand All @@ -126,21 +131,29 @@ def test_set_index_pass_single_array(self, frame_of_index_cols,
df.index.name = index_name

key = box(df['B'])
# np.array and list "forget" the name of B
name = [None if box in [np.array, list] else 'B']
if box == list:
# list of strings gets interpreted as list of keys
msg = "['one', 'two', 'three', 'one', 'two']"
with tm.assert_raises_regex(KeyError, msg):
df.set_index(key, drop=drop, append=append)
else:
# np.array/tuple/iter/list-of-list "forget" the name of B
name_mi = getattr(key, 'names', None)
name = [getattr(key, 'name', None)] if name_mi is None else name_mi

result = df.set_index(key, drop=drop, append=append)
result = df.set_index(key, drop=drop, append=append)

# only valid column keys are dropped
# since B is always passed as array above, nothing is dropped
expected = df.set_index(['B'], drop=False, append=append)
expected.index.names = [index_name] + name if append else name
# only valid column keys are dropped
# since B is always passed as array above, nothing is dropped
expected = df.set_index(['B'], drop=False, append=append)
expected.index.names = [index_name] + name if append else name

tm.assert_frame_equal(result, expected)
tm.assert_frame_equal(result, expected)

# MultiIndex constructor does not work directly on Series -> lambda
# also test index name if append=True (name is duplicate here for A & B)
@pytest.mark.parametrize('box', [Series, Index, np.array, list,
@pytest.mark.parametrize('box', [Series, Index, np.array,
list, tuple, iter,
lambda x: MultiIndex.from_arrays([x])])
@pytest.mark.parametrize('append, index_name',
[(True, None), (True, 'A'), (True, 'B'),
Expand All @@ -152,8 +165,8 @@ def test_set_index_pass_arrays(self, frame_of_index_cols,
df.index.name = index_name

keys = ['A', box(df['B'])]
# np.array and list "forget" the name of B
names = ['A', None if box in [np.array, list] else 'B']
# np.array/list/tuple/iter "forget" the name of B
names = ['A', None if box in [np.array, list, tuple, iter] else 'B']

result = df.set_index(keys, drop=drop, append=append)

Expand All @@ -168,10 +181,12 @@ def test_set_index_pass_arrays(self, frame_of_index_cols,
# MultiIndex constructor does not work directly on Series -> lambda
# We also emulate a "constructor" for the label -> lambda
# also test index name if append=True (name is duplicate here for A)
@pytest.mark.parametrize('box2', [Series, Index, np.array, list,
@pytest.mark.parametrize('box2', [Series, Index, np.array,
list, tuple, iter,
lambda x: MultiIndex.from_arrays([x]),
lambda x: x.name])
@pytest.mark.parametrize('box1', [Series, Index, np.array, list,
@pytest.mark.parametrize('box1', [Series, Index, np.array,
list, tuple, iter,
lambda x: MultiIndex.from_arrays([x]),
lambda x: x.name])
@pytest.mark.parametrize('append, index_name', [(True, None),
Expand All @@ -183,21 +198,22 @@ def test_set_index_pass_arrays_duplicate(self, frame_of_index_cols, drop,
df.index.name = index_name

keys = [box1(df['A']), box2(df['A'])]
result = df.set_index(keys, drop=drop, append=append)

# == gives ambiguous Boolean for Series
if drop and keys[0] is 'A' and keys[1] is 'A':
with tm.assert_raises_regex(KeyError, '.*'):
df.set_index(keys, drop=drop, append=append)
else:
result = df.set_index(keys, drop=drop, append=append)
# if either box was iter, the content has been consumed; re-read it
keys = [box1(df['A']), box2(df['A'])]

# to test against already-tested behavior, we add sequentially,
# hence second append always True; must wrap in list, otherwise
# list-box will be illegal
expected = df.set_index([keys[0]], drop=drop, append=append)
expected = expected.set_index([keys[1]], drop=drop, append=True)
# need to adapt first drop for case that both keys are 'A' --
# cannot drop the same column twice;
# use "is" because == would give ambiguous Boolean error for containers
first_drop = False if (keys[0] is 'A' and keys[1] is 'A') else drop

tm.assert_frame_equal(result, expected)
# to test against already-tested behaviour, we add sequentially,
# hence second append always True; must wrap keys in list, otherwise
# box = list would be illegal
expected = df.set_index([keys[0]], drop=first_drop, append=append)
expected = expected.set_index([keys[1]], drop=drop, append=True)
tm.assert_frame_equal(result, expected)

@pytest.mark.parametrize('append', [True, False])
@pytest.mark.parametrize('drop', [True, False])
Expand Down Expand Up @@ -229,13 +245,24 @@ def test_set_index_verify_integrity(self, frame_of_index_cols):
def test_set_index_raise(self, frame_of_index_cols, drop, append):
df = frame_of_index_cols

with tm.assert_raises_regex(KeyError, '.*'): # column names are A-E
with tm.assert_raises_regex(KeyError, "['foo', 'bar', 'baz']"):
# column names are A-E, as well as one tuple
df.set_index(['foo', 'bar', 'baz'], drop=drop, append=append)

# non-existent key in list with arrays
with tm.assert_raises_regex(KeyError, '.*'):
with tm.assert_raises_regex(KeyError, 'X'):
df.set_index([df['A'], df['B'], 'X'], drop=drop, append=append)

msg = 'The parameter "keys" may only contain a combination of.*'
# forbidden type, e.g. set
with tm.assert_raises_regex(TypeError, msg):
df.set_index(set(df['A']), drop=drop, append=append)

# forbidden type in list, e.g. set
with tm.assert_raises_regex(TypeError, msg):
df.set_index(['A', df['A'], set(df['A'])],
drop=drop, append=append)

def test_construction_with_categorical_index(self):
ci = tm.makeCategoricalIndex(10)
ci.name = 'B'
Expand Down

0 comments on commit b0966b3

Please sign in to comment.