Skip to content

Commit

Permalink
DOC: update DF.set_index (pandas-dev#24762)
Browse files Browse the repository at this point in the history
  • Loading branch information
h-vetinari authored and jreback committed Jan 19, 2019
1 parent bd3c001 commit e984947
Show file tree
Hide file tree
Showing 3 changed files with 73 additions and 48 deletions.
4 changes: 2 additions & 2 deletions doc/source/whatsnew/v0.24.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -1147,8 +1147,8 @@ Other API Changes
- :class:`pandas.io.formats.style.Styler` supports a ``number-format`` property when using :meth:`~pandas.io.formats.style.Styler.to_excel` (:issue:`22015`)
- :meth:`DataFrame.corr` and :meth:`Series.corr` now raise a ``ValueError`` along with a helpful error message instead of a ``KeyError`` when supplied with an invalid method (:issue:`22298`)
- :meth:`shift` will now always return a copy, instead of the previous behaviour of returning self when shifting by 0 (:issue:`22397`)
- :meth:`DataFrame.set_index` now allows all one-dimensional list-likes, raises a ``TypeError`` for incorrect types,
has an improved ``KeyError`` message, and will not fail on duplicate column names with ``drop=True``. (:issue:`22484`)
- :meth:`DataFrame.set_index` now raises a ``KeyError`` less frequently and with a more informative message, raises a ``ValueError`` for incorrect types,
and will not fail on duplicate column names with ``drop=True``. (:issue:`22484`)
- Slicing a single row of a DataFrame with multiple ExtensionArrays of the same type now preserves the dtype, rather than coercing to object (:issue:`22784`)
- :class:`DateOffset` attribute `_cacheable` and method `_should_cache` have been removed (:issue:`23118`)
- :meth:`Series.searchsorted`, when supplied a scalar value to search for, now returns a scalar instead of an array (:issue:`23801`).
Expand Down
64 changes: 39 additions & 25 deletions pandas/core/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -4042,12 +4042,16 @@ def set_index(self, keys, drop=True, append=False, inplace=False,
Set the DataFrame index using existing columns.
Set the DataFrame index (row labels) using one or more existing
columns. The index can replace the existing index or expand on it.
columns or arrays (of the correct length). The index can replace the
existing index or expand on it.
Parameters
----------
keys : label or list of label
Name or names of the columns that will be used as the index.
keys : label or array-like or list of labels/arrays
This parameter can be either a single column key, a single array of
the same length as the calling DataFrame, or a list containing an
arbitrary combination of column keys and arrays. Here, "array"
encompasses :class:`Series`, :class:`Index` and ``np.ndarray``.
drop : bool, default True
Delete columns to be used as the new index.
append : bool, default False
Expand Down Expand Up @@ -4092,7 +4096,7 @@ def set_index(self, keys, drop=True, append=False, inplace=False,
7 2013 84
10 2014 31
Create a multi-index using columns 'year' and 'month':
Create a MultiIndex using columns 'year' and 'month':
>>> df.set_index(['year', 'month'])
sale
Expand All @@ -4102,35 +4106,51 @@ def set_index(self, keys, drop=True, append=False, inplace=False,
2013 7 84
2014 10 31
Create a multi-index using a set of values and a column:
Create a MultiIndex using an Index and a column:
>>> df.set_index([[1, 2, 3, 4], 'year'])
>>> df.set_index([pd.Index([1, 2, 3, 4]), 'year'])
month sale
year
1 2012 1 55
2 2014 4 40
3 2013 7 84
4 2014 10 31
Create a MultiIndex using two Series:
>>> s = pd.Series([1, 2, 3, 4])
>>> df.set_index([s, s**2])
month year sale
1 1 1 2012 55
2 4 4 2014 40
3 9 7 2013 84
4 16 10 2014 31
"""
inplace = validate_bool_kwarg(inplace, 'inplace')
if not isinstance(keys, list):

err_msg = ('The parameter "keys" may be a column key, one-dimensional '
'array, or a list containing only valid column keys and '
'one-dimensional arrays.')

if (is_scalar(keys) or isinstance(keys, tuple)
or isinstance(keys, (ABCIndexClass, ABCSeries, np.ndarray))):
# make sure we have a container of keys/arrays we can iterate over
# tuples can appear as valid column keys!
keys = [keys]
elif not isinstance(keys, list):
raise ValueError(err_msg)

missing = []
for col in keys:
if (is_scalar(col) or isinstance(col, tuple)) and col in self:
# tuples can be both column keys or list-likes
# if they are valid column keys, everything is fine
continue
elif is_scalar(col) and col not in self:
# tuples that are not column keys are considered list-like,
# not considered missing
missing.append(col)
elif (not is_list_like(col, allow_sets=False)
if (is_scalar(col) or isinstance(col, tuple)):
# if col is a valid column key, everything is fine
# tuples are always considered keys, never as list-likes
if col not in self:
missing.append(col)
elif (not isinstance(col, (ABCIndexClass, ABCSeries,
np.ndarray, list))
or getattr(col, 'ndim', 1) > 1):
raise TypeError('The parameter "keys" may only contain a '
'combination of valid column keys and '
'one-dimensional list-likes')
raise ValueError(err_msg)

if missing:
raise KeyError('{}'.format(missing))
Expand Down Expand Up @@ -4163,12 +4183,6 @@ def set_index(self, keys, drop=True, append=False, inplace=False,
elif isinstance(col, (list, np.ndarray)):
arrays.append(col)
names.append(None)
elif (is_list_like(col)
and not (isinstance(col, tuple) and col in self)):
# all other list-likes (but avoid valid column keys)
col = list(col) # ensure iterator do not get read twice etc.
arrays.append(col)
names.append(None)
# from here, col can only be a column label
else:
arrays.append(frame[col]._values)
Expand Down
53 changes: 32 additions & 21 deletions pandas/tests/frame/test_alter_axes.py
Original file line number Diff line number Diff line change
Expand Up @@ -118,7 +118,7 @@ def test_set_index_after_mutation(self):
# Add list-of-list constructor because list is ambiguous -> lambda
# also test index name if append=True (name is duplicate here for B)
@pytest.mark.parametrize('box', [Series, Index, np.array,
list, tuple, iter, lambda x: [list(x)],
list, lambda x: [list(x)],
lambda x: MultiIndex.from_arrays([x])])
@pytest.mark.parametrize('append, index_name', [(True, None),
(True, 'B'), (True, 'test'), (False, None)])
Expand All @@ -135,7 +135,7 @@ def test_set_index_pass_single_array(self, frame_of_index_cols,
with pytest.raises(KeyError, match=msg):
df.set_index(key, drop=drop, append=append)
else:
# np.array/tuple/iter/list-of-list "forget" the name of B
# np.array/list-of-list "forget" the name of B
name_mi = getattr(key, 'names', None)
name = [getattr(key, 'name', None)] if name_mi is None else name_mi

Expand All @@ -150,8 +150,7 @@ def test_set_index_pass_single_array(self, frame_of_index_cols,

# MultiIndex constructor does not work directly on Series -> lambda
# also test index name if append=True (name is duplicate here for A & B)
@pytest.mark.parametrize('box', [Series, Index, np.array,
list, tuple, iter,
@pytest.mark.parametrize('box', [Series, Index, np.array, list,
lambda x: MultiIndex.from_arrays([x])])
@pytest.mark.parametrize('append, index_name',
[(True, None), (True, 'A'), (True, 'B'),
Expand All @@ -163,7 +162,7 @@ def test_set_index_pass_arrays(self, frame_of_index_cols,
df.index.name = index_name

keys = ['A', box(df['B'])]
# np.array/list/tuple/iter "forget" the name of B
# np.array/list "forget" the name of B
names = ['A', None if box in [np.array, list, tuple, iter] else 'B']

result = df.set_index(keys, drop=drop, append=append)
Expand All @@ -179,12 +178,10 @@ def test_set_index_pass_arrays(self, frame_of_index_cols,
# MultiIndex constructor does not work directly on Series -> lambda
# We also emulate a "constructor" for the label -> lambda
# also test index name if append=True (name is duplicate here for A)
@pytest.mark.parametrize('box2', [Series, Index, np.array,
list, tuple, iter,
@pytest.mark.parametrize('box2', [Series, Index, np.array, list,
lambda x: MultiIndex.from_arrays([x]),
lambda x: x.name])
@pytest.mark.parametrize('box1', [Series, Index, np.array,
list, tuple, iter,
@pytest.mark.parametrize('box1', [Series, Index, np.array, list,
lambda x: MultiIndex.from_arrays([x]),
lambda x: x.name])
@pytest.mark.parametrize('append, index_name', [(True, None),
Expand All @@ -198,17 +195,14 @@ def test_set_index_pass_arrays_duplicate(self, frame_of_index_cols, drop,
keys = [box1(df['A']), box2(df['A'])]
result = df.set_index(keys, drop=drop, append=append)

# if either box was iter, the content has been consumed; re-read it
keys = [box1(df['A']), box2(df['A'])]

# need to adapt first drop for case that both keys are 'A' --
# cannot drop the same column twice;
# use "is" because == would give ambiguous Boolean error for containers
first_drop = False if (keys[0] is 'A' and keys[1] is 'A') else drop

# to test against already-tested behaviour, we add sequentially,
# hence second append always True; must wrap keys in list, otherwise
# box = list would be illegal
# box = list would be interpreted as keys
expected = df.set_index([keys[0]], drop=first_drop, append=append)
expected = expected.set_index([keys[1]], drop=drop, append=True)
tm.assert_frame_equal(result, expected)
Expand Down Expand Up @@ -238,7 +232,7 @@ def test_set_index_verify_integrity(self, frame_of_index_cols):

@pytest.mark.parametrize('append', [True, False])
@pytest.mark.parametrize('drop', [True, False])
def test_set_index_raise(self, frame_of_index_cols, drop, append):
def test_set_index_raise_keys(self, frame_of_index_cols, drop, append):
df = frame_of_index_cols

with pytest.raises(KeyError, match="['foo', 'bar', 'baz']"):
Expand All @@ -249,14 +243,31 @@ def test_set_index_raise(self, frame_of_index_cols, drop, append):
with pytest.raises(KeyError, match='X'):
df.set_index([df['A'], df['B'], 'X'], drop=drop, append=append)

msg = 'The parameter "keys" may only contain a combination of.*'
# forbidden type, e.g. set
with pytest.raises(TypeError, match=msg):
df.set_index(set(df['A']), drop=drop, append=append)
msg = "[('foo', 'foo', 'foo', 'bar', 'bar')]"
# tuples always raise KeyError
with pytest.raises(KeyError, match=msg):
df.set_index(tuple(df['A']), drop=drop, append=append)

# also within a list
with pytest.raises(KeyError, match=msg):
df.set_index(['A', df['A'], tuple(df['A'])],
drop=drop, append=append)

@pytest.mark.parametrize('append', [True, False])
@pytest.mark.parametrize('drop', [True, False])
@pytest.mark.parametrize('box', [set, iter])
def test_set_index_raise_on_type(self, frame_of_index_cols, box,
drop, append):
df = frame_of_index_cols

msg = 'The parameter "keys" may be a column key, .*'
# forbidden type, e.g. set/iter (tuples are treated as column keys)
with pytest.raises(ValueError, match=msg):
df.set_index(box(df['A']), drop=drop, append=append)

# forbidden type in list, e.g. set
with pytest.raises(TypeError, match=msg):
df.set_index(['A', df['A'], set(df['A'])],
# forbidden type in list, e.g. set/iter (tuples are treated as column keys)
with pytest.raises(ValueError, match=msg):
df.set_index(['A', df['A'], box(df['A'])],
drop=drop, append=append)

def test_construction_with_categorical_index(self):
Expand Down

0 comments on commit e984947

Please sign in to comment.