Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

ENH: Add set_index to Series #22225

Closed
wants to merge 18 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions doc/source/whatsnew/v0.24.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -391,6 +391,7 @@ Other Enhancements
- :meth:`DataFrame.to_sql` now supports writing ``TIMESTAMP WITH TIME ZONE`` types for supported databases. For databases that don't support timezones, datetime data will be stored as timezone unaware local timestamps. See the :ref:`io.sql_datetime_data` for implications (:issue:`9086`).
- :func:`to_timedelta` now supports ISO-formatted timedelta strings (:issue:`21877`)
- :class:`Series` and :class:`DataFrame` now support :class:`Iterable` in constructor (:issue:`2193`)
- :class:`Series` has gained the method :meth:`Series.set_index`, which works like its :class:`DataFrame` counterpart :meth:`DataFrame.set_index` (:issue:`21684`)
- :class:`DatetimeIndex` has gained the :attr:`DatetimeIndex.timetz` attribute. This returns the local time with timezone information. (:issue:`21358`)
- :meth:`Timestamp.round`, :meth:`Timestamp.ceil`, and :meth:`Timestamp.floor` for :class:`DatetimeIndex` and :class:`Timestamp` now support an ``ambiguous`` argument for handling datetimes that are rounded to ambiguous times (:issue:`18946`)
- :meth:`Timestamp.round`, :meth:`Timestamp.ceil`, and :meth:`Timestamp.floor` for :class:`DatetimeIndex` and :class:`Timestamp` now support a ``nonexistent`` argument for handling datetimes that are rounded to nonexistent times. See :ref:`timeseries.timezone_nonexistent` (:issue:`22647`)
Expand Down
139 changes: 5 additions & 134 deletions pandas/core/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -69,7 +69,7 @@
is_iterator,
is_sequence,
is_named_tuple)
from pandas.core.dtypes.generic import ABCSeries, ABCIndexClass, ABCMultiIndex
from pandas.core.dtypes.generic import ABCSeries, ABCIndexClass
from pandas.core.dtypes.missing import isna, notna

from pandas.core import algorithms
Expand Down Expand Up @@ -4035,83 +4035,10 @@ def shift(self, periods=1, freq=None, axis=0, fill_value=None):
return super(DataFrame, self).shift(periods=periods, freq=freq,
axis=axis, fill_value=fill_value)

@Appender(NDFrame.set_index.__doc__)
def set_index(self, keys, drop=True, append=False, inplace=False,
verify_integrity=False):
"""
Set the DataFrame index using existing columns.

Set the DataFrame index (row labels) using one or more existing
columns. The index can replace the existing index or expand on it.

Parameters
----------
keys : label or list of label
Name or names of the columns that will be used as the index.
drop : bool, default True
Delete columns to be used as the new index.
append : bool, default False
Whether to append columns to existing index.
inplace : bool, default False
Modify the DataFrame in place (do not create a new object).
verify_integrity : bool, default False
Check the new index for duplicates. Otherwise defer the check until
necessary. Setting to False will improve the performance of this
method.

Returns
-------
DataFrame
Changed row labels.

See Also
--------
DataFrame.reset_index : Opposite of set_index.
DataFrame.reindex : Change to new indices or expand indices.
DataFrame.reindex_like : Change to same indices as other DataFrame.

Examples
--------
>>> df = pd.DataFrame({'month': [1, 4, 7, 10],
... 'year': [2012, 2014, 2013, 2014],
... 'sale': [55, 40, 84, 31]})
>>> df
month year sale
0 1 2012 55
1 4 2014 40
2 7 2013 84
3 10 2014 31

Set the index to become the 'month' column:

>>> df.set_index('month')
year sale
month
1 2012 55
4 2014 40
7 2013 84
10 2014 31

Create a multi-index using columns 'year' and 'month':

>>> df.set_index(['year', 'month'])
sale
year month
2012 1 55
2014 4 40
2013 7 84
2014 10 31

Create a multi-index using a set of values and a column:

>>> df.set_index([[1, 2, 3, 4], 'year'])
month sale
year
1 2012 1 55
2 2014 4 40
3 2013 7 84
4 2014 10 31
"""
inplace = validate_bool_kwarg(inplace, 'inplace')
if not isinstance(keys, list):
keys = [keys]

Expand All @@ -4134,65 +4061,9 @@ def set_index(self, keys, drop=True, append=False, inplace=False,
if missing:
raise KeyError('{}'.format(missing))

if inplace:
frame = self
else:
frame = self.copy()

arrays = []
names = []
if append:
names = [x for x in self.index.names]
if isinstance(self.index, ABCMultiIndex):
for i in range(self.index.nlevels):
arrays.append(self.index._get_level_values(i))
else:
arrays.append(self.index)

to_remove = []
for col in keys:
if isinstance(col, ABCMultiIndex):
for n in range(col.nlevels):
arrays.append(col._get_level_values(n))
names.extend(col.names)
elif isinstance(col, (ABCIndexClass, ABCSeries)):
# if Index then not MultiIndex (treated above)
arrays.append(col)
names.append(col.name)
elif isinstance(col, (list, np.ndarray)):
arrays.append(col)
names.append(None)
elif (is_list_like(col)
and not (isinstance(col, tuple) and col in self)):
# all other list-likes (but avoid valid column keys)
col = list(col) # ensure iterator do not get read twice etc.
arrays.append(col)
names.append(None)
# from here, col can only be a column label
else:
arrays.append(frame[col]._values)
names.append(col)
if drop:
to_remove.append(col)

index = ensure_index_from_sequences(arrays, names)

if verify_integrity and not index.is_unique:
duplicates = index[index.duplicated()].unique()
raise ValueError('Index has duplicate keys: {dup}'.format(
dup=duplicates))

# use set to handle duplicate column names gracefully in case of drop
for c in set(to_remove):
del frame[c]

# clear up memory usage
index._cleanup()

frame.index = index

if not inplace:
return frame
return super(DataFrame, self).set_index(
keys=keys, drop=drop, append=append, inplace=inplace,
verify_integrity=verify_integrity)

def reset_index(self, level=None, drop=False, inplace=False, col_level=0,
col_fill=''):
Expand Down
155 changes: 153 additions & 2 deletions pandas/core/generic.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,8 @@
is_extension_array_dtype, is_integer, is_list_like, is_number,
is_numeric_dtype, is_object_dtype, is_period_arraylike, is_re_compilable,
is_scalar, is_timedelta64_dtype, pandas_dtype)
from pandas.core.dtypes.generic import ABCDataFrame, ABCPanel, ABCSeries
from pandas.core.dtypes.generic import (
ABCDataFrame, ABCIndexClass, ABCMultiIndex, ABCPanel, ABCSeries)
from pandas.core.dtypes.inference import is_hashable
from pandas.core.dtypes.missing import isna, notna

Expand All @@ -39,7 +40,8 @@
from pandas.core.base import PandasObject, SelectionMixin
import pandas.core.common as com
from pandas.core.index import (
Index, InvalidIndexError, MultiIndex, RangeIndex, ensure_index)
Index, InvalidIndexError, MultiIndex, RangeIndex, ensure_index,
ensure_index_from_sequences)
from pandas.core.indexes.datetimes import DatetimeIndex
from pandas.core.indexes.period import Period, PeriodIndex
import pandas.core.indexing as indexing
Expand Down Expand Up @@ -629,6 +631,155 @@ def _set_axis(self, axis, labels):
self._data.set_axis(axis, labels)
self._clear_item_cache()

# Docstring template for ``set_index``. The ``%(params)s``, ``%(klass)s``,
# ``%(other_klass)s`` and ``%(examples)s`` placeholders are filled in by the
# ``@Substitution`` decorator applied to each concrete ``set_index`` method,
# and the template itself is attached via ``@Appender``.
_shared_docs['set_index'] = """
Set the index (row labels) using one or more given arrays (or labels).

Parameters
----------
%(params)s
append : bool, default False
Whether to append columns to existing index.
inplace : bool, default False
Modify the %(klass)s in place (do not create a new object).
verify_integrity : bool, default False
Check the new index for duplicates. Otherwise defer the check until
necessary. Setting to False will improve the performance of this
method.

Returns
-------
%(klass)s
The reindexed %(klass)s. Will be None if inplace is True.

See Also
--------
%(other_klass)s.set_index : Method adapted for %(other_klass)s.
%(klass)s.reset_index : Opposite of set_index.
%(klass)s.reindex : Change to new indices or expand indices.
%(klass)s.reindex_like : Change to same indices as other %(klass)s.

Examples
--------
%(examples)s
"""

@Substitution(
klass='DataFrame', other_klass='Series',
params=dedent("""\
keys : column label or list of column labels / arrays
Either a column label, Series, Index, MultiIndex, list, np.ndarray
or a list containing only column labels, Series, Index, MultiIndex,
list np.ndarray.
drop : bool, default True
Delete columns to be used as the new index."""),
examples=dedent("""\
>>> df = pd.DataFrame({'month': [1, 4, 7, 10],
... 'year': [2012, 2014, 2013, 2014],
... 'sale': [55, 40, 84, 31]})
>>> df
month year sale
0 1 2012 55
1 4 2014 40
2 7 2013 84
3 10 2014 31

Set the index to become the 'month' column:

>>> df.set_index('month')
year sale
month
1 2012 55
4 2014 40
7 2013 84
10 2014 31

Create a MultiIndex using columns 'year' and 'month':

>>> df.set_index(['year', 'month'])
sale
year month
2012 1 55
2014 4 40
2013 7 84
2014 10 31

Create a MultiIndex using a set of values and a column:

>>> df.set_index([[1, 2, 3, 4], 'year'])
month sale
year
1 2012 1 55
2 2014 4 40
3 2013 7 84
4 2014 10 31""")
)
@Appender(_shared_docs["set_index"])
def set_index(self, keys, drop=True, append=False, inplace=False,
              verify_integrity=False):
    # NOTE: the user-facing docstring is attached by the decorators on
    # this method, so only ``#`` comments are used here to keep the
    # generated documentation unchanged.
    #
    # ``keys`` is expected to have been checked already by
    # Series.set_index / DataFrame.set_index and to arrive as a list.

    inplace = validate_bool_kwarg(inplace, 'inplace')
    # Work on the caller's object only when asked to; otherwise on a copy.
    target = self if inplace else self.copy()

    level_values = []
    level_names = []
    if append:
        # Existing index levels go first, ahead of the new ones.
        current = self.index
        level_names = list(current.names)
        if isinstance(current, ABCMultiIndex):
            level_values.extend(current._get_level_values(pos)
                                for pos in range(current.nlevels))
        else:
            level_values.append(current)

    droppable = []
    for key in keys:
        if isinstance(key, ABCMultiIndex):
            # A MultiIndex contributes one array (and name) per level.
            level_values.extend(key._get_level_values(pos)
                                for pos in range(key.nlevels))
            level_names.extend(key.names)
        elif isinstance(key, (ABCIndexClass, ABCSeries)):
            # Any Index reaching this branch is not a MultiIndex
            # (that case is handled above).
            level_values.append(key)
            level_names.append(key.name)
        elif isinstance(key, (list, np.ndarray)):
            level_values.append(key)
            level_names.append(None)
        elif (is_list_like(key)
              and (not isinstance(key, tuple) or key not in self)):
            # Remaining list-likes, except tuples that are valid labels.
            # Materialise once so an iterator is never consumed twice.
            level_values.append(list(key))
            level_names.append(None)
        else:
            # Anything left is treated as a label to look up on the object.
            level_values.append(target[key]._values)
            level_names.append(key)
            if drop:
                droppable.append(key)

    new_index = ensure_index_from_sequences(level_values, level_names)

    if verify_integrity and not new_index.is_unique:
        dupes = list(new_index[new_index.duplicated()])
        raise ValueError('Index has duplicate keys: {dup}'.format(
            dup=dupes))

    # De-duplicate the labels first so a repeated label is deleted once.
    for label in set(droppable):
        del target[label]

    # clear up memory usage
    new_index._cleanup()

    target.index = new_index

    if inplace:
        return None
    return target

def transpose(self, *args, **kwargs):
"""
Permute the dimensions of the %(klass)s
Expand Down
Loading