From 9716fcb23b8b8d64189523621fba330c1d69aa4d Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Fri, 12 Aug 2022 11:17:08 -0700 Subject: [PATCH] ENH: set_index copy kwd (#48043) * ENH: set_index copy kwd * GH ref * mypy fixup --- doc/source/whatsnew/v1.5.0.rst | 1 + pandas/core/frame.py | 17 ++++++++++++++++- pandas/tests/frame/methods/test_set_index.py | 19 +++++++++++++++++++ 3 files changed, 36 insertions(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v1.5.0.rst b/doc/source/whatsnew/v1.5.0.rst index 48ca1fd9d16eb..b71d294b97f9a 100644 --- a/doc/source/whatsnew/v1.5.0.rst +++ b/doc/source/whatsnew/v1.5.0.rst @@ -293,6 +293,7 @@ Other enhancements - :meth:`RangeIndex.union` now can return a :class:`RangeIndex` instead of a :class:`Int64Index` if the resulting values are equally spaced (:issue:`47557`, :issue:`43885`) - :meth:`DataFrame.compare` now accepts an argument ``result_names`` to allow the user to specify the result's names of both left and right DataFrame which are being compared. This is by default ``'self'`` and ``'other'`` (:issue:`44354`) - :meth:`Series.add_suffix`, :meth:`DataFrame.add_suffix`, :meth:`Series.add_prefix` and :meth:`DataFrame.add_prefix` support a ``copy`` argument. If ``False``, the underlying data is not copied in the returned object (:issue:`47934`) +- :meth:`DataFrame.set_index` now supports a ``copy`` keyword. If ``False``, the underlying data is not copied when a new :class:`DataFrame` is returned (:issue:`48043`) .. --------------------------------------------------------------------------- .. _whatsnew_150.notable_bug_fixes: diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 0a7a6494d04eb..8c4924a2483be 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -5824,6 +5824,7 @@ def set_index( append: bool = ..., inplace: Literal[False] = ..., verify_integrity: bool = ..., + copy: bool | lib.NoDefault = ..., ) -> DataFrame: ... @@ -5836,6 +5837,7 @@ def set_index( append: bool = ..., inplace: Literal[True], verify_integrity: bool = ..., + copy: bool | lib.NoDefault = ..., ) -> None: ... @@ -5847,6 +5849,7 @@ def set_index( append: bool = False, inplace: bool = False, verify_integrity: bool = False, + copy: bool | lib.NoDefault = lib.no_default, ) -> DataFrame | None: """ Set the DataFrame index using existing columns. @@ -5873,6 +5876,11 @@ def set_index( Check the new index for duplicates. Otherwise defer the check until necessary. Setting to False will improve the performance of this method. + copy : bool, default True + Whether to make a copy of the underlying data when returning a new + DataFrame. + + .. versionadded:: 1.5.0 Returns ------- @@ -5938,6 +5946,13 @@ def set_index( 4 16 10 2014 31 """ inplace = validate_bool_kwarg(inplace, "inplace") + if inplace: + if copy is not lib.no_default: + raise ValueError("Cannot specify copy when inplace=True") + copy = False + elif copy is lib.no_default: + copy = True + self._check_inplace_and_allows_duplicate_labels(inplace) if not isinstance(keys, list): keys = [keys] @@ -5973,7 +5988,7 @@ def set_index( if inplace: frame = self else: - frame = self.copy() + frame = self.copy(deep=copy) arrays = [] names: list[Hashable] = [] diff --git a/pandas/tests/frame/methods/test_set_index.py b/pandas/tests/frame/methods/test_set_index.py index 4c39cf99f18ff..9392d3c146942 100644 --- a/pandas/tests/frame/methods/test_set_index.py +++ b/pandas/tests/frame/methods/test_set_index.py @@ -25,6 +25,25 @@ class TestSetIndex: + def test_set_index_copy(self): + # GH#48043 + df = DataFrame({"A": [1, 2], "B": [3, 4], "C": [5, 6]}) + expected = DataFrame({"B": [3, 4], "C": [5, 6]}, index=Index([1, 2], name="A")) + + res = df.set_index("A", copy=True) + tm.assert_frame_equal(res, expected) + assert not any(tm.shares_memory(df[col], res[col]) for col in res.columns) + + res = df.set_index("A", copy=False) + tm.assert_frame_equal(res, expected) + assert all(tm.shares_memory(df[col], res[col]) for col in res.columns) + + msg = "Cannot specify copy when inplace=True" + with pytest.raises(ValueError, match=msg): + df.set_index("A", inplace=True, copy=True) + with pytest.raises(ValueError, match=msg): + df.set_index("A", inplace=True, copy=False) + def test_set_index_multiindex(self): # segfault in GH#3308 d = {"t1": [2, 2.5, 3], "t2": [4, 5, 6]}