diff --git a/doc/source/whatsnew/v1.0.0.rst b/doc/source/whatsnew/v1.0.0.rst index f0ba1250b7f8d..5d925c1b235f8 100755 --- a/doc/source/whatsnew/v1.0.0.rst +++ b/doc/source/whatsnew/v1.0.0.rst @@ -752,7 +752,7 @@ Deprecations - The deprecated internal attributes ``_start``, ``_stop`` and ``_step`` of :class:`RangeIndex` now raise a ``FutureWarning`` instead of a ``DeprecationWarning`` (:issue:`26581`) - The ``pandas.util.testing`` module has been deprecated. Use the public API in ``pandas.testing`` documented at :ref:`api.general.testing` (:issue:`16232`). - ``pandas.SparseArray`` has been deprecated. Use ``pandas.arrays.SparseArray`` (:class:`arrays.SparseArray`) instead. (:issue:`30642`) -- The parameter ``is_copy`` of :meth:`DataFrame.take` has been deprecated and will be removed in a future version. (:issue:`27357`) +- The parameter ``is_copy`` of :meth:`Series.take` and :meth:`DataFrame.take` has been deprecated and will be removed in a future version. (:issue:`27357`) - Support for multi-dimensional indexing (e.g. ``index[:, None]``) on a :class:`Index` is deprecated and will be removed in a future version, convert to a numpy array before indexing instead (:issue:`30588`) - The ``pandas.np`` submodule is now deprecated. Import numpy directly instead (:issue:`30296`) - The ``pandas.datetime`` class is now deprecated. Import from ``datetime`` instead (:issue:`30610`) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index a5e134617b71a..f3a0cf3841b5b 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -2805,7 +2805,7 @@ def __getitem__(self, key): if getattr(indexer, "dtype", None) == bool: indexer = np.where(indexer)[0] - data = self.take(indexer, axis=1) + data = self._take_with_is_copy(indexer, axis=1) if is_single_key: # What does looking for a single key in a non-unique index return? @@ -2838,7 +2838,7 @@ def _getitem_bool_array(self, key): # be reindexed to match DataFrame rows key = check_bool_indexer(self.index, key) indexer = key.nonzero()[0] - return self.take(indexer, axis=0) + return self._take_with_is_copy(indexer, axis=0) def _getitem_multilevel(self, key): # self.columns is a MultiIndex diff --git a/pandas/core/generic.py b/pandas/core/generic.py index bfeaf8bca48e9..a2e348bf98e33 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -3211,8 +3211,11 @@ def take( axis : {0 or 'index', 1 or 'columns', None}, default 0 The axis on which to select elements. ``0`` means that we are selecting rows, ``1`` means that we are selecting columns. - is_copy : bool, default True - Whether to return a copy of the original object or not. + is_copy : bool + Before pandas 1.0, ``is_copy=False`` can be specified to ensure + that the return value is an actual copy. Starting with pandas 1.0, + ``take`` always returns a copy, and the keyword is therefore + deprecated. .. deprecated:: 1.0.0 **kwargs @@ -3276,12 +3279,10 @@ class max_speed if is_copy is not None: warnings.warn( "is_copy is deprecated and will be removed in a future version. " - "take will always return a copy in the future.", + "'take' always returns a copy, so there is no need to specify this.", FutureWarning, stacklevel=2, ) - else: - is_copy = True nv.validate_take(tuple(), kwargs) @@ -3290,13 +3291,22 @@ class max_speed new_data = self._data.take( indices, axis=self._get_block_manager_axis(axis), verify=True ) - result = self._constructor(new_data).__finalize__(self) + return self._constructor(new_data).__finalize__(self) - # Maybe set copy if we didn't actually change the index. - if is_copy: - if not result._get_axis(axis).equals(self._get_axis(axis)): - result._set_is_copy(self) + def _take_with_is_copy( + self: FrameOrSeries, indices, axis=0, **kwargs + ) -> FrameOrSeries: + """ + Internal version of the `take` method that sets the `_is_copy` + attribute to keep track of the parent dataframe (using in indexing + for the SettingWithCopyWarning). + See the docstring of `take` for full explanation of the parameters. + """ + result = self.take(indices=indices, axis=axis, **kwargs) + # Maybe set copy if we didn't actually change the index. + if not result._get_axis(axis).equals(self._get_axis(axis)): + result._set_is_copy(self) return result def xs(self, key, axis=0, level=None, drop_level: bool_t = True): @@ -3426,9 +3436,9 @@ class animal locomotion if isinstance(loc, np.ndarray): if loc.dtype == np.bool_: (inds,) = loc.nonzero() - return self.take(inds, axis=axis) + return self._take_with_is_copy(inds, axis=axis) else: - return self.take(loc, axis=axis) + return self._take_with_is_copy(loc, axis=axis) if not is_scalar(loc): new_index = self.index[loc] @@ -3485,7 +3495,7 @@ def _iget_item_cache(self, item): if ax.is_unique: lower = self._get_item_cache(ax[item]) else: - lower = self.take(item, axis=self._info_axis_number) + lower = self._take_with_is_copy(item, axis=self._info_axis_number) return lower def _box_item_values(self, key, values): @@ -7003,8 +7013,7 @@ def asof(self, where, subset=None): # mask the missing missing = locs == -1 - d = self.take(locs) - data = d.copy() + data = self.take(locs) data.index = where data.loc[missing] = np.nan return data if is_list else data.iloc[-1] @@ -7550,7 +7559,7 @@ def at_time( except AttributeError: raise TypeError("Index must be DatetimeIndex") - return self.take(indexer, axis=axis) + return self._take_with_is_copy(indexer, axis=axis) def between_time( self: FrameOrSeries, @@ -7632,7 +7641,7 @@ def between_time( except AttributeError: raise TypeError("Index must be DatetimeIndex") - return self.take(indexer, axis=axis) + return self._take_with_is_copy(indexer, axis=axis) def resample( self, diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index a8c96840ff17b..aa21aa452be95 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -685,7 +685,7 @@ def get_group(self, name, obj=None): if not len(inds): raise KeyError(name) - return obj.take(inds, axis=self.axis) + return obj._take_with_is_copy(inds, axis=self.axis) def __iter__(self): """ diff --git a/pandas/core/indexing.py b/pandas/core/indexing.py index 4550be791b1ec..f88ad0287440d 100755 --- a/pandas/core/indexing.py +++ b/pandas/core/indexing.py @@ -1509,7 +1509,7 @@ def _getitem_iterable(self, key, axis: int): # A boolean indexer key = check_bool_indexer(labels, key) (inds,) = key.nonzero() - return self.obj.take(inds, axis=axis) + return self.obj._take_with_is_copy(inds, axis=axis) else: # A collection of keys keyarr, indexer = self._get_listlike_indexer(key, axis, raise_missing=False) @@ -1694,7 +1694,7 @@ def _getbool_axis(self, key, axis: int): labels = self.obj._get_axis(axis) key = check_bool_indexer(labels, key) inds = key.nonzero()[0] - return self.obj.take(inds, axis=axis) + return self.obj._take_with_is_copy(inds, axis=axis) @Appender(IndexingMixin.loc.__doc__) @@ -2009,7 +2009,7 @@ def _get_list_axis(self, key, axis: int): `axis` can only be zero. """ try: - return self.obj.take(key, axis=axis) + return self.obj._take_with_is_copy(key, axis=axis) except IndexError: # re-raise with different error message raise IndexError("positional indexers are out-of-bounds") diff --git a/pandas/core/series.py b/pandas/core/series.py index e79404ccd9521..f061158a4d2e2 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -786,7 +786,14 @@ def axes(self) -> List[Index]: # Indexing Methods @Appender(generic.NDFrame.take.__doc__) - def take(self, indices, axis=0, is_copy=False, **kwargs) -> "Series": + def take(self, indices, axis=0, is_copy=None, **kwargs) -> "Series": + if is_copy is not None: + warnings.warn( + "is_copy is deprecated and will be removed in a future version. " + "'take' always returns a copy, so there is no need to specify this.", + FutureWarning, + stacklevel=2, + ) nv.validate_take(tuple(), kwargs) indices = ensure_platform_int(indices) @@ -801,16 +808,20 @@ def take(self, indices, axis=0, is_copy=False, **kwargs) -> "Series": kwargs = {} new_values = self._values.take(indices, **kwargs) - result = self._constructor( + return self._constructor( new_values, index=new_index, fastpath=True ).__finalize__(self) - # Maybe set copy if we didn't actually change the index. - if is_copy: - if not result._get_axis(axis).equals(self._get_axis(axis)): - result._set_is_copy(self) + def _take_with_is_copy(self, indices, axis=0, **kwargs): + """ + Internal version of the `take` method that sets the `_is_copy` + attribute to keep track of the parent dataframe (using in indexing + for the SettingWithCopyWarning). For Series this does the same + as the public take (it never sets `_is_copy`). - return result + See the docstring of `take` for full explanation of the parameters. + """ + return self.take(indices=indices, axis=axis, **kwargs) def _ixs(self, i: int, axis: int = 0): """ diff --git a/pandas/tests/frame/methods/test_asof.py b/pandas/tests/frame/methods/test_asof.py index 0291be0a4083e..e2b417972638e 100644 --- a/pandas/tests/frame/methods/test_asof.py +++ b/pandas/tests/frame/methods/test_asof.py @@ -143,3 +143,16 @@ def test_time_zone_aware_index(self, stamp, expected): result = df.asof(stamp) tm.assert_series_equal(result, expected) + + def test_is_copy(self, date_range_frame): + # GH-27357, GH-30784: ensure the result of asof is an actual copy and + # doesn't track the parent dataframe / doesn't give SettingWithCopy warnings + df = date_range_frame + N = 50 + df.loc[15:30, "A"] = np.nan + dates = date_range("1/1/1990", periods=N * 3, freq="25s") + + result = df.asof(dates) + + with tm.assert_produces_warning(None): + result["C"] = 1 diff --git a/pandas/tests/generic/test_generic.py b/pandas/tests/generic/test_generic.py index efb04c7f63c66..7645c6b4cf709 100644 --- a/pandas/tests/generic/test_generic.py +++ b/pandas/tests/generic/test_generic.py @@ -446,6 +446,15 @@ def test_sample_upsampling_without_replacement(self): with pytest.raises(ValueError, match=msg): df.sample(frac=2, replace=False) + def test_sample_is_copy(self): + # GH-27357, GH-30784: ensure the result of sample is an actual copy and + # doesn't track the parent dataframe / doesn't give SettingWithCopy warnings + df = pd.DataFrame(np.random.randn(10, 3), columns=["a", "b", "c"]) + df2 = df.sample(3) + + with tm.assert_produces_warning(None): + df2["d"] = 1 + def test_size_compat(self): # GH8846 # size property should be defined @@ -817,18 +826,23 @@ def test_take_invalid_kwargs(self): with pytest.raises(ValueError, match=msg): obj.take(indices, mode="clip") - def test_depr_take_kwarg_is_copy(self): + @pytest.mark.parametrize("is_copy", [True, False]) + def test_depr_take_kwarg_is_copy(self, is_copy): # GH 27357 df = DataFrame({"A": [1, 2, 3]}) msg = ( "is_copy is deprecated and will be removed in a future version. " - "take will always return a copy in the future." + "'take' always returns a copy, so there is no need to specify this." ) with tm.assert_produces_warning(FutureWarning) as w: - df.take([0, 1], is_copy=True) + df.take([0, 1], is_copy=is_copy) assert w[0].message.args[0] == msg + s = Series([1, 2, 3]) + with tm.assert_produces_warning(FutureWarning): + s.take([0, 1], is_copy=is_copy) + def test_equals(self): s1 = pd.Series([1, 2, 3], index=[0, 2, 1]) s2 = s1.copy()