From e532bc0de637917ea98b28c43ac62471778e0943 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Tue, 7 Jan 2020 16:40:43 +0100 Subject: [PATCH 1/9] API: DataFrame.take always returns a copy --- pandas/core/generic.py | 16 +++------------- 1 file changed, 3 insertions(+), 13 deletions(-) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 1edcf581c5131..f4f4b65d3a196 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -3356,12 +3356,10 @@ class max_speed if is_copy is not None: warnings.warn( "is_copy is deprecated and will be removed in a future version. " - "take will always return a copy in the future.", + "take always returns a copy, so there is no need to specify this.", FutureWarning, stacklevel=2, ) - else: - is_copy = True nv.validate_take(tuple(), kwargs) @@ -3370,14 +3368,7 @@ class max_speed new_data = self._data.take( indices, axis=self._get_block_manager_axis(axis), verify=True ) - result = self._constructor(new_data).__finalize__(self) - - # Maybe set copy if we didn't actually change the index. - if is_copy: - if not result._get_axis(axis).equals(self._get_axis(axis)): - result._set_is_copy(self) - - return result + return self._constructor(new_data).__finalize__(self) def xs(self, key, axis=0, level=None, drop_level: bool_t = True): """ @@ -7022,8 +7013,7 @@ def asof(self, where, subset=None): # mask the missing missing = locs == -1 - d = self.take(locs) - data = d.copy() + data = self.take(locs) data.index = where data.loc[missing] = np.nan return data if is_list else data.iloc[-1] From 8c1709136b3578aca788c624ef754187e2b0b766 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Thu, 23 Jan 2020 15:20:27 +0100 Subject: [PATCH 2/9] add private _take_with_is_copy to use in indexing code --- pandas/core/frame.py | 4 ++-- pandas/core/generic.py | 28 ++++++++++++++++++++++------ pandas/core/groupby/groupby.py | 2 +- pandas/core/indexes/datetimes.py | 2 +- pandas/core/indexing.py | 8 ++++---- 5 files changed, 30 insertions(+), 14 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 40a5ce25f4422..415e0847c8a30 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -2797,7 +2797,7 @@ def __getitem__(self, key): if getattr(indexer, "dtype", None) == bool: indexer = np.where(indexer)[0] - data = self.take(indexer, axis=1) + data = self._take_with_is_copy(indexer, axis=1) if is_single_key: # What does looking for a single key in a non-unique index return? @@ -2830,7 +2830,7 @@ def _getitem_bool_array(self, key): # be reindexed to match DataFrame rows key = check_bool_indexer(self.index, key) indexer = key.nonzero()[0] - return self.take(indexer, axis=0) + return self._take_with_is_copy(indexer, axis=0) def _getitem_multilevel(self, key): # self.columns is a MultiIndex diff --git a/pandas/core/generic.py b/pandas/core/generic.py index fc211ec911be6..bf8e89724b21d 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -3287,7 +3287,7 @@ class max_speed if is_copy is not None: warnings.warn( "is_copy is deprecated and will be removed in a future version. " - "take always returns a copy, so there is no need to specify this.", + "'take' always returns a copy, so there is no need to specify this.", FutureWarning, stacklevel=2, ) @@ -3301,6 +3301,22 @@ class max_speed ) return self._constructor(new_data).__finalize__(self) + def _take_with_is_copy( + self: FrameOrSeries, indices, axis=0, is_copy: bool_t = True, **kwargs + ) -> FrameOrSeries: + """ + Internal version of the `take` method that sets the `_is_copy` + attribute to keep track of the parent dataframe (using in indexing + for the SettingWithCopyWarning). + + See the docstring of `take` for full explanation of the parameters. + """ + result = self.take(indices=indices, axis=axis, **kwargs) + if is_copy: + if not result._get_axis(axis).equals(self._get_axis(axis)): + result._set_is_copy(self) + return result + def xs(self, key, axis=0, level=None, drop_level: bool_t = True): """ Return cross-section from the Series/DataFrame. @@ -3428,9 +3444,9 @@ class animal locomotion if isinstance(loc, np.ndarray): if loc.dtype == np.bool_: (inds,) = loc.nonzero() - return self.take(inds, axis=axis) + return self._take_with_is_copy(inds, axis=axis) else: - return self.take(loc, axis=axis) + return self._take_with_is_copy(loc, axis=axis) if not is_scalar(loc): new_index = self.index[loc] @@ -3487,7 +3503,7 @@ def _iget_item_cache(self, item): if ax.is_unique: lower = self._get_item_cache(ax[item]) else: - lower = self.take(item, axis=self._info_axis_number) + lower = self._take_with_is_copy(item, axis=self._info_axis_number) return lower def _box_item_values(self, key, values): @@ -7413,7 +7429,7 @@ def at_time( except AttributeError: raise TypeError("Index must be DatetimeIndex") - return self.take(indexer, axis=axis) + return self._take_with_is_copy(indexer, axis=axis) def between_time( self: FrameOrSeries, @@ -7495,7 +7511,7 @@ def between_time( except AttributeError: raise TypeError("Index must be DatetimeIndex") - return self.take(indexer, axis=axis) + return self._take_with_is_copy(indexer, axis=axis) def resample( self, diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index a8c96840ff17b..aa21aa452be95 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -685,7 +685,7 @@ def get_group(self, name, obj=None): if not len(inds): raise KeyError(name) - return obj.take(inds, axis=self.axis) + return obj._take_with_is_copy(inds, axis=self.axis) def __iter__(self): """ diff --git a/pandas/core/indexes/datetimes.py b/pandas/core/indexes/datetimes.py index ee9b948a76ac8..1771fc5d8c4b6 100644 --- a/pandas/core/indexes/datetimes.py +++ b/pandas/core/indexes/datetimes.py @@ -647,7 +647,7 @@ def get_value(self, series, key): if isinstance(key, time): locs = self.indexer_at_time(key) - return series.take(locs) + return series._take_with_set_copy(locs) if isinstance(key, str): try: diff --git a/pandas/core/indexing.py b/pandas/core/indexing.py index 63a86792082da..5cf4366f1eeed 100755 --- a/pandas/core/indexing.py +++ b/pandas/core/indexing.py @@ -1509,7 +1509,7 @@ def _getitem_iterable(self, key, axis: int): # A boolean indexer key = check_bool_indexer(labels, key) (inds,) = key.nonzero() - return self.obj.take(inds, axis=axis) + return self.obj._take_with_is_copy(inds, axis=axis) else: # A collection of keys keyarr, indexer = self._get_listlike_indexer(key, axis, raise_missing=False) @@ -1701,7 +1701,7 @@ def _getbool_axis(self, key, axis: int): labels = self.obj._get_axis(axis) key = check_bool_indexer(labels, key) inds = key.nonzero()[0] - return self.obj.take(inds, axis=axis) + return self.obj._take_with_is_copy(inds, axis=axis) def _get_slice_axis(self, slice_obj: slice, axis: int): """ @@ -1722,7 +1722,7 @@ def _get_slice_axis(self, slice_obj: slice, axis: int): else: # DatetimeIndex overrides Index.slice_indexer and may # return a DatetimeIndex instead of a slice object. - return self.obj.take(indexer, axis=axis) + return self.obj._take_with_is_copy(indexer, axis=axis) @Appender(IndexingMixin.loc.__doc__) @@ -2017,7 +2017,7 @@ def _get_list_axis(self, key, axis: int): `axis` can only be zero. """ try: - return self.obj.take(key, axis=axis) + return self.obj._take_with_is_copy(key, axis=axis) except IndexError: # re-raise with different error message raise IndexError("positional indexers are out-of-bounds") From d9e75f84717fa8f8a4af958ccb65b1819a754414 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Thu, 23 Jan 2020 15:31:17 +0100 Subject: [PATCH 3/9] also deprecate it for Series --- pandas/core/generic.py | 7 +++---- pandas/core/series.py | 25 ++++++++++++++++++------- pandas/tests/generic/test_generic.py | 11 ++++++++--- 3 files changed, 29 insertions(+), 14 deletions(-) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index bf8e89724b21d..a7d04f3680f91 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -3302,7 +3302,7 @@ class max_speed return self._constructor(new_data).__finalize__(self) def _take_with_is_copy( - self: FrameOrSeries, indices, axis=0, is_copy: bool_t = True, **kwargs + self: FrameOrSeries, indices, axis=0, **kwargs ) -> FrameOrSeries: """ Internal version of the `take` method that sets the `_is_copy` @@ -3312,9 +3312,8 @@ def _take_with_is_copy( See the docstring of `take` for full explanation of the parameters. """ result = self.take(indices=indices, axis=axis, **kwargs) - if is_copy: - if not result._get_axis(axis).equals(self._get_axis(axis)): - result._set_is_copy(self) + if not result._get_axis(axis).equals(self._get_axis(axis)): + result._set_is_copy(self) return result def xs(self, key, axis=0, level=None, drop_level: bool_t = True): diff --git a/pandas/core/series.py b/pandas/core/series.py index ffe0642f799fa..ef586e81f6da9 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -756,7 +756,14 @@ def axes(self) -> List[Index]: # Indexing Methods @Appender(generic.NDFrame.take.__doc__) - def take(self, indices, axis=0, is_copy=False, **kwargs) -> "Series": + def take(self, indices, axis=0, is_copy=None, **kwargs) -> "Series": + if is_copy is not None: + warnings.warn( + "is_copy is deprecated and will be removed in a future version. " + "'take' always returns a copy, so there is no need to specify this.", + FutureWarning, + stacklevel=2, + ) nv.validate_take(tuple(), kwargs) indices = ensure_platform_int(indices) @@ -771,16 +778,20 @@ def take(self, indices, axis=0, is_copy=False, **kwargs) -> "Series": kwargs = {} new_values = self._values.take(indices, **kwargs) - result = self._constructor( + return self._constructor( new_values, index=new_index, fastpath=True ).__finalize__(self) - # Maybe set copy if we didn't actually change the index. - if is_copy: - if not result._get_axis(axis).equals(self._get_axis(axis)): - result._set_is_copy(self) + def _take_with_is_copy(self, indices, axis=0, **kwargs): + """ + Internal version of the `take` method that sets the `_is_copy` + attribute to keep track of the parent dataframe (using in indexing + for the SettingWithCopyWarning). For Series this does the same + as the public take (it never sets `_is_copy`). - return result + See the docstring of `take` for full explanation of the parameters. + """ + return self.take(indices=indices, axis=axis, **kwargs) def _ixs(self, i: int, axis: int = 0): """ diff --git a/pandas/tests/generic/test_generic.py b/pandas/tests/generic/test_generic.py index efb04c7f63c66..760e7189b7af6 100644 --- a/pandas/tests/generic/test_generic.py +++ b/pandas/tests/generic/test_generic.py @@ -817,18 +817,23 @@ def test_take_invalid_kwargs(self): with pytest.raises(ValueError, match=msg): obj.take(indices, mode="clip") - def test_depr_take_kwarg_is_copy(self): + @pytest.mark.parametrize("is_copy", [True, False]) + def test_depr_take_kwarg_is_copy(self, is_copy): # GH 27357 df = DataFrame({"A": [1, 2, 3]}) msg = ( "is_copy is deprecated and will be removed in a future version. " - "take will always return a copy in the future." + "'take' always returns a copy, so there is no need to specify this." ) with tm.assert_produces_warning(FutureWarning) as w: - df.take([0, 1], is_copy=True) + df.take([0, 1], is_copy=is_copy) assert w[0].message.args[0] == msg + s = Series([1, 2, 3]) + with tm.assert_produces_warning(FutureWarning): + s.take([0, 1], is_copy=is_copy) + def test_equals(self): s1 = pd.Series([1, 2, 3], index=[0, 2, 1]) s2 = s1.copy() From e8e5cbba4ea60db964f712bdf5640dd3bf636288 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Thu, 23 Jan 2020 15:33:18 +0100 Subject: [PATCH 4/9] update whatsnew message --- doc/source/whatsnew/v1.0.0.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v1.0.0.rst b/doc/source/whatsnew/v1.0.0.rst index ec6ad38bbc7cf..843cfe6496996 100755 --- a/doc/source/whatsnew/v1.0.0.rst +++ b/doc/source/whatsnew/v1.0.0.rst @@ -723,7 +723,7 @@ Deprecations - The deprecated internal attributes ``_start``, ``_stop`` and ``_step`` of :class:`RangeIndex` now raise a ``FutureWarning`` instead of a ``DeprecationWarning`` (:issue:`26581`) - The ``pandas.util.testing`` module has been deprecated. Use the public API in ``pandas.testing`` documented at :ref:`api.general.testing` (:issue:`16232`). - ``pandas.SparseArray`` has been deprecated. Use ``pandas.arrays.SparseArray`` (:class:`arrays.SparseArray`) instead. (:issue:`30642`) -- The parameter ``is_copy`` of :meth:`DataFrame.take` has been deprecated and will be removed in a future version. (:issue:`27357`) +- The parameter ``is_copy`` of :meth:`DataFrame.Series` and :meth:`DataFrame.take` has been deprecated and will be removed in a future version. (:issue:`27357`) - Support for multi-dimensional indexing (e.g. ``index[:, None]``) on a :class:`Index` is deprecated and will be removed in a future version, convert to a numpy array before indexing instead (:issue:`30588`) - The ``pandas.np`` submodule is now deprecated. Import numpy directly instead (:issue:`30296`) - The ``pandas.datetime`` class is now deprecated. Import from ``datetime`` instead (:issue:`30610`) From 6c023d29f4d723cdfa5aeb8eb21bc803afa45d2d Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Thu, 23 Jan 2020 15:46:58 +0100 Subject: [PATCH 5/9] fixup whatsnew --- doc/source/whatsnew/v1.0.0.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v1.0.0.rst b/doc/source/whatsnew/v1.0.0.rst index 843cfe6496996..ecf5eb80aaa25 100755 --- a/doc/source/whatsnew/v1.0.0.rst +++ b/doc/source/whatsnew/v1.0.0.rst @@ -723,7 +723,7 @@ Deprecations - The deprecated internal attributes ``_start``, ``_stop`` and ``_step`` of :class:`RangeIndex` now raise a ``FutureWarning`` instead of a ``DeprecationWarning`` (:issue:`26581`) - The ``pandas.util.testing`` module has been deprecated. Use the public API in ``pandas.testing`` documented at :ref:`api.general.testing` (:issue:`16232`). - ``pandas.SparseArray`` has been deprecated. Use ``pandas.arrays.SparseArray`` (:class:`arrays.SparseArray`) instead. (:issue:`30642`) -- The parameter ``is_copy`` of :meth:`DataFrame.Series` and :meth:`DataFrame.take` has been deprecated and will be removed in a future version. (:issue:`27357`) +- The parameter ``is_copy`` of :meth:`Series.take` and :meth:`DataFrame.take` has been deprecated and will be removed in a future version. (:issue:`27357`) - Support for multi-dimensional indexing (e.g. ``index[:, None]``) on a :class:`Index` is deprecated and will be removed in a future version, convert to a numpy array before indexing instead (:issue:`30588`) - The ``pandas.np`` submodule is now deprecated. Import numpy directly instead (:issue:`30296`) - The ``pandas.datetime`` class is now deprecated. Import from ``datetime`` instead (:issue:`30610`) From d15639d88e4fd1539a76c9f49f036de29823e64d Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Thu, 23 Jan 2020 16:48:26 +0100 Subject: [PATCH 6/9] fix typo --- pandas/core/indexes/datetimes.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/indexes/datetimes.py b/pandas/core/indexes/datetimes.py index 1771fc5d8c4b6..68735ff0a14c2 100644 --- a/pandas/core/indexes/datetimes.py +++ b/pandas/core/indexes/datetimes.py @@ -647,7 +647,7 @@ def get_value(self, series, key): if isinstance(key, time): locs = self.indexer_at_time(key) - return series._take_with_set_copy(locs) + return series._take_with_is_copy(locs) if isinstance(key, str): try: From e519485f61235cb528e10c096fda6b9d2f794f21 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Fri, 24 Jan 2020 11:52:18 +0100 Subject: [PATCH 7/9] add tests for sample and asof --- pandas/tests/frame/methods/test_asof.py | 13 +++++++++++++ pandas/tests/generic/test_generic.py | 9 +++++++++ 2 files changed, 22 insertions(+) diff --git a/pandas/tests/frame/methods/test_asof.py b/pandas/tests/frame/methods/test_asof.py index 0291be0a4083e..e2b417972638e 100644 --- a/pandas/tests/frame/methods/test_asof.py +++ b/pandas/tests/frame/methods/test_asof.py @@ -143,3 +143,16 @@ def test_time_zone_aware_index(self, stamp, expected): result = df.asof(stamp) tm.assert_series_equal(result, expected) + + def test_is_copy(self, date_range_frame): + # GH-27357, GH-30784: ensure the result of asof is an actual copy and + # doesn't track the parent dataframe / doesn't give SettingWithCopy warnings + df = date_range_frame + N = 50 + df.loc[15:30, "A"] = np.nan + dates = date_range("1/1/1990", periods=N * 3, freq="25s") + + result = df.asof(dates) + + with tm.assert_produces_warning(None): + result["C"] = 1 diff --git a/pandas/tests/generic/test_generic.py b/pandas/tests/generic/test_generic.py index 760e7189b7af6..7645c6b4cf709 100644 --- a/pandas/tests/generic/test_generic.py +++ b/pandas/tests/generic/test_generic.py @@ -446,6 +446,15 @@ def test_sample_upsampling_without_replacement(self): with pytest.raises(ValueError, match=msg): df.sample(frac=2, replace=False) + def test_sample_is_copy(self): + # GH-27357, GH-30784: ensure the result of sample is an actual copy and + # doesn't track the parent dataframe / doesn't give SettingWithCopy warnings + df = pd.DataFrame(np.random.randn(10, 3), columns=["a", "b", "c"]) + df2 = df.sample(3) + + with tm.assert_produces_warning(None): + df2["d"] = 1 + def test_size_compat(self): # GH8846 # size property should be defined From f10bd49bee3952c7e6af28147aac6cf0665edc7d Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Sun, 26 Jan 2020 08:56:27 +0100 Subject: [PATCH 8/9] update docstring --- pandas/core/generic.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 0bb8c6670843b..0a8327bebfc01 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -3211,8 +3211,11 @@ def take( axis : {0 or 'index', 1 or 'columns', None}, default 0 The axis on which to select elements. ``0`` means that we are selecting rows, ``1`` means that we are selecting columns. - is_copy : bool, default True - Whether to return a copy of the original object or not. + is_copy : bool + Before pandas 1.0, ``is_copy=False`` can be specified to ensure + that the return value is an actual copy. Starting with pandas 1.0, + ``take`` always returns a copy, and the keyword is therefore + deprecated. .. deprecated:: 1.0.0 **kwargs From d3684f7d3c83e340fb9db6e81aa9769255ae14ce Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Sun, 26 Jan 2020 08:57:40 +0100 Subject: [PATCH 9/9] restore comment --- pandas/core/generic.py | 1 + 1 file changed, 1 insertion(+) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 0a8327bebfc01..a2e348bf98e33 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -3304,6 +3304,7 @@ def _take_with_is_copy( See the docstring of `take` for full explanation of the parameters. """ result = self.take(indices=indices, axis=axis, **kwargs) + # Maybe set copy if we didn't actually change the index. if not result._get_axis(axis).equals(self._get_axis(axis)): result._set_is_copy(self) return result