Skip to content

Commit

Permalink
API: DataFrame.take always returns a copy (pandas-dev#30784)
Browse files Browse the repository at this point in the history
  • Loading branch information
jorisvandenbossche committed Jan 27, 2020
1 parent 6b83628 commit 495abdf
Show file tree
Hide file tree
Showing 8 changed files with 81 additions and 34 deletions.
2 changes: 1 addition & 1 deletion doc/source/whatsnew/v1.0.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -753,7 +753,7 @@ Deprecations
- The deprecated internal attributes ``_start``, ``_stop`` and ``_step`` of :class:`RangeIndex` now raise a ``FutureWarning`` instead of a ``DeprecationWarning`` (:issue:`26581`)
- The ``pandas.util.testing`` module has been deprecated. Use the public API in ``pandas.testing`` documented at :ref:`api.general.testing` (:issue:`16232`).
- ``pandas.SparseArray`` has been deprecated. Use ``pandas.arrays.SparseArray`` (:class:`arrays.SparseArray`) instead. (:issue:`30642`)
- The parameter ``is_copy`` of :meth:`DataFrame.take` has been deprecated and will be removed in a future version. (:issue:`27357`)
- The parameter ``is_copy`` of :meth:`Series.take` and :meth:`DataFrame.take` has been deprecated and will be removed in a future version. (:issue:`27357`)
- Support for multi-dimensional indexing (e.g. ``index[:, None]``) on a :class:`Index` is deprecated and will be removed in a future version, convert to a numpy array before indexing instead (:issue:`30588`)
- The ``pandas.np`` submodule is now deprecated. Import numpy directly instead (:issue:`30296`)
- The ``pandas.datetime`` class is now deprecated. Import from ``datetime`` instead (:issue:`30610`)
Expand Down
4 changes: 2 additions & 2 deletions pandas/core/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -2809,7 +2809,7 @@ def __getitem__(self, key):
if getattr(indexer, "dtype", None) == bool:
indexer = np.where(indexer)[0]

data = self.take(indexer, axis=1)
data = self._take_with_is_copy(indexer, axis=1)

if is_single_key:
# What does looking for a single key in a non-unique index return?
Expand Down Expand Up @@ -2842,7 +2842,7 @@ def _getitem_bool_array(self, key):
# be reindexed to match DataFrame rows
key = check_bool_indexer(self.index, key)
indexer = key.nonzero()[0]
return self.take(indexer, axis=0)
return self._take_with_is_copy(indexer, axis=0)

def _getitem_multilevel(self, key):
# self.columns is a MultiIndex
Expand Down
43 changes: 26 additions & 17 deletions pandas/core/generic.py
Original file line number Diff line number Diff line change
Expand Up @@ -3313,8 +3313,11 @@ def take(
axis : {0 or 'index', 1 or 'columns', None}, default 0
The axis on which to select elements. ``0`` means that we are
selecting rows, ``1`` means that we are selecting columns.
is_copy : bool, default True
Whether to return a copy of the original object or not.
is_copy : bool
Before pandas 1.0, ``is_copy=False`` can be specified to ensure
that the return value is an actual copy. Starting with pandas 1.0,
``take`` always returns a copy, and the keyword is therefore
deprecated.
.. deprecated:: 1.0.0
**kwargs
Expand Down Expand Up @@ -3378,12 +3381,10 @@ class max_speed
if is_copy is not None:
warnings.warn(
"is_copy is deprecated and will be removed in a future version. "
"take will always return a copy in the future.",
"'take' always returns a copy, so there is no need to specify this.",
FutureWarning,
stacklevel=2,
)
else:
is_copy = True

nv.validate_take(tuple(), kwargs)

Expand All @@ -3392,13 +3393,22 @@ class max_speed
new_data = self._data.take(
indices, axis=self._get_block_manager_axis(axis), verify=True
)
result = self._constructor(new_data).__finalize__(self)
return self._constructor(new_data).__finalize__(self)

# Maybe set copy if we didn't actually change the index.
if is_copy:
if not result._get_axis(axis).equals(self._get_axis(axis)):
result._set_is_copy(self)
def _take_with_is_copy(
self: FrameOrSeries, indices, axis=0, **kwargs
) -> FrameOrSeries:
"""
Internal version of the `take` method that sets the `_is_copy`
attribute to keep track of the parent dataframe (using in indexing
for the SettingWithCopyWarning).
See the docstring of `take` for full explanation of the parameters.
"""
result = self.take(indices=indices, axis=axis, **kwargs)
# Maybe set copy if we didn't actually change the index.
if not result._get_axis(axis).equals(self._get_axis(axis)):
result._set_is_copy(self)
return result

def xs(self, key, axis=0, level=None, drop_level: bool_t = True):
Expand Down Expand Up @@ -3528,9 +3538,9 @@ class animal locomotion
if isinstance(loc, np.ndarray):
if loc.dtype == np.bool_:
(inds,) = loc.nonzero()
return self.take(inds, axis=axis)
return self._take_with_is_copy(inds, axis=axis)
else:
return self.take(loc, axis=axis)
return self._take_with_is_copy(loc, axis=axis)

if not is_scalar(loc):
new_index = self.index[loc]
Expand Down Expand Up @@ -3587,7 +3597,7 @@ def _iget_item_cache(self, item):
if ax.is_unique:
lower = self._get_item_cache(ax[item])
else:
lower = self.take(item, axis=self._info_axis_number)
lower = self._take_with_is_copy(item, axis=self._info_axis_number)
return lower

def _box_item_values(self, key, values):
Expand Down Expand Up @@ -7180,8 +7190,7 @@ def asof(self, where, subset=None):

# mask the missing
missing = locs == -1
d = self.take(locs)
data = d.copy()
data = self.take(locs)
data.index = where
data.loc[missing] = np.nan
return data if is_list else data.iloc[-1]
Expand Down Expand Up @@ -7727,7 +7736,7 @@ def at_time(
except AttributeError:
raise TypeError("Index must be DatetimeIndex")

return self.take(indexer, axis=axis)
return self._take_with_is_copy(indexer, axis=axis)

def between_time(
self: FrameOrSeries,
Expand Down Expand Up @@ -7809,7 +7818,7 @@ def between_time(
except AttributeError:
raise TypeError("Index must be DatetimeIndex")

return self.take(indexer, axis=axis)
return self._take_with_is_copy(indexer, axis=axis)

def resample(
self,
Expand Down
2 changes: 1 addition & 1 deletion pandas/core/groupby/groupby.py
Original file line number Diff line number Diff line change
Expand Up @@ -685,7 +685,7 @@ def get_group(self, name, obj=None):
if not len(inds):
raise KeyError(name)

return obj.take(inds, axis=self.axis)
return obj._take_with_is_copy(inds, axis=self.axis)

def __iter__(self):
"""
Expand Down
6 changes: 3 additions & 3 deletions pandas/core/indexing.py
Original file line number Diff line number Diff line change
Expand Up @@ -1588,7 +1588,7 @@ def _getitem_iterable(self, key, axis: int):
# A boolean indexer
key = check_bool_indexer(labels, key)
(inds,) = key.nonzero()
return self.obj.take(inds, axis=axis)
return self.obj._take_with_is_copy(inds, axis=axis)
else:
# A collection of keys
keyarr, indexer = self._get_listlike_indexer(key, axis, raise_missing=False)
Expand Down Expand Up @@ -1780,7 +1780,7 @@ def _getbool_axis(self, key, axis: int):
labels = self.obj._get_axis(axis)
key = check_bool_indexer(labels, key)
inds = key.nonzero()[0]
return self.obj.take(inds, axis=axis)
return self.obj._take_with_is_copy(inds, axis=axis)

def _get_slice_axis(self, slice_obj: slice, axis: int):
"""
Expand Down Expand Up @@ -2107,7 +2107,7 @@ def _get_list_axis(self, key, axis: int):
`axis` can only be zero.
"""
try:
return self.obj.take(key, axis=axis)
return self.obj._take_with_is_copy(key, axis=axis)
except IndexError:
# re-raise with different error message
raise IndexError("positional indexers are out-of-bounds")
Expand Down
25 changes: 18 additions & 7 deletions pandas/core/series.py
Original file line number Diff line number Diff line change
Expand Up @@ -804,7 +804,14 @@ def axes(self):
# Indexing Methods

@Appender(generic.NDFrame.take.__doc__)
def take(self, indices, axis=0, is_copy=False, **kwargs):
def take(self, indices, axis=0, is_copy=None, **kwargs) -> "Series":
if is_copy is not None:
warnings.warn(
"is_copy is deprecated and will be removed in a future version. "
"'take' always returns a copy, so there is no need to specify this.",
FutureWarning,
stacklevel=2,
)
nv.validate_take(tuple(), kwargs)

indices = ensure_platform_int(indices)
Expand All @@ -819,16 +826,20 @@ def take(self, indices, axis=0, is_copy=False, **kwargs):
kwargs = {}
new_values = self._values.take(indices, **kwargs)

result = self._constructor(
return self._constructor(
new_values, index=new_index, fastpath=True
).__finalize__(self)

# Maybe set copy if we didn't actually change the index.
if is_copy:
if not result._get_axis(axis).equals(self._get_axis(axis)):
result._set_is_copy(self)
def _take_with_is_copy(self, indices, axis=0, **kwargs):
"""
Internal version of the `take` method that sets the `_is_copy`
attribute to keep track of the parent dataframe (using in indexing
for the SettingWithCopyWarning). For Series this does the same
as the public take (it never sets `_is_copy`).
return result
See the docstring of `take` for full explanation of the parameters.
"""
return self.take(indices=indices, axis=axis, **kwargs)

def _ixs(self, i: int, axis: int = 0):
"""
Expand Down
13 changes: 13 additions & 0 deletions pandas/tests/frame/methods/test_asof.py
Original file line number Diff line number Diff line change
Expand Up @@ -143,3 +143,16 @@ def test_time_zone_aware_index(self, stamp, expected):

result = df.asof(stamp)
tm.assert_series_equal(result, expected)

def test_is_copy(self, date_range_frame):
# GH-27357, GH-30784: ensure the result of asof is an actual copy and
# doesn't track the parent dataframe / doesn't give SettingWithCopy warnings
df = date_range_frame
N = 50
df.loc[15:30, "A"] = np.nan
dates = date_range("1/1/1990", periods=N * 3, freq="25s")

result = df.asof(dates)

with tm.assert_produces_warning(None):
result["C"] = 1
20 changes: 17 additions & 3 deletions pandas/tests/generic/test_generic.py
Original file line number Diff line number Diff line change
Expand Up @@ -446,6 +446,15 @@ def test_sample_upsampling_without_replacement(self):
with pytest.raises(ValueError, match=msg):
df.sample(frac=2, replace=False)

def test_sample_is_copy(self):
# GH-27357, GH-30784: ensure the result of sample is an actual copy and
# doesn't track the parent dataframe / doesn't give SettingWithCopy warnings
df = pd.DataFrame(np.random.randn(10, 3), columns=["a", "b", "c"])
df2 = df.sample(3)

with tm.assert_produces_warning(None):
df2["d"] = 1

def test_size_compat(self):
# GH8846
# size property should be defined
Expand Down Expand Up @@ -820,18 +829,23 @@ def test_take_invalid_kwargs(self):
with pytest.raises(ValueError, match=msg):
obj.take(indices, mode="clip")

def test_depr_take_kwarg_is_copy(self):
@pytest.mark.parametrize("is_copy", [True, False])
def test_depr_take_kwarg_is_copy(self, is_copy):
# GH 27357
df = DataFrame({"A": [1, 2, 3]})
msg = (
"is_copy is deprecated and will be removed in a future version. "
"take will always return a copy in the future."
"'take' always returns a copy, so there is no need to specify this."
)
with tm.assert_produces_warning(FutureWarning) as w:
df.take([0, 1], is_copy=True)
df.take([0, 1], is_copy=is_copy)

assert w[0].message.args[0] == msg

s = Series([1, 2, 3])
with tm.assert_produces_warning(FutureWarning):
s.take([0, 1], is_copy=is_copy)

def test_equals(self):
s1 = pd.Series([1, 2, 3], index=[0, 2, 1])
s2 = s1.copy()
Expand Down

0 comments on commit 495abdf

Please sign in to comment.