Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

API: DataFrame.take always returns a copy #30784

Merged
merged 12 commits into from
Jan 27, 2020
2 changes: 1 addition & 1 deletion doc/source/whatsnew/v1.0.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -723,7 +723,7 @@ Deprecations
- The deprecated internal attributes ``_start``, ``_stop`` and ``_step`` of :class:`RangeIndex` now raise a ``FutureWarning`` instead of a ``DeprecationWarning`` (:issue:`26581`)
- The ``pandas.util.testing`` module has been deprecated. Use the public API in ``pandas.testing`` documented at :ref:`api.general.testing` (:issue:`16232`).
- ``pandas.SparseArray`` has been deprecated. Use ``pandas.arrays.SparseArray`` (:class:`arrays.SparseArray`) instead. (:issue:`30642`)
- The parameter ``is_copy`` of :meth:`DataFrame.take` has been deprecated and will be removed in a future version. (:issue:`27357`)
- The parameter ``is_copy`` of :meth:`Series.take` and :meth:`DataFrame.take` has been deprecated and will be removed in a future version. (:issue:`27357`)
- Support for multi-dimensional indexing (e.g. ``index[:, None]``) on a :class:`Index` is deprecated and will be removed in a future version, convert to a numpy array before indexing instead (:issue:`30588`)
- The ``pandas.np`` submodule is now deprecated. Import numpy directly instead (:issue:`30296`)
- The ``pandas.datetime`` class is now deprecated. Import from ``datetime`` instead (:issue:`30610`)
Expand Down
4 changes: 2 additions & 2 deletions pandas/core/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -2797,7 +2797,7 @@ def __getitem__(self, key):
if getattr(indexer, "dtype", None) == bool:
indexer = np.where(indexer)[0]

data = self.take(indexer, axis=1)
data = self._take_with_is_copy(indexer, axis=1)

if is_single_key:
# What does looking for a single key in a non-unique index return?
Expand Down Expand Up @@ -2830,7 +2830,7 @@ def _getitem_bool_array(self, key):
# be reindexed to match DataFrame rows
key = check_bool_indexer(self.index, key)
indexer = key.nonzero()[0]
return self.take(indexer, axis=0)
return self._take_with_is_copy(indexer, axis=0)

def _getitem_multilevel(self, key):
# self.columns is a MultiIndex
Expand Down
35 changes: 20 additions & 15 deletions pandas/core/generic.py
Original file line number Diff line number Diff line change
Expand Up @@ -3287,12 +3287,10 @@ class max_speed
if is_copy is not None:
warnings.warn(
"is_copy is deprecated and will be removed in a future version. "
"take will always return a copy in the future.",
"'take' always returns a copy, so there is no need to specify this.",
FutureWarning,
stacklevel=2,
)
else:
is_copy = True

nv.validate_take(tuple(), kwargs)

Expand All @@ -3301,13 +3299,21 @@ class max_speed
new_data = self._data.take(
indices, axis=self._get_block_manager_axis(axis), verify=True
)
result = self._constructor(new_data).__finalize__(self)
return self._constructor(new_data).__finalize__(self)

# Maybe set copy if we didn't actually change the index.
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

not sure if its relevant, but DTI.take sometimes returns a view

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

In the case the indices can represent a slice, right?
Personally I don't think we should make that optimization, as IMO it is much clearer for take to always be a copy, but I suppose for Index which is supposed to be immutable it might matter less.

if is_copy:
if not result._get_axis(axis).equals(self._get_axis(axis)):
result._set_is_copy(self)
def _take_with_is_copy(
self: FrameOrSeries, indices, axis=0, **kwargs
) -> FrameOrSeries:
"""
Internal version of the `take` method that sets the `_is_copy`
attribute to keep track of the parent dataframe (using in indexing
for the SettingWithCopyWarning).

See the docstring of `take` for full explanation of the parameters.
"""
result = self.take(indices=indices, axis=axis, **kwargs)
if not result._get_axis(axis).equals(self._get_axis(axis)):
result._set_is_copy(self)
return result

def xs(self, key, axis=0, level=None, drop_level: bool_t = True):
Expand Down Expand Up @@ -3437,9 +3443,9 @@ class animal locomotion
if isinstance(loc, np.ndarray):
if loc.dtype == np.bool_:
(inds,) = loc.nonzero()
return self.take(inds, axis=axis)
return self._take_with_is_copy(inds, axis=axis)
else:
return self.take(loc, axis=axis)
return self._take_with_is_copy(loc, axis=axis)

if not is_scalar(loc):
new_index = self.index[loc]
Expand Down Expand Up @@ -3496,7 +3502,7 @@ def _iget_item_cache(self, item):
if ax.is_unique:
lower = self._get_item_cache(ax[item])
else:
lower = self.take(item, axis=self._info_axis_number)
lower = self._take_with_is_copy(item, axis=self._info_axis_number)
return lower

def _box_item_values(self, key, values):
Expand Down Expand Up @@ -6876,8 +6882,7 @@ def asof(self, where, subset=None):

# mask the missing
missing = locs == -1
d = self.take(locs)
data = d.copy()
data = self.take(locs)
data.index = where
data.loc[missing] = np.nan
return data if is_list else data.iloc[-1]
Expand Down Expand Up @@ -7423,7 +7428,7 @@ def at_time(
except AttributeError:
raise TypeError("Index must be DatetimeIndex")

return self.take(indexer, axis=axis)
return self._take_with_is_copy(indexer, axis=axis)

def between_time(
self: FrameOrSeries,
Expand Down Expand Up @@ -7505,7 +7510,7 @@ def between_time(
except AttributeError:
raise TypeError("Index must be DatetimeIndex")

return self.take(indexer, axis=axis)
return self._take_with_is_copy(indexer, axis=axis)

def resample(
self,
Expand Down
2 changes: 1 addition & 1 deletion pandas/core/groupby/groupby.py
Original file line number Diff line number Diff line change
Expand Up @@ -685,7 +685,7 @@ def get_group(self, name, obj=None):
if not len(inds):
raise KeyError(name)

return obj.take(inds, axis=self.axis)
return obj._take_with_is_copy(inds, axis=self.axis)

def __iter__(self):
"""
Expand Down
2 changes: 1 addition & 1 deletion pandas/core/indexes/datetimes.py
Original file line number Diff line number Diff line change
Expand Up @@ -647,7 +647,7 @@ def get_value(self, series, key):

if isinstance(key, time):
locs = self.indexer_at_time(key)
return series.take(locs)
return series._take_with_is_copy(locs)

if isinstance(key, str):
try:
Expand Down
8 changes: 4 additions & 4 deletions pandas/core/indexing.py
Original file line number Diff line number Diff line change
Expand Up @@ -1509,7 +1509,7 @@ def _getitem_iterable(self, key, axis: int):
# A boolean indexer
key = check_bool_indexer(labels, key)
(inds,) = key.nonzero()
return self.obj.take(inds, axis=axis)
return self.obj._take_with_is_copy(inds, axis=axis)
else:
# A collection of keys
keyarr, indexer = self._get_listlike_indexer(key, axis, raise_missing=False)
Expand Down Expand Up @@ -1701,7 +1701,7 @@ def _getbool_axis(self, key, axis: int):
labels = self.obj._get_axis(axis)
key = check_bool_indexer(labels, key)
inds = key.nonzero()[0]
return self.obj.take(inds, axis=axis)
return self.obj._take_with_is_copy(inds, axis=axis)

def _get_slice_axis(self, slice_obj: slice, axis: int):
"""
Expand All @@ -1722,7 +1722,7 @@ def _get_slice_axis(self, slice_obj: slice, axis: int):
else:
# DatetimeIndex overrides Index.slice_indexer and may
# return a DatetimeIndex instead of a slice object.
return self.obj.take(indexer, axis=axis)
return self.obj._take_with_is_copy(indexer, axis=axis)


@Appender(IndexingMixin.loc.__doc__)
Expand Down Expand Up @@ -2017,7 +2017,7 @@ def _get_list_axis(self, key, axis: int):
`axis` can only be zero.
"""
try:
return self.obj.take(key, axis=axis)
return self.obj._take_with_is_copy(key, axis=axis)
except IndexError:
# re-raise with different error message
raise IndexError("positional indexers are out-of-bounds")
Expand Down
25 changes: 18 additions & 7 deletions pandas/core/series.py
Original file line number Diff line number Diff line change
Expand Up @@ -756,7 +756,14 @@ def axes(self) -> List[Index]:
# Indexing Methods

@Appender(generic.NDFrame.take.__doc__)
def take(self, indices, axis=0, is_copy=False, **kwargs) -> "Series":
def take(self, indices, axis=0, is_copy=None, **kwargs) -> "Series":
if is_copy is not None:
warnings.warn(
"is_copy is deprecated and will be removed in a future version. "
"'take' always returns a copy, so there is no need to specify this.",
FutureWarning,
stacklevel=2,
)
nv.validate_take(tuple(), kwargs)

indices = ensure_platform_int(indices)
Expand All @@ -771,16 +778,20 @@ def take(self, indices, axis=0, is_copy=False, **kwargs) -> "Series":
kwargs = {}
new_values = self._values.take(indices, **kwargs)

result = self._constructor(
return self._constructor(
new_values, index=new_index, fastpath=True
).__finalize__(self)

# Maybe set copy if we didn't actually change the index.
if is_copy:
if not result._get_axis(axis).equals(self._get_axis(axis)):
result._set_is_copy(self)
def _take_with_is_copy(self, indices, axis=0, **kwargs):
"""
Internal version of the `take` method that sets the `_is_copy`
attribute to keep track of the parent dataframe (using in indexing
for the SettingWithCopyWarning). For Series this does the same
as the public take (it never sets `_is_copy`).

return result
See the docstring of `take` for full explanation of the parameters.
"""
return self.take(indices=indices, axis=axis, **kwargs)
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I am including a Series._take_with_is_copy to override the generic one. This is because the defaults for the old is_copy kwargs were different for Series (False) and DataFrame (True).

I am not fully sure if the code paths that use _take_with_is_copy actually can have a Series calling it, but included this to be sure.


def _ixs(self, i: int, axis: int = 0):
"""
Expand Down
11 changes: 8 additions & 3 deletions pandas/tests/generic/test_generic.py
Original file line number Diff line number Diff line change
Expand Up @@ -817,18 +817,23 @@ def test_take_invalid_kwargs(self):
with pytest.raises(ValueError, match=msg):
obj.take(indices, mode="clip")

def test_depr_take_kwarg_is_copy(self):
@pytest.mark.parametrize("is_copy", [True, False])
def test_depr_take_kwarg_is_copy(self, is_copy):
# GH 27357
df = DataFrame({"A": [1, 2, 3]})
msg = (
"is_copy is deprecated and will be removed in a future version. "
"take will always return a copy in the future."
"'take' always returns a copy, so there is no need to specify this."
)
with tm.assert_produces_warning(FutureWarning) as w:
df.take([0, 1], is_copy=True)
df.take([0, 1], is_copy=is_copy)

assert w[0].message.args[0] == msg

s = Series([1, 2, 3])
with tm.assert_produces_warning(FutureWarning):
s.take([0, 1], is_copy=is_copy)

def test_equals(self):
s1 = pd.Series([1, 2, 3], index=[0, 2, 1])
s2 = s1.copy()
Expand Down