Skip to content

Commit

Permalink
BUG: DataFrame.__setitem__ sometimes operating inplace (#43406)
Browse files Browse the repository at this point in the history
  • Loading branch information
jbrockmendel authored Nov 1, 2021
1 parent 33fffdb commit 03dd698
Show file tree
Hide file tree
Showing 14 changed files with 115 additions and 63 deletions.
1 change: 1 addition & 0 deletions doc/source/whatsnew/v1.4.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -531,6 +531,7 @@ Indexing
- Bug in :meth:`DataFrame.query` where method calls in query strings led to errors when the ``numexpr`` package was installed. (:issue:`22435`)
- Bug in :meth:`DataFrame.nlargest` and :meth:`Series.nlargest` where sorted result did not count indexes containing ``np.nan`` (:issue:`28984`)
- Bug in indexing on a non-unique object-dtype :class:`Index` with an NA scalar (e.g. ``np.nan``) (:issue:`43711`)
- Bug in :meth:`DataFrame.__setitem__` incorrectly writing into an existing column's array rather than setting a new array when the new dtype and the old dtype match (:issue:`43406`)
- Bug in :meth:`Series.__setitem__` with object dtype when setting an array with matching size and dtype='datetime64[ns]' or dtype='timedelta64[ns]' incorrectly converting the datetime/timedeltas to integers (:issue:`43868`)
- Bug in :meth:`DataFrame.sort_index` where ``ignore_index=True`` was not being respected when the index was already sorted (:issue:`43591`)
- Bug in :meth:`Index.get_indexer_non_unique` when index contains multiple ``np.datetime64("NaT")`` and ``np.timedelta64("NaT")`` (:issue:`43869`)
Expand Down
14 changes: 8 additions & 6 deletions pandas/core/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -3840,9 +3840,11 @@ def _set_item_frame_value(self, key, value: DataFrame) -> None:
arraylike = _reindex_for_setitem(value, self.index)
self._set_item_mgr(key, arraylike)

def _iset_item_mgr(self, loc: int | slice | np.ndarray, value) -> None:
def _iset_item_mgr(
self, loc: int | slice | np.ndarray, value, inplace: bool = False
) -> None:
# when called from _set_item_mgr loc can be anything returned from get_loc
self._mgr.iset(loc, value)
self._mgr.iset(loc, value, inplace=inplace)
self._clear_item_cache()

def _set_item_mgr(self, key, value: ArrayLike) -> None:
Expand All @@ -3860,9 +3862,9 @@ def _set_item_mgr(self, key, value: ArrayLike) -> None:
if len(self):
self._check_setitem_copy()

def _iset_item(self, loc: int, value) -> None:
def _iset_item(self, loc: int, value, inplace: bool = False) -> None:
arraylike = self._sanitize_column(value)
self._iset_item_mgr(loc, arraylike)
self._iset_item_mgr(loc, arraylike, inplace=inplace)

# check if we are modifying a copy
# try to set first as we want an invalid
Expand Down Expand Up @@ -3994,13 +3996,13 @@ def _reset_cacher(self) -> None:
# no-op for DataFrame
pass

def _maybe_cache_changed(self, item, value: Series) -> None:
def _maybe_cache_changed(self, item, value: Series, inplace: bool) -> None:
"""
The object has called back to us saying maybe it has changed.
"""
loc = self._info_axis.get_loc(item)
arraylike = value._values
self._mgr.iset(loc, arraylike)
self._mgr.iset(loc, arraylike, inplace=inplace)

# ----------------------------------------------------------------------
# Unsorted
Expand Down
5 changes: 4 additions & 1 deletion pandas/core/generic.py
Original file line number Diff line number Diff line change
Expand Up @@ -3531,7 +3531,10 @@ def _reset_cacher(self) -> None:
raise AbstractMethodError(self)

def _maybe_update_cacher(
self, clear: bool_t = False, verify_is_copy: bool_t = True
self,
clear: bool_t = False,
verify_is_copy: bool_t = True,
inplace: bool_t = False,
) -> None:
"""
See if we need to update our parent cacher if clear, then clear our
Expand Down
13 changes: 7 additions & 6 deletions pandas/core/indexing.py
Original file line number Diff line number Diff line change
Expand Up @@ -1862,10 +1862,10 @@ def _setitem_single_column(self, loc: int, value, plane_indexer):
# set the item, possibly having a dtype change
ser = ser.copy()
ser._mgr = ser._mgr.setitem(indexer=(pi,), value=value)
ser._maybe_update_cacher(clear=True)
ser._maybe_update_cacher(clear=True, inplace=True)

# reset the sliced object if unique
self.obj._iset_item(loc, ser)
self.obj._iset_item(loc, ser, inplace=True)

def _setitem_single_block(self, indexer, value, name: str):
"""
Expand All @@ -1890,9 +1890,10 @@ def _setitem_single_block(self, indexer, value, name: str):
if i != info_axis
)
):
selected_item_labels = item_labels[indexer[info_axis]]
if len(item_labels.get_indexer_for([selected_item_labels])) == 1:
self.obj[selected_item_labels] = value
col = item_labels[indexer[info_axis]]
if len(item_labels.get_indexer_for([col])) == 1:
loc = item_labels.get_loc(col)
self.obj._iset_item(loc, value, inplace=True)
return

indexer = maybe_convert_ix(*indexer)
Expand All @@ -1910,7 +1911,7 @@ def _setitem_single_block(self, indexer, value, name: str):

# actually do the set
self.obj._mgr = self.obj._mgr.setitem(indexer=indexer, value=value)
self.obj._maybe_update_cacher(clear=True)
self.obj._maybe_update_cacher(clear=True, inplace=True)

def _setitem_with_indexer_missing(self, indexer, value):
"""
Expand Down
6 changes: 5 additions & 1 deletion pandas/core/internals/array_manager.py
Original file line number Diff line number Diff line change
Expand Up @@ -792,7 +792,9 @@ def column_arrays(self) -> list[ArrayLike]:
"""
return self.arrays

def iset(self, loc: int | slice | np.ndarray, value: ArrayLike):
def iset(
self, loc: int | slice | np.ndarray, value: ArrayLike, inplace: bool = False
):
"""
Set new column(s).
Expand All @@ -804,6 +806,8 @@ def iset(self, loc: int | slice | np.ndarray, value: ArrayLike):
loc : integer, slice or boolean mask
Positional location (already bounds checked)
value : np.ndarray or ExtensionArray
inplace : bool, default False
Whether overwrite existing array as opposed to replacing it.
"""
# single column -> single integer index
if lib.is_integer(loc):
Expand Down
6 changes: 4 additions & 2 deletions pandas/core/internals/managers.py
Original file line number Diff line number Diff line change
Expand Up @@ -1032,7 +1032,9 @@ def column_arrays(self) -> list[np.ndarray]:
# expected "List[ndarray[Any, Any]]")
return result # type: ignore[return-value]

def iset(self, loc: int | slice | np.ndarray, value: ArrayLike):
def iset(
self, loc: int | slice | np.ndarray, value: ArrayLike, inplace: bool = False
):
"""
Set new item in-place. Does not consolidate. Adds new Block if not
contained in the current set of items
Expand Down Expand Up @@ -1087,7 +1089,7 @@ def value_getitem(placement):
for blkno, val_locs in libinternals.get_blkno_placements(blknos, group=True):
blk = self.blocks[blkno]
blk_locs = blklocs[val_locs.indexer]
if blk.should_store(value):
if inplace and blk.should_store(value):
blk.set_inplace(blk_locs, value_getitem(val_locs))
else:
unfit_mgr_locs.append(blk.mgr_locs.as_array[blk_locs])
Expand Down
8 changes: 5 additions & 3 deletions pandas/core/series.py
Original file line number Diff line number Diff line change
Expand Up @@ -1226,7 +1226,7 @@ def _check_is_chained_assignment_possible(self) -> bool:
return super()._check_is_chained_assignment_possible()

def _maybe_update_cacher(
self, clear: bool = False, verify_is_copy: bool = True
self, clear: bool = False, verify_is_copy: bool = True, inplace: bool = False
) -> None:
"""
See NDFrame._maybe_update_cacher.__doc__
Expand All @@ -1244,13 +1244,15 @@ def _maybe_update_cacher(
# GH#42530 self.name must be in ref.columns
# to ensure column still in dataframe
# otherwise, either self or ref has swapped in new arrays
ref._maybe_cache_changed(cacher[0], self)
ref._maybe_cache_changed(cacher[0], self, inplace=inplace)
else:
# GH#33675 we have swapped in a new array, so parent
# reference to self is now invalid
ref._item_cache.pop(cacher[0], None)

super()._maybe_update_cacher(clear=clear, verify_is_copy=verify_is_copy)
super()._maybe_update_cacher(
clear=clear, verify_is_copy=verify_is_copy, inplace=inplace
)

# ----------------------------------------------------------------------
# Unsorted
Expand Down
14 changes: 11 additions & 3 deletions pandas/tests/frame/indexing/test_indexing.py
Original file line number Diff line number Diff line change
Expand Up @@ -537,16 +537,19 @@ def test_getitem_setitem_integer_slice_keyerrors(self):

@td.skip_array_manager_invalid_test # already covered in test_iloc_col_slice_view
def test_fancy_getitem_slice_mixed(self, float_frame, float_string_frame):

sliced = float_string_frame.iloc[:, -3:]
assert sliced["D"].dtype == np.float64

# get view with single block
# setting it triggers setting with copy
sliced = float_frame.iloc[:, -3:]

assert np.shares_memory(sliced["C"]._values, float_frame["C"]._values)

msg = r"\nA value is trying to be set on a copy of a slice from a DataFrame"
with pytest.raises(com.SettingWithCopyError, match=msg):
sliced["C"] = 4.0
sliced.loc[:, "C"] = 4.0

assert (float_frame["C"] == 4).all()

Expand Down Expand Up @@ -1002,9 +1005,11 @@ def test_iloc_row_slice_view(self, using_array_manager):
# setting it makes it raise/warn
subset = df.iloc[slice(4, 8)]

assert np.shares_memory(df[2], subset[2])

msg = r"\nA value is trying to be set on a copy of a slice from a DataFrame"
with pytest.raises(com.SettingWithCopyError, match=msg):
subset[2] = 0.0
subset.loc[:, 2] = 0.0

exp_col = original[2].copy()
# TODO(ArrayManager) verify it is expected that the original didn't change
Expand Down Expand Up @@ -1041,10 +1046,13 @@ def test_iloc_col_slice_view(self, using_array_manager):

if not using_array_manager:
# verify slice is view

assert np.shares_memory(df[8]._values, subset[8]._values)

# and that we are setting a copy
msg = r"\nA value is trying to be set on a copy of a slice from a DataFrame"
with pytest.raises(com.SettingWithCopyError, match=msg):
subset[8] = 0.0
subset.loc[:, 8] = 0.0

assert (df[8] == 0).all()
else:
Expand Down
37 changes: 31 additions & 6 deletions pandas/tests/frame/indexing/test_setitem.py
Original file line number Diff line number Diff line change
Expand Up @@ -1028,12 +1028,6 @@ def test_setitem_duplicate_columns_not_inplace(self):
)
def test_setitem_same_dtype_not_inplace(self, value, using_array_manager, request):
# GH#39510
if not using_array_manager:
mark = pytest.mark.xfail(
reason="Setitem with same dtype still changing inplace"
)
request.node.add_marker(mark)

cols = ["A", "B"]
df = DataFrame(0, index=[0, 1], columns=cols)
df_copy = df.copy()
Expand All @@ -1056,3 +1050,34 @@ def test_setitem_listlike_key_scalar_value_not_inplace(self, value):
expected = DataFrame([[0, 1.0], [0, 1.0]], columns=cols)
tm.assert_frame_equal(df_view, df_copy)
tm.assert_frame_equal(df, expected)

@pytest.mark.parametrize(
"indexer",
[
"a",
["a"],
pytest.param(
[True, False],
marks=pytest.mark.xfail(
reason="Boolean indexer incorrectly setting inplace",
strict=False, # passing on some builds, no obvious pattern
),
),
],
)
@pytest.mark.parametrize(
"value, set_value",
[
(1, 5),
(1.0, 5.0),
(Timestamp("2020-12-31"), Timestamp("2021-12-31")),
("a", "b"),
],
)
def test_setitem_not_operating_inplace(self, value, set_value, indexer):
# GH#43406
df = DataFrame({"a": value}, index=[0, 1])
expected = df.copy()
view = df[:]
df[indexer] = set_value
tm.assert_frame_equal(view, expected)
5 changes: 4 additions & 1 deletion pandas/tests/frame/methods/test_rename.py
Original file line number Diff line number Diff line change
Expand Up @@ -173,7 +173,10 @@ def test_rename_multiindex(self):
@td.skip_array_manager_not_yet_implemented # TODO(ArrayManager) setitem copy/view
def test_rename_nocopy(self, float_frame):
renamed = float_frame.rename(columns={"C": "foo"}, copy=False)
renamed["foo"] = 1.0

assert np.shares_memory(renamed["foo"]._values, float_frame["C"]._values)

renamed.loc[:, "foo"] = 1.0
assert (float_frame["C"] == 1.0).all()

def test_rename_inplace(self, float_frame):
Expand Down
20 changes: 12 additions & 8 deletions pandas/tests/indexing/test_iloc.py
Original file line number Diff line number Diff line change
Expand Up @@ -824,20 +824,24 @@ def test_iloc_empty_list_indexer_is_ok(self):
df.iloc[[]], df.iloc[:0, :], check_index_type=True, check_column_type=True
)

def test_identity_slice_returns_new_object(self, using_array_manager):
def test_identity_slice_returns_new_object(self, using_array_manager, request):
# GH13873
if using_array_manager:
mark = pytest.mark.xfail(
reason="setting with .loc[:, 'a'] does not alter inplace"
)
request.node.add_marker(mark)

original_df = DataFrame({"a": [1, 2, 3]})
sliced_df = original_df.iloc[:]
assert sliced_df is not original_df

# should be a shallow copy
original_df["a"] = [4, 4, 4]
if using_array_manager:
# TODO(ArrayManager) verify it is expected that the original didn't change
# setitem is replacing full column, so doesn't update "viewing" dataframe
assert not (sliced_df["a"] == 4).all()
else:
assert (sliced_df["a"] == 4).all()
assert np.shares_memory(original_df["a"], sliced_df["a"])

# Setting using .loc[:, "a"] sets inplace so alters both sliced and orig
original_df.loc[:, "a"] = [4, 4, 4]
assert (sliced_df["a"] == 4).all()

original_series = Series([1, 2, 3, 4, 5, 6])
sliced_series = original_series.iloc[:]
Expand Down
20 changes: 12 additions & 8 deletions pandas/tests/indexing/test_loc.py
Original file line number Diff line number Diff line change
Expand Up @@ -1002,21 +1002,25 @@ def test_loc_empty_list_indexer_is_ok(self):
df.loc[[]], df.iloc[:0, :], check_index_type=True, check_column_type=True
)

def test_identity_slice_returns_new_object(self, using_array_manager):
def test_identity_slice_returns_new_object(self, using_array_manager, request):
# GH13873
if using_array_manager:
mark = pytest.mark.xfail(
reason="setting with .loc[:, 'a'] does not alter inplace"
)
request.node.add_marker(mark)

original_df = DataFrame({"a": [1, 2, 3]})
sliced_df = original_df.loc[:]
assert sliced_df is not original_df
assert original_df[:] is not original_df

# should be a shallow copy
original_df["a"] = [4, 4, 4]
if using_array_manager:
# TODO(ArrayManager) verify it is expected that the original didn't change
# setitem is replacing full column, so doesn't update "viewing" dataframe
assert not (sliced_df["a"] == 4).all()
else:
assert (sliced_df["a"] == 4).all()
assert np.shares_memory(original_df["a"]._values, sliced_df["a"]._values)

# Setting using .loc[:, "a"] sets inplace so alters both sliced and orig
original_df.loc[:, "a"] = [4, 4, 4]
assert (sliced_df["a"] == 4).all()

# These should not return copies
assert original_df is original_df.loc[:, :]
Expand Down
12 changes: 9 additions & 3 deletions pandas/tests/internals/test_internals.py
Original file line number Diff line number Diff line change
Expand Up @@ -750,7 +750,11 @@ def test_get_numeric_data(self):
)

# Check sharing
numeric.iset(numeric.items.get_loc("float"), np.array([100.0, 200.0, 300.0]))
numeric.iset(
numeric.items.get_loc("float"),
np.array([100.0, 200.0, 300.0]),
inplace=True,
)
tm.assert_almost_equal(
mgr.iget(mgr.items.get_loc("float")).internal_values(),
np.array([100.0, 200.0, 300.0]),
Expand All @@ -759,7 +763,9 @@ def test_get_numeric_data(self):
numeric2 = mgr.get_numeric_data(copy=True)
tm.assert_index_equal(numeric.items, Index(["int", "float", "complex", "bool"]))
numeric2.iset(
numeric2.items.get_loc("float"), np.array([1000.0, 2000.0, 3000.0])
numeric2.items.get_loc("float"),
np.array([1000.0, 2000.0, 3000.0]),
inplace=True,
)
tm.assert_almost_equal(
mgr.iget(mgr.items.get_loc("float")).internal_values(),
Expand All @@ -781,7 +787,7 @@ def test_get_bool_data(self):
bools.iget(bools.items.get_loc("bool")).internal_values(),
)

bools.iset(0, np.array([True, False, True]))
bools.iset(0, np.array([True, False, True]), inplace=True)
tm.assert_numpy_array_equal(
mgr.iget(mgr.items.get_loc("bool")).internal_values(),
np.array([True, False, True]),
Expand Down
17 changes: 2 additions & 15 deletions pandas/tests/reshape/merge/test_merge.py
Original file line number Diff line number Diff line change
Expand Up @@ -308,21 +308,8 @@ def test_merge_nocopy(self, using_array_manager):

merged = merge(left, right, left_index=True, right_index=True, copy=False)

if using_array_manager:
# With ArrayManager, setting a column doesn't change the values inplace
# and thus does not propagate the changes to the original left/right
# dataframes -> need to check that no copy was made in a different way
# TODO(ArrayManager) we should be able to simplify this with a .loc
# setitem test: merged.loc[0, "a"] = 10; assert left.loc[0, "a"] == 10
# but this currently replaces the array (_setitem_with_indexer_split_path)
assert merged._mgr.arrays[0] is left._mgr.arrays[0]
assert merged._mgr.arrays[2] is right._mgr.arrays[0]
else:
merged["a"] = 6
assert (left["a"] == 6).all()

merged["d"] = "peekaboo"
assert (right["d"] == "peekaboo").all()
assert np.shares_memory(merged["a"]._values, left["a"]._values)
assert np.shares_memory(merged["d"]._values, right["d"]._values)

def test_intelligently_handle_join_key(self):
# #733, be a bit more 1337 about not returning unconsolidated DataFrame
Expand Down

0 comments on commit 03dd698

Please sign in to comment.