Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

BUG: DataFrame.__setitem__ sometimes operating inplace #43406

Merged
merged 11 commits into from
Nov 1, 2021
1 change: 1 addition & 0 deletions doc/source/whatsnew/v1.4.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -497,6 +497,7 @@ Indexing
- Bug in :meth:`DataFrame.query` where method calls in query strings led to errors when the ``numexpr`` package was installed. (:issue:`22435`)
- Bug in :meth:`DataFrame.nlargest` and :meth:`Series.nlargest` where sorted result did not count indexes containing ``np.nan`` (:issue:`28984`)
- Bug in indexing on a non-unique object-dtype :class:`Index` with an NA scalar (e.g. ``np.nan``) (:issue:`43711`)
- Bug in :meth:`DataFrame.__setitem__` incorrectly writing into an existing column's array rather than setting a new array when the new dtype and the old dtype match (:issue:`43406`)
- Bug in :meth:`Series.__setitem__` with object dtype when setting an array with matching size and dtype='datetime64[ns]' or dtype='timedelta64[ns]' incorrectly converting the datetime/timedeltas to integers (:issue:`43868`)
-

Expand Down
14 changes: 8 additions & 6 deletions pandas/core/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -3841,9 +3841,11 @@ def _set_item_frame_value(self, key, value: DataFrame) -> None:
arraylike = _reindex_for_setitem(value, self.index)
self._set_item_mgr(key, arraylike)

def _iset_item_mgr(self, loc: int | slice | np.ndarray, value) -> None:
def _iset_item_mgr(
self, loc: int | slice | np.ndarray, value, inplace: bool = False
) -> None:
# when called from _set_item_mgr loc can be anything returned from get_loc
self._mgr.iset(loc, value)
self._mgr.iset(loc, value, inplace=inplace)
self._clear_item_cache()

def _set_item_mgr(self, key, value: ArrayLike) -> None:
Expand All @@ -3861,9 +3863,9 @@ def _set_item_mgr(self, key, value: ArrayLike) -> None:
if len(self):
self._check_setitem_copy()

def _iset_item(self, loc: int, value) -> None:
def _iset_item(self, loc: int, value, inplace: bool = False) -> None:
arraylike = self._sanitize_column(value)
self._iset_item_mgr(loc, arraylike)
self._iset_item_mgr(loc, arraylike, inplace=inplace)

# check if we are modifying a copy
# try to set first as we want an invalid
Expand Down Expand Up @@ -3995,13 +3997,13 @@ def _reset_cacher(self) -> None:
# no-op for DataFrame
pass

def _maybe_cache_changed(self, item, value: Series) -> None:
def _maybe_cache_changed(self, item, value: Series, inplace: bool) -> None:
"""
The object has called back to us saying maybe it has changed.
"""
loc = self._info_axis.get_loc(item)
arraylike = value._values
self._mgr.iset(loc, arraylike)
self._mgr.iset(loc, arraylike, inplace=inplace)

# ----------------------------------------------------------------------
# Unsorted
Expand Down
5 changes: 4 additions & 1 deletion pandas/core/generic.py
Original file line number Diff line number Diff line change
Expand Up @@ -3530,7 +3530,10 @@ def _reset_cacher(self) -> None:
raise AbstractMethodError(self)

def _maybe_update_cacher(
self, clear: bool_t = False, verify_is_copy: bool_t = True
self,
clear: bool_t = False,
verify_is_copy: bool_t = True,
inplace: bool_t = False,
) -> None:
"""
See if we need to update our parent cacher if clear, then clear our
Expand Down
13 changes: 7 additions & 6 deletions pandas/core/indexing.py
Original file line number Diff line number Diff line change
Expand Up @@ -1862,10 +1862,10 @@ def _setitem_single_column(self, loc: int, value, plane_indexer):
# set the item, possibly having a dtype change
ser = ser.copy()
ser._mgr = ser._mgr.setitem(indexer=(pi,), value=value)
ser._maybe_update_cacher(clear=True)
ser._maybe_update_cacher(clear=True, inplace=True)

# reset the sliced object if unique
self.obj._iset_item(loc, ser)
self.obj._iset_item(loc, ser, inplace=True)

def _setitem_single_block(self, indexer, value, name: str):
"""
Expand All @@ -1890,9 +1890,10 @@ def _setitem_single_block(self, indexer, value, name: str):
if i != info_axis
)
):
selected_item_labels = item_labels[indexer[info_axis]]
if len(item_labels.get_indexer_for([selected_item_labels])) == 1:
self.obj[selected_item_labels] = value
col = item_labels[indexer[info_axis]]
if len(item_labels.get_indexer_for([col])) == 1:
loc = item_labels.get_loc(col)
self.obj._iset_item(loc, value, inplace=True)
return

indexer = maybe_convert_ix(*indexer)
Expand All @@ -1910,7 +1911,7 @@ def _setitem_single_block(self, indexer, value, name: str):

# actually do the set
self.obj._mgr = self.obj._mgr.setitem(indexer=indexer, value=value)
self.obj._maybe_update_cacher(clear=True)
self.obj._maybe_update_cacher(clear=True, inplace=True)

def _setitem_with_indexer_missing(self, indexer, value):
"""
Expand Down
6 changes: 5 additions & 1 deletion pandas/core/internals/array_manager.py
Original file line number Diff line number Diff line change
Expand Up @@ -790,7 +790,9 @@ def column_arrays(self) -> list[ArrayLike]:
"""
return self.arrays

def iset(self, loc: int | slice | np.ndarray, value: ArrayLike):
def iset(
self, loc: int | slice | np.ndarray, value: ArrayLike, inplace: bool = False
):
"""
Set new column(s).

Expand All @@ -802,6 +804,8 @@ def iset(self, loc: int | slice | np.ndarray, value: ArrayLike):
loc : integer, slice or boolean mask
Positional location (already bounds checked)
value : np.ndarray or ExtensionArray
inplace : bool, default False
Whether overwrite existing array as opposed to replacing it.
"""
# single column -> single integer index
if lib.is_integer(loc):
Expand Down
6 changes: 4 additions & 2 deletions pandas/core/internals/managers.py
Original file line number Diff line number Diff line change
Expand Up @@ -1024,7 +1024,9 @@ def column_arrays(self) -> list[np.ndarray]:
# expected "List[ndarray[Any, Any]]")
return result # type: ignore[return-value]

def iset(self, loc: int | slice | np.ndarray, value: ArrayLike):
def iset(
self, loc: int | slice | np.ndarray, value: ArrayLike, inplace: bool = False
):
"""
Set new item in-place. Does not consolidate. Adds new Block if not
contained in the current set of items
Expand Down Expand Up @@ -1079,7 +1081,7 @@ def value_getitem(placement):
for blkno, val_locs in libinternals.get_blkno_placements(blknos, group=True):
blk = self.blocks[blkno]
blk_locs = blklocs[val_locs.indexer]
if blk.should_store(value):
if inplace and blk.should_store(value):
blk.set_inplace(blk_locs, value_getitem(val_locs))
else:
unfit_mgr_locs.append(blk.mgr_locs.as_array[blk_locs])
Expand Down
8 changes: 5 additions & 3 deletions pandas/core/series.py
Original file line number Diff line number Diff line change
Expand Up @@ -1226,7 +1226,7 @@ def _check_is_chained_assignment_possible(self) -> bool:
return super()._check_is_chained_assignment_possible()

def _maybe_update_cacher(
self, clear: bool = False, verify_is_copy: bool = True
self, clear: bool = False, verify_is_copy: bool = True, inplace: bool = False
jreback marked this conversation as resolved.
Show resolved Hide resolved
) -> None:
"""
See NDFrame._maybe_update_cacher.__doc__
Expand All @@ -1244,13 +1244,15 @@ def _maybe_update_cacher(
# GH#42530 self.name must be in ref.columns
# to ensure column still in dataframe
# otherwise, either self or ref has swapped in new arrays
ref._maybe_cache_changed(cacher[0], self)
ref._maybe_cache_changed(cacher[0], self, inplace=inplace)
else:
# GH#33675 we have swapped in a new array, so parent
# reference to self is now invalid
ref._item_cache.pop(cacher[0], None)

super()._maybe_update_cacher(clear=clear, verify_is_copy=verify_is_copy)
super()._maybe_update_cacher(
clear=clear, verify_is_copy=verify_is_copy, inplace=inplace
)

# ----------------------------------------------------------------------
# Unsorted
Expand Down
14 changes: 11 additions & 3 deletions pandas/tests/frame/indexing/test_indexing.py
Original file line number Diff line number Diff line change
Expand Up @@ -537,16 +537,19 @@ def test_getitem_setitem_integer_slice_keyerrors(self):

@td.skip_array_manager_invalid_test # already covered in test_iloc_col_slice_view
def test_fancy_getitem_slice_mixed(self, float_frame, float_string_frame):

sliced = float_string_frame.iloc[:, -3:]
assert sliced["D"].dtype == np.float64

# get view with single block
# setting it triggers setting with copy
sliced = float_frame.iloc[:, -3:]

assert np.shares_memory(sliced["C"]._values, float_frame["C"]._values)

msg = r"\nA value is trying to be set on a copy of a slice from a DataFrame"
with pytest.raises(com.SettingWithCopyError, match=msg):
sliced["C"] = 4.0
sliced.loc[:, "C"] = 4.0

assert (float_frame["C"] == 4).all()

Expand Down Expand Up @@ -1004,9 +1007,11 @@ def test_iloc_row_slice_view(self, using_array_manager):
# setting it makes it raise/warn
subset = df.iloc[slice(4, 8)]

assert np.shares_memory(df[2], subset[2])

msg = r"\nA value is trying to be set on a copy of a slice from a DataFrame"
with pytest.raises(com.SettingWithCopyError, match=msg):
subset[2] = 0.0
subset.loc[:, 2] = 0.0

exp_col = original[2].copy()
# TODO(ArrayManager) verify it is expected that the original didn't change
Expand Down Expand Up @@ -1043,10 +1048,13 @@ def test_iloc_col_slice_view(self, using_array_manager):

if not using_array_manager:
# verify slice is view

assert np.shares_memory(df[8]._values, subset[8]._values)

# and that we are setting a copy
msg = r"\nA value is trying to be set on a copy of a slice from a DataFrame"
with pytest.raises(com.SettingWithCopyError, match=msg):
subset[8] = 0.0
subset.loc[:, 8] = 0.0

assert (df[8] == 0).all()
else:
Expand Down
37 changes: 31 additions & 6 deletions pandas/tests/frame/indexing/test_setitem.py
Original file line number Diff line number Diff line change
Expand Up @@ -1028,12 +1028,6 @@ def test_setitem_duplicate_columns_not_inplace(self):
)
def test_setitem_same_dtype_not_inplace(self, value, using_array_manager, request):
# GH#39510
if not using_array_manager:
mark = pytest.mark.xfail(
reason="Setitem with same dtype still changing inplace"
)
request.node.add_marker(mark)

cols = ["A", "B"]
df = DataFrame(0, index=[0, 1], columns=cols)
df_copy = df.copy()
Expand All @@ -1056,3 +1050,34 @@ def test_setitem_listlike_key_scalar_value_not_inplace(self, value):
expected = DataFrame([[0, 1.0], [0, 1.0]], columns=cols)
tm.assert_frame_equal(df_view, df_copy)
tm.assert_frame_equal(df, expected)

@pytest.mark.parametrize(
"indexer",
[
"a",
["a"],
pytest.param(
[True, False],
marks=pytest.mark.xfail(
reason="Boolean indexer incorrectly setting inplace",
strict=False, # passing on some builds, no obvious pattern
),
),
],
)
@pytest.mark.parametrize(
"value, set_value",
[
(1, 5),
(1.0, 5.0),
(Timestamp("2020-12-31"), Timestamp("2021-12-31")),
("a", "b"),
],
)
def test_setitem_not_operating_inplace(self, value, set_value, indexer):
# GH#43406
df = DataFrame({"a": value}, index=[0, 1])
expected = df.copy()
view = df[:]
df[indexer] = set_value
tm.assert_frame_equal(view, expected)
5 changes: 4 additions & 1 deletion pandas/tests/frame/methods/test_rename.py
Original file line number Diff line number Diff line change
Expand Up @@ -173,7 +173,10 @@ def test_rename_multiindex(self):
@td.skip_array_manager_not_yet_implemented # TODO(ArrayManager) setitem copy/view
def test_rename_nocopy(self, float_frame):
renamed = float_frame.rename(columns={"C": "foo"}, copy=False)
renamed["foo"] = 1.0

assert np.shares_memory(renamed["foo"]._values, float_frame["C"]._values)

renamed.loc[:, "foo"] = 1.0
assert (float_frame["C"] == 1.0).all()

def test_rename_inplace(self, float_frame):
Expand Down
20 changes: 12 additions & 8 deletions pandas/tests/indexing/test_iloc.py
Original file line number Diff line number Diff line change
Expand Up @@ -826,20 +826,24 @@ def test_iloc_empty_list_indexer_is_ok(self):
df.iloc[[]], df.iloc[:0, :], check_index_type=True, check_column_type=True
)

def test_identity_slice_returns_new_object(self, using_array_manager):
def test_identity_slice_returns_new_object(self, using_array_manager, request):
# GH13873
if using_array_manager:
mark = pytest.mark.xfail(
reason="setting with .loc[:, 'a'] does not alter inplace"
)
request.node.add_marker(mark)

original_df = DataFrame({"a": [1, 2, 3]})
sliced_df = original_df.iloc[:]
assert sliced_df is not original_df

# should be a shallow copy
original_df["a"] = [4, 4, 4]
if using_array_manager:
# TODO(ArrayManager) verify it is expected that the original didn't change
# setitem is replacing full column, so doesn't update "viewing" dataframe
assert not (sliced_df["a"] == 4).all()
else:
assert (sliced_df["a"] == 4).all()
assert np.shares_memory(original_df["a"], sliced_df["a"])

# Setting using .loc[:, "a"] sets inplace so alters both sliced and orig
original_df.loc[:, "a"] = [4, 4, 4]
assert (sliced_df["a"] == 4).all()

original_series = Series([1, 2, 3, 4, 5, 6])
sliced_series = original_series.iloc[:]
Expand Down
20 changes: 12 additions & 8 deletions pandas/tests/indexing/test_loc.py
Original file line number Diff line number Diff line change
Expand Up @@ -998,21 +998,25 @@ def test_loc_empty_list_indexer_is_ok(self):
df.loc[[]], df.iloc[:0, :], check_index_type=True, check_column_type=True
)

def test_identity_slice_returns_new_object(self, using_array_manager):
def test_identity_slice_returns_new_object(self, using_array_manager, request):
# GH13873
if using_array_manager:
mark = pytest.mark.xfail(
reason="setting with .loc[:, 'a'] does not alter inplace"
)
request.node.add_marker(mark)

original_df = DataFrame({"a": [1, 2, 3]})
sliced_df = original_df.loc[:]
assert sliced_df is not original_df
assert original_df[:] is not original_df

# should be a shallow copy
original_df["a"] = [4, 4, 4]
if using_array_manager:
# TODO(ArrayManager) verify it is expected that the original didn't change
# setitem is replacing full column, so doesn't update "viewing" dataframe
assert not (sliced_df["a"] == 4).all()
else:
assert (sliced_df["a"] == 4).all()
assert np.shares_memory(original_df["a"]._values, sliced_df["a"]._values)

# Setting using .loc[:, "a"] sets inplace so alters both sliced and orig
original_df.loc[:, "a"] = [4, 4, 4]
assert (sliced_df["a"] == 4).all()

# These should not return copies
assert original_df is original_df.loc[:, :]
Expand Down
12 changes: 9 additions & 3 deletions pandas/tests/internals/test_internals.py
Original file line number Diff line number Diff line change
Expand Up @@ -750,7 +750,11 @@ def test_get_numeric_data(self):
)

# Check sharing
numeric.iset(numeric.items.get_loc("float"), np.array([100.0, 200.0, 300.0]))
numeric.iset(
numeric.items.get_loc("float"),
np.array([100.0, 200.0, 300.0]),
inplace=True,
)
tm.assert_almost_equal(
mgr.iget(mgr.items.get_loc("float")).internal_values(),
np.array([100.0, 200.0, 300.0]),
Expand All @@ -759,7 +763,9 @@ def test_get_numeric_data(self):
numeric2 = mgr.get_numeric_data(copy=True)
tm.assert_index_equal(numeric.items, Index(["int", "float", "complex", "bool"]))
numeric2.iset(
numeric2.items.get_loc("float"), np.array([1000.0, 2000.0, 3000.0])
numeric2.items.get_loc("float"),
np.array([1000.0, 2000.0, 3000.0]),
inplace=True,
)
tm.assert_almost_equal(
mgr.iget(mgr.items.get_loc("float")).internal_values(),
Expand All @@ -781,7 +787,7 @@ def test_get_bool_data(self):
bools.iget(bools.items.get_loc("bool")).internal_values(),
)

bools.iset(0, np.array([True, False, True]))
bools.iset(0, np.array([True, False, True]), inplace=True)
tm.assert_numpy_array_equal(
mgr.iget(mgr.items.get_loc("bool")).internal_values(),
np.array([True, False, True]),
Expand Down
17 changes: 2 additions & 15 deletions pandas/tests/reshape/merge/test_merge.py
Original file line number Diff line number Diff line change
Expand Up @@ -308,21 +308,8 @@ def test_merge_nocopy(self, using_array_manager):

merged = merge(left, right, left_index=True, right_index=True, copy=False)

if using_array_manager:
# With ArrayManager, setting a column doesn't change the values inplace
# and thus does not propagate the changes to the original left/right
# dataframes -> need to check that no copy was made in a different way
# TODO(ArrayManager) we should be able to simplify this with a .loc
# setitem test: merged.loc[0, "a"] = 10; assert left.loc[0, "a"] == 10
# but this currently replaces the array (_setitem_with_indexer_split_path)
assert merged._mgr.arrays[0] is left._mgr.arrays[0]
assert merged._mgr.arrays[2] is right._mgr.arrays[0]
else:
merged["a"] = 6
assert (left["a"] == 6).all()

merged["d"] = "peekaboo"
assert (right["d"] == "peekaboo").all()
assert np.shares_memory(merged["a"]._values, left["a"]._values)
assert np.shares_memory(merged["d"]._values, right["d"]._values)

def test_intelligently_handle_join_key(self):
# #733, be a bit more 1337 about not returning unconsolidated DataFrame
Expand Down