BUG: DataFrame.__setitem__ sometimes operating inplace (#43406)

pandas-dev · Nov 1, 2021 · 03dd698 · 03dd698
1 parent 33fffdb
commit 03dd698
Show file tree

Hide file tree

Showing 14 changed files with 115 additions and 63 deletions.
diff --git a/doc/source/whatsnew/v1.4.0.rst b/doc/source/whatsnew/v1.4.0.rst
@@ -531,6 +531,7 @@ Indexing
 - Bug in :meth:`DataFrame.query` where method calls in query strings led to errors when the ``numexpr`` package was installed. (:issue:`22435`)
 - Bug in :meth:`DataFrame.nlargest` and :meth:`Series.nlargest` where sorted result did not count indexes containing ``np.nan`` (:issue:`28984`)
 - Bug in indexing on a non-unique object-dtype :class:`Index` with an NA scalar (e.g. ``np.nan``) (:issue:`43711`)
+- Bug in :meth:`DataFrame.__setitem__` incorrectly writing into an existing column's array rather than setting a new array when the new dtype and the old dtype match (:issue:`43406`)
 - Bug in :meth:`Series.__setitem__` with object dtype when setting an array with matching size and dtype='datetime64[ns]' or dtype='timedelta64[ns]' incorrectly converting the datetime/timedeltas to integers (:issue:`43868`)
 - Bug in :meth:`DataFrame.sort_index` where ``ignore_index=True`` was not being respected when the index was already sorted (:issue:`43591`)
 - Bug in :meth:`Index.get_indexer_non_unique` when index contains multiple ``np.datetime64("NaT")`` and ``np.timedelta64("NaT")`` (:issue:`43869`)

diff --git a/pandas/core/frame.py b/pandas/core/frame.py
@@ -3840,9 +3840,11 @@ def _set_item_frame_value(self, key, value: DataFrame) -> None:
         arraylike = _reindex_for_setitem(value, self.index)
         self._set_item_mgr(key, arraylike)
 
-    def _iset_item_mgr(self, loc: int | slice | np.ndarray, value) -> None:
+    def _iset_item_mgr(
+        self, loc: int | slice | np.ndarray, value, inplace: bool = False
+    ) -> None:
         # when called from _set_item_mgr loc can be anything returned from get_loc
-        self._mgr.iset(loc, value)
+        self._mgr.iset(loc, value, inplace=inplace)
         self._clear_item_cache()
 
     def _set_item_mgr(self, key, value: ArrayLike) -> None:
@@ -3860,9 +3862,9 @@ def _set_item_mgr(self, key, value: ArrayLike) -> None:
         if len(self):
             self._check_setitem_copy()
 
-    def _iset_item(self, loc: int, value) -> None:
+    def _iset_item(self, loc: int, value, inplace: bool = False) -> None:
         arraylike = self._sanitize_column(value)
-        self._iset_item_mgr(loc, arraylike)
+        self._iset_item_mgr(loc, arraylike, inplace=inplace)
 
         # check if we are modifying a copy
         # try to set first as we want an invalid
@@ -3994,13 +3996,13 @@ def _reset_cacher(self) -> None:
         # no-op for DataFrame
         pass
 
-    def _maybe_cache_changed(self, item, value: Series) -> None:
+    def _maybe_cache_changed(self, item, value: Series, inplace: bool) -> None:
         """
         The object has called back to us saying maybe it has changed.
         """
         loc = self._info_axis.get_loc(item)
         arraylike = value._values
-        self._mgr.iset(loc, arraylike)
+        self._mgr.iset(loc, arraylike, inplace=inplace)
 
     # ----------------------------------------------------------------------
     # Unsorted

diff --git a/pandas/core/generic.py b/pandas/core/generic.py
@@ -3531,7 +3531,10 @@ def _reset_cacher(self) -> None:
         raise AbstractMethodError(self)
 
     def _maybe_update_cacher(
-        self, clear: bool_t = False, verify_is_copy: bool_t = True
+        self,
+        clear: bool_t = False,
+        verify_is_copy: bool_t = True,
+        inplace: bool_t = False,
     ) -> None:
         """
         See if we need to update our parent cacher if clear, then clear our

diff --git a/pandas/core/indexing.py b/pandas/core/indexing.py
@@ -1862,10 +1862,10 @@ def _setitem_single_column(self, loc: int, value, plane_indexer):
             # set the item, possibly having a dtype change
             ser = ser.copy()
             ser._mgr = ser._mgr.setitem(indexer=(pi,), value=value)
-            ser._maybe_update_cacher(clear=True)
+            ser._maybe_update_cacher(clear=True, inplace=True)
 
         # reset the sliced object if unique
-        self.obj._iset_item(loc, ser)
+        self.obj._iset_item(loc, ser, inplace=True)
 
     def _setitem_single_block(self, indexer, value, name: str):
         """
@@ -1890,9 +1890,10 @@ def _setitem_single_block(self, indexer, value, name: str):
                     if i != info_axis
                 )
             ):
-                selected_item_labels = item_labels[indexer[info_axis]]
-                if len(item_labels.get_indexer_for([selected_item_labels])) == 1:
-                    self.obj[selected_item_labels] = value
+                col = item_labels[indexer[info_axis]]
+                if len(item_labels.get_indexer_for([col])) == 1:
+                    loc = item_labels.get_loc(col)
+                    self.obj._iset_item(loc, value, inplace=True)
                     return
 
             indexer = maybe_convert_ix(*indexer)
@@ -1910,7 +1911,7 @@ def _setitem_single_block(self, indexer, value, name: str):
 
         # actually do the set
         self.obj._mgr = self.obj._mgr.setitem(indexer=indexer, value=value)
-        self.obj._maybe_update_cacher(clear=True)
+        self.obj._maybe_update_cacher(clear=True, inplace=True)
 
     def _setitem_with_indexer_missing(self, indexer, value):
         """

diff --git a/pandas/core/internals/array_manager.py b/pandas/core/internals/array_manager.py
@@ -792,7 +792,9 @@ def column_arrays(self) -> list[ArrayLike]:
         """
         return self.arrays
 
-    def iset(self, loc: int | slice | np.ndarray, value: ArrayLike):
+    def iset(
+        self, loc: int | slice | np.ndarray, value: ArrayLike, inplace: bool = False
+    ):
         """
         Set new column(s).
 
@@ -804,6 +806,8 @@ def iset(self, loc: int | slice | np.ndarray, value: ArrayLike):
         loc : integer, slice or boolean mask
             Positional location (already bounds checked)
         value : np.ndarray or ExtensionArray
+        inplace : bool, default False
+            Whether overwrite existing array as opposed to replacing it.
         """
         # single column -> single integer index
         if lib.is_integer(loc):

diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py
@@ -1032,7 +1032,9 @@ def column_arrays(self) -> list[np.ndarray]:
         # expected "List[ndarray[Any, Any]]")
         return result  # type: ignore[return-value]
 
-    def iset(self, loc: int | slice | np.ndarray, value: ArrayLike):
+    def iset(
+        self, loc: int | slice | np.ndarray, value: ArrayLike, inplace: bool = False
+    ):
         """
         Set new item in-place. Does not consolidate. Adds new Block if not
         contained in the current set of items
@@ -1087,7 +1089,7 @@ def value_getitem(placement):
         for blkno, val_locs in libinternals.get_blkno_placements(blknos, group=True):
             blk = self.blocks[blkno]
             blk_locs = blklocs[val_locs.indexer]
-            if blk.should_store(value):
+            if inplace and blk.should_store(value):
                 blk.set_inplace(blk_locs, value_getitem(val_locs))
             else:
                 unfit_mgr_locs.append(blk.mgr_locs.as_array[blk_locs])

diff --git a/pandas/core/series.py b/pandas/core/series.py
@@ -1226,7 +1226,7 @@ def _check_is_chained_assignment_possible(self) -> bool:
         return super()._check_is_chained_assignment_possible()
 
     def _maybe_update_cacher(
-        self, clear: bool = False, verify_is_copy: bool = True
+        self, clear: bool = False, verify_is_copy: bool = True, inplace: bool = False
     ) -> None:
         """
         See NDFrame._maybe_update_cacher.__doc__
@@ -1244,13 +1244,15 @@ def _maybe_update_cacher(
                 # GH#42530 self.name must be in ref.columns
                 # to ensure column still in dataframe
                 # otherwise, either self or ref has swapped in new arrays
-                ref._maybe_cache_changed(cacher[0], self)
+                ref._maybe_cache_changed(cacher[0], self, inplace=inplace)
             else:
                 # GH#33675 we have swapped in a new array, so parent
                 #  reference to self is now invalid
                 ref._item_cache.pop(cacher[0], None)
 
-        super()._maybe_update_cacher(clear=clear, verify_is_copy=verify_is_copy)
+        super()._maybe_update_cacher(
+            clear=clear, verify_is_copy=verify_is_copy, inplace=inplace
+        )
 
     # ----------------------------------------------------------------------
     # Unsorted

diff --git a/pandas/tests/frame/indexing/test_indexing.py b/pandas/tests/frame/indexing/test_indexing.py
@@ -537,16 +537,19 @@ def test_getitem_setitem_integer_slice_keyerrors(self):
 
     @td.skip_array_manager_invalid_test  # already covered in test_iloc_col_slice_view
     def test_fancy_getitem_slice_mixed(self, float_frame, float_string_frame):
+
         sliced = float_string_frame.iloc[:, -3:]
         assert sliced["D"].dtype == np.float64
 
         # get view with single block
         # setting it triggers setting with copy
         sliced = float_frame.iloc[:, -3:]
 
+        assert np.shares_memory(sliced["C"]._values, float_frame["C"]._values)
+
         msg = r"\nA value is trying to be set on a copy of a slice from a DataFrame"
         with pytest.raises(com.SettingWithCopyError, match=msg):
-            sliced["C"] = 4.0
+            sliced.loc[:, "C"] = 4.0
 
         assert (float_frame["C"] == 4).all()
 
@@ -1002,9 +1005,11 @@ def test_iloc_row_slice_view(self, using_array_manager):
         # setting it makes it raise/warn
         subset = df.iloc[slice(4, 8)]
 
+        assert np.shares_memory(df[2], subset[2])
+
         msg = r"\nA value is trying to be set on a copy of a slice from a DataFrame"
         with pytest.raises(com.SettingWithCopyError, match=msg):
-            subset[2] = 0.0
+            subset.loc[:, 2] = 0.0
 
         exp_col = original[2].copy()
         # TODO(ArrayManager) verify it is expected that the original didn't change
@@ -1041,10 +1046,13 @@ def test_iloc_col_slice_view(self, using_array_manager):
 
         if not using_array_manager:
             # verify slice is view
+
+            assert np.shares_memory(df[8]._values, subset[8]._values)
+
             # and that we are setting a copy
             msg = r"\nA value is trying to be set on a copy of a slice from a DataFrame"
             with pytest.raises(com.SettingWithCopyError, match=msg):
-                subset[8] = 0.0
+                subset.loc[:, 8] = 0.0
 
             assert (df[8] == 0).all()
         else:

diff --git a/pandas/tests/frame/indexing/test_setitem.py b/pandas/tests/frame/indexing/test_setitem.py
@@ -1028,12 +1028,6 @@ def test_setitem_duplicate_columns_not_inplace(self):
     )
     def test_setitem_same_dtype_not_inplace(self, value, using_array_manager, request):
         # GH#39510
-        if not using_array_manager:
-            mark = pytest.mark.xfail(
-                reason="Setitem with same dtype still changing inplace"
-            )
-            request.node.add_marker(mark)
-
         cols = ["A", "B"]
         df = DataFrame(0, index=[0, 1], columns=cols)
         df_copy = df.copy()
@@ -1056,3 +1050,34 @@ def test_setitem_listlike_key_scalar_value_not_inplace(self, value):
         expected = DataFrame([[0, 1.0], [0, 1.0]], columns=cols)
         tm.assert_frame_equal(df_view, df_copy)
         tm.assert_frame_equal(df, expected)
+
+    @pytest.mark.parametrize(
+        "indexer",
+        [
+            "a",
+            ["a"],
+            pytest.param(
+                [True, False],
+                marks=pytest.mark.xfail(
+                    reason="Boolean indexer incorrectly setting inplace",
+                    strict=False,  # passing on some builds, no obvious pattern
+                ),
+            ),
+        ],
+    )
+    @pytest.mark.parametrize(
+        "value, set_value",
+        [
+            (1, 5),
+            (1.0, 5.0),
+            (Timestamp("2020-12-31"), Timestamp("2021-12-31")),
+            ("a", "b"),
+        ],
+    )
+    def test_setitem_not_operating_inplace(self, value, set_value, indexer):
+        # GH#43406
+        df = DataFrame({"a": value}, index=[0, 1])
+        expected = df.copy()
+        view = df[:]
+        df[indexer] = set_value
+        tm.assert_frame_equal(view, expected)
diff --git a/pandas/tests/frame/methods/test_rename.py b/pandas/tests/frame/methods/test_rename.py
@@ -173,7 +173,10 @@ def test_rename_multiindex(self):
     @td.skip_array_manager_not_yet_implemented  # TODO(ArrayManager) setitem copy/view
     def test_rename_nocopy(self, float_frame):
         renamed = float_frame.rename(columns={"C": "foo"}, copy=False)
-        renamed["foo"] = 1.0
+
+        assert np.shares_memory(renamed["foo"]._values, float_frame["C"]._values)
+
+        renamed.loc[:, "foo"] = 1.0
         assert (float_frame["C"] == 1.0).all()
 
     def test_rename_inplace(self, float_frame):

diff --git a/pandas/tests/indexing/test_iloc.py b/pandas/tests/indexing/test_iloc.py
@@ -824,20 +824,24 @@ def test_iloc_empty_list_indexer_is_ok(self):
             df.iloc[[]], df.iloc[:0, :], check_index_type=True, check_column_type=True
         )
 
-    def test_identity_slice_returns_new_object(self, using_array_manager):
+    def test_identity_slice_returns_new_object(self, using_array_manager, request):
         # GH13873
+        if using_array_manager:
+            mark = pytest.mark.xfail(
+                reason="setting with .loc[:, 'a'] does not alter inplace"
+            )
+            request.node.add_marker(mark)
+
         original_df = DataFrame({"a": [1, 2, 3]})
         sliced_df = original_df.iloc[:]
         assert sliced_df is not original_df
 
         # should be a shallow copy
-        original_df["a"] = [4, 4, 4]
-        if using_array_manager:
-            # TODO(ArrayManager) verify it is expected that the original didn't change
-            # setitem is replacing full column, so doesn't update "viewing" dataframe
-            assert not (sliced_df["a"] == 4).all()
-        else:
-            assert (sliced_df["a"] == 4).all()
+        assert np.shares_memory(original_df["a"], sliced_df["a"])
+
+        # Setting using .loc[:, "a"] sets inplace so alters both sliced and orig
+        original_df.loc[:, "a"] = [4, 4, 4]
+        assert (sliced_df["a"] == 4).all()
 
         original_series = Series([1, 2, 3, 4, 5, 6])
         sliced_series = original_series.iloc[:]

diff --git a/pandas/tests/indexing/test_loc.py b/pandas/tests/indexing/test_loc.py
@@ -1002,21 +1002,25 @@ def test_loc_empty_list_indexer_is_ok(self):
             df.loc[[]], df.iloc[:0, :], check_index_type=True, check_column_type=True
         )
 
-    def test_identity_slice_returns_new_object(self, using_array_manager):
+    def test_identity_slice_returns_new_object(self, using_array_manager, request):
         # GH13873
+        if using_array_manager:
+            mark = pytest.mark.xfail(
+                reason="setting with .loc[:, 'a'] does not alter inplace"
+            )
+            request.node.add_marker(mark)
+
         original_df = DataFrame({"a": [1, 2, 3]})
         sliced_df = original_df.loc[:]
         assert sliced_df is not original_df
         assert original_df[:] is not original_df
 
         # should be a shallow copy
-        original_df["a"] = [4, 4, 4]
-        if using_array_manager:
-            # TODO(ArrayManager) verify it is expected that the original didn't change
-            # setitem is replacing full column, so doesn't update "viewing" dataframe
-            assert not (sliced_df["a"] == 4).all()
-        else:
-            assert (sliced_df["a"] == 4).all()
+        assert np.shares_memory(original_df["a"]._values, sliced_df["a"]._values)
+
+        # Setting using .loc[:, "a"] sets inplace so alters both sliced and orig
+        original_df.loc[:, "a"] = [4, 4, 4]
+        assert (sliced_df["a"] == 4).all()
 
         # These should not return copies
         assert original_df is original_df.loc[:, :]

diff --git a/pandas/tests/internals/test_internals.py b/pandas/tests/internals/test_internals.py
@@ -750,7 +750,11 @@ def test_get_numeric_data(self):
         )
 
         # Check sharing
-        numeric.iset(numeric.items.get_loc("float"), np.array([100.0, 200.0, 300.0]))
+        numeric.iset(
+            numeric.items.get_loc("float"),
+            np.array([100.0, 200.0, 300.0]),
+            inplace=True,
+        )
         tm.assert_almost_equal(
             mgr.iget(mgr.items.get_loc("float")).internal_values(),
             np.array([100.0, 200.0, 300.0]),
@@ -759,7 +763,9 @@ def test_get_numeric_data(self):
         numeric2 = mgr.get_numeric_data(copy=True)
         tm.assert_index_equal(numeric.items, Index(["int", "float", "complex", "bool"]))
         numeric2.iset(
-            numeric2.items.get_loc("float"), np.array([1000.0, 2000.0, 3000.0])
+            numeric2.items.get_loc("float"),
+            np.array([1000.0, 2000.0, 3000.0]),
+            inplace=True,
         )
         tm.assert_almost_equal(
             mgr.iget(mgr.items.get_loc("float")).internal_values(),
@@ -781,7 +787,7 @@ def test_get_bool_data(self):
             bools.iget(bools.items.get_loc("bool")).internal_values(),
         )
 
-        bools.iset(0, np.array([True, False, True]))
+        bools.iset(0, np.array([True, False, True]), inplace=True)
         tm.assert_numpy_array_equal(
             mgr.iget(mgr.items.get_loc("bool")).internal_values(),
             np.array([True, False, True]),

diff --git a/pandas/tests/reshape/merge/test_merge.py b/pandas/tests/reshape/merge/test_merge.py
@@ -308,21 +308,8 @@ def test_merge_nocopy(self, using_array_manager):
 
         merged = merge(left, right, left_index=True, right_index=True, copy=False)
 
-        if using_array_manager:
-            # With ArrayManager, setting a column doesn't change the values inplace
-            # and thus does not propagate the changes to the original left/right
-            # dataframes -> need to check that no copy was made in a different way
-            # TODO(ArrayManager) we should be able to simplify this with a .loc
-            #  setitem test: merged.loc[0, "a"] = 10; assert left.loc[0, "a"] == 10
-            #  but this currently replaces the array (_setitem_with_indexer_split_path)
-            assert merged._mgr.arrays[0] is left._mgr.arrays[0]
-            assert merged._mgr.arrays[2] is right._mgr.arrays[0]
-        else:
-            merged["a"] = 6
-            assert (left["a"] == 6).all()
-
-            merged["d"] = "peekaboo"
-            assert (right["d"] == "peekaboo").all()
+        assert np.shares_memory(merged["a"]._values, left["a"]._values)
+        assert np.shares_memory(merged["d"]._values, right["d"]._values)
 
     def test_intelligently_handle_join_key(self):
         # #733, be a bit more 1337 about not returning unconsolidated DataFrame