From b858de02798dc777fe2dd2bf261c9e037c74a4a9 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Tue, 1 Nov 2022 20:46:43 +0100 Subject: [PATCH] BUG: CoW - correctly track references for chained operations (#48996) Co-authored-by: Patrick Hoefler <61934744+phofl@users.noreply.github.com> --- doc/source/whatsnew/v1.5.2.rst | 2 +- pandas/_libs/internals.pyx | 12 +- pandas/core/internals/managers.py | 55 ++++++-- pandas/tests/copy_view/test_indexing.py | 152 +++++++++++++++++++++++ pandas/tests/copy_view/test_internals.py | 19 +++ pandas/tests/copy_view/test_methods.py | 49 +++++++- 6 files changed, 270 insertions(+), 19 deletions(-) diff --git a/doc/source/whatsnew/v1.5.2.rst b/doc/source/whatsnew/v1.5.2.rst index c95f563a77708..e65be3bcecd76 100644 --- a/doc/source/whatsnew/v1.5.2.rst +++ b/doc/source/whatsnew/v1.5.2.rst @@ -23,7 +23,7 @@ Fixed regressions Bug fixes ~~~~~~~~~ -- +- Bug in the Copy-on-Write implementation losing track of views in certain chained indexing cases (:issue:`48996`) - .. --------------------------------------------------------------------------- diff --git a/pandas/_libs/internals.pyx b/pandas/_libs/internals.pyx index 1a98633908a49..7f0f91652ae0d 100644 --- a/pandas/_libs/internals.pyx +++ b/pandas/_libs/internals.pyx @@ -676,8 +676,9 @@ cdef class BlockManager: public bint _known_consolidated, _is_consolidated public ndarray _blknos, _blklocs public list refs + public object parent - def __cinit__(self, blocks=None, axes=None, refs=None, verify_integrity=True): + def __cinit__(self, blocks=None, axes=None, refs=None, parent=None, verify_integrity=True): # None as defaults for unpickling GH#42345 if blocks is None: # This adds 1-2 microseconds to DataFrame(np.array([])) @@ -690,6 +691,7 @@ cdef class BlockManager: self.blocks = blocks self.axes = axes.copy() # copy to make sure we are not remotely-mutable self.refs = refs + self.parent = parent # Populate known_consolidate, blknos, and blklocs lazily self._known_consolidated = False @@ -805,7 +807,9 @@ cdef class BlockManager: nrefs.append(weakref.ref(blk)) new_axes = [self.axes[0], self.axes[1]._getitem_slice(slobj)] - mgr = type(self)(tuple(nbs), new_axes, nrefs, verify_integrity=False) + mgr = type(self)( + tuple(nbs), new_axes, nrefs, parent=self, verify_integrity=False + ) # We can avoid having to rebuild blklocs/blknos blklocs = self._blklocs @@ -827,4 +831,6 @@ cdef class BlockManager: new_axes = list(self.axes) new_axes[axis] = new_axes[axis]._getitem_slice(slobj) - return type(self)(tuple(new_blocks), new_axes, new_refs, verify_integrity=False) + return type(self)( + tuple(new_blocks), new_axes, new_refs, parent=self, verify_integrity=False + ) diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py index f21c02a7823ae..81a161fdd2417 100644 --- a/pandas/core/internals/managers.py +++ b/pandas/core/internals/managers.py @@ -57,6 +57,7 @@ import pandas.core.algorithms as algos from pandas.core.arrays._mixins import NDArrayBackedExtensionArray from pandas.core.arrays.sparse import SparseDtype +import pandas.core.common as com from pandas.core.construction import ( ensure_wrapped_if_datetimelike, extract_array, @@ -148,6 +149,7 @@ class BaseBlockManager(DataManager): blocks: tuple[Block, ...] axes: list[Index] refs: list[weakref.ref | None] | None + parent: object @property def ndim(self) -> int: @@ -165,6 +167,7 @@ def from_blocks( blocks: list[Block], axes: list[Index], refs: list[weakref.ref | None] | None = None, + parent: object = None, ) -> T: raise NotImplementedError @@ -264,6 +267,8 @@ def _clear_reference_block(self, blkno: int) -> None: """ if self.refs is not None: self.refs[blkno] = None + if com.all_none(*self.refs): + self.parent = None def get_dtypes(self): dtypes = np.array([blk.dtype for blk in self.blocks]) @@ -605,7 +610,9 @@ def _combine( axes[-1] = index axes[0] = self.items.take(indexer) - return type(self).from_blocks(new_blocks, axes, new_refs) + return type(self).from_blocks( + new_blocks, axes, new_refs, parent=None if copy else self + ) @property def nblocks(self) -> int: @@ -648,11 +655,14 @@ def copy_func(ax): new_refs: list[weakref.ref | None] | None if deep: new_refs = None + parent = None else: new_refs = [weakref.ref(blk) for blk in self.blocks] + parent = self res.axes = new_axes res.refs = new_refs + res.parent = parent if self.ndim > 1: # Avoid needing to re-compute these @@ -744,6 +754,7 @@ def reindex_indexer( only_slice=only_slice, use_na_proxy=use_na_proxy, ) + parent = None if com.all_none(*new_refs) else self else: new_blocks = [ blk.take_nd( @@ -756,11 +767,12 @@ def reindex_indexer( for blk in self.blocks ] new_refs = None + parent = None new_axes = list(self.axes) new_axes[axis] = new_axis - new_mgr = type(self).from_blocks(new_blocks, new_axes, new_refs) + new_mgr = type(self).from_blocks(new_blocks, new_axes, new_refs, parent=parent) if axis == 1: # We can avoid the need to rebuild these new_mgr._blknos = self.blknos.copy() @@ -995,6 +1007,7 @@ def __init__( blocks: Sequence[Block], axes: Sequence[Index], refs: list[weakref.ref | None] | None = None, + parent: object = None, verify_integrity: bool = True, ) -> None: @@ -1059,11 +1072,13 @@ def from_blocks( blocks: list[Block], axes: list[Index], refs: list[weakref.ref | None] | None = None, + parent: object = None, ) -> BlockManager: """ Constructor for BlockManager and SingleBlockManager with same signature. """ - return cls(blocks, axes, refs, verify_integrity=False) + parent = parent if _using_copy_on_write() else None + return cls(blocks, axes, refs, parent, verify_integrity=False) # ---------------------------------------------------------------- # Indexing @@ -1085,7 +1100,7 @@ def fast_xs(self, loc: int) -> SingleBlockManager: block = new_block(result, placement=slice(0, len(result)), ndim=1) # in the case of a single block, the new block is a view ref = weakref.ref(self.blocks[0]) - return SingleBlockManager(block, self.axes[0], [ref]) + return SingleBlockManager(block, self.axes[0], [ref], parent=self) dtype = interleaved_dtype([blk.dtype for blk in self.blocks]) @@ -1119,7 +1134,7 @@ def fast_xs(self, loc: int) -> SingleBlockManager: block = new_block(result, placement=slice(0, len(result)), ndim=1) return SingleBlockManager(block, self.axes[0]) - def iget(self, i: int) -> SingleBlockManager: + def iget(self, i: int, track_ref: bool = True) -> SingleBlockManager: """ Return the data as a SingleBlockManager. """ @@ -1129,7 +1144,9 @@ def iget(self, i: int) -> SingleBlockManager: # shortcut for select a single-dim from a 2-dim BM bp = BlockPlacement(slice(0, len(values))) nb = type(block)(values, placement=bp, ndim=1) - return SingleBlockManager(nb, self.axes[1], [weakref.ref(block)]) + ref = weakref.ref(block) if track_ref else None + parent = self if track_ref else None + return SingleBlockManager(nb, self.axes[1], [ref], parent) def iget_values(self, i: int) -> ArrayLike: """ @@ -1371,7 +1388,9 @@ def column_setitem(self, loc: int, idx: int | slice | np.ndarray, value) -> None self.blocks = tuple(blocks) self._clear_reference_block(blkno) - col_mgr = self.iget(loc) + # this manager is only created temporarily to mutate the values in place + # so don't track references, otherwise the `setitem` would perform CoW again + col_mgr = self.iget(loc, track_ref=False) new_mgr = col_mgr.setitem((idx,), value) self.iset(loc, new_mgr._block.values, inplace=True) @@ -1469,7 +1488,9 @@ def idelete(self, indexer) -> BlockManager: nbs, new_refs = self._slice_take_blocks_ax0(taker, only_slice=True) new_columns = self.items[~is_deleted] axes = [new_columns, self.axes[1]] - return type(self)(tuple(nbs), axes, new_refs, verify_integrity=False) + # TODO this might not be needed (can a delete ever be done in chained manner?) + parent = None if com.all_none(*new_refs) else self + return type(self)(tuple(nbs), axes, new_refs, parent, verify_integrity=False) # ---------------------------------------------------------------- # Block-wise Operation @@ -1875,6 +1896,7 @@ def __init__( block: Block, axis: Index, refs: list[weakref.ref | None] | None = None, + parent: object = None, verify_integrity: bool = False, fastpath=lib.no_default, ) -> None: @@ -1893,6 +1915,7 @@ def __init__( self.axes = [axis] self.blocks = (block,) self.refs = refs + self.parent = parent if _using_copy_on_write() else None @classmethod def from_blocks( @@ -1900,6 +1923,7 @@ def from_blocks( blocks: list[Block], axes: list[Index], refs: list[weakref.ref | None] | None = None, + parent: object = None, ) -> SingleBlockManager: """ Constructor for BlockManager and SingleBlockManager with same signature. @@ -1908,7 +1932,7 @@ def from_blocks( assert len(axes) == 1 if refs is not None: assert len(refs) == 1 - return cls(blocks[0], axes[0], refs, verify_integrity=False) + return cls(blocks[0], axes[0], refs, parent, verify_integrity=False) @classmethod def from_array(cls, array: ArrayLike, index: Index) -> SingleBlockManager: @@ -1928,7 +1952,10 @@ def to_2d_mgr(self, columns: Index) -> BlockManager: new_blk = type(blk)(arr, placement=bp, ndim=2) axes = [columns, self.axes[0]] refs: list[weakref.ref | None] = [weakref.ref(blk)] - return BlockManager([new_blk], axes=axes, refs=refs, verify_integrity=False) + parent = self if _using_copy_on_write() else None + return BlockManager( + [new_blk], axes=axes, refs=refs, parent=parent, verify_integrity=False + ) def _has_no_reference(self, i: int = 0) -> bool: """ @@ -2010,7 +2037,7 @@ def getitem_mgr(self, indexer: slice | npt.NDArray[np.bool_]) -> SingleBlockMana new_idx = self.index[indexer] # TODO(CoW) in theory only need to track reference if new_array is a view ref = weakref.ref(blk) - return type(self)(block, new_idx, [ref]) + return type(self)(block, new_idx, [ref], parent=self) def get_slice(self, slobj: slice, axis: AxisInt = 0) -> SingleBlockManager: # Assertion disabled for performance @@ -2023,7 +2050,9 @@ def get_slice(self, slobj: slice, axis: AxisInt = 0) -> SingleBlockManager: bp = BlockPlacement(slice(0, len(array))) block = type(blk)(array, placement=bp, ndim=1) new_index = self.index._getitem_slice(slobj) - return type(self)(block, new_index, [weakref.ref(blk)]) + # TODO this method is only used in groupby SeriesSplitter at the moment, + # so passing refs / parent is not yet covered by the tests + return type(self)(block, new_index, [weakref.ref(blk)], parent=self) @property def index(self) -> Index: @@ -2070,6 +2099,7 @@ def setitem_inplace(self, indexer, value) -> None: if _using_copy_on_write() and not self._has_no_reference(0): self.blocks = (self._block.copy(),) self.refs = None + self.parent = None self._cache.clear() super().setitem_inplace(indexer, value) @@ -2086,6 +2116,7 @@ def idelete(self, indexer) -> SingleBlockManager: self._cache.clear() # clear reference since delete always results in a new array self.refs = None + self.parent = None return self def fast_xs(self, loc): diff --git a/pandas/tests/copy_view/test_indexing.py b/pandas/tests/copy_view/test_indexing.py index 6abd54db3bf4a..b8028fd28f8f8 100644 --- a/pandas/tests/copy_view/test_indexing.py +++ b/pandas/tests/copy_view/test_indexing.py @@ -460,6 +460,158 @@ def test_subset_set_with_column_indexer( tm.assert_frame_equal(df, df_orig) +@pytest.mark.parametrize( + "method", + [ + lambda df: df[["a", "b"]][0:2], + lambda df: df[0:2][["a", "b"]], + lambda df: df[["a", "b"]].iloc[0:2], + lambda df: df[["a", "b"]].loc[0:1], + lambda df: df[0:2].iloc[:, 0:2], + lambda df: df[0:2].loc[:, "a":"b"], # type: ignore[misc] + ], + ids=[ + "row-getitem-slice", + "column-getitem", + "row-iloc-slice", + "row-loc-slice", + "column-iloc-slice", + "column-loc-slice", + ], +) +@pytest.mark.parametrize( + "dtype", ["int64", "float64"], ids=["single-block", "mixed-block"] +) +def test_subset_chained_getitem( + request, method, dtype, using_copy_on_write, using_array_manager +): + # Case: creating a subset using multiple, chained getitem calls using views + # still needs to guarantee proper CoW behaviour + df = DataFrame( + {"a": [1, 2, 3], "b": [4, 5, 6], "c": np.array([7, 8, 9], dtype=dtype)} + ) + df_orig = df.copy() + + # when not using CoW, it depends on whether we have a single block or not + # and whether we are slicing the columns -> in that case we have a view + subset_is_view = request.node.callspec.id in ( + "single-block-column-iloc-slice", + "single-block-column-loc-slice", + ) or ( + request.node.callspec.id + in ("mixed-block-column-iloc-slice", "mixed-block-column-loc-slice") + and using_array_manager + ) + + # modify subset -> don't modify parent + subset = method(df) + subset.iloc[0, 0] = 0 + if using_copy_on_write or (not subset_is_view): + tm.assert_frame_equal(df, df_orig) + else: + assert df.iloc[0, 0] == 0 + + # modify parent -> don't modify subset + subset = method(df) + df.iloc[0, 0] = 0 + expected = DataFrame({"a": [1, 2], "b": [4, 5]}) + if using_copy_on_write or not subset_is_view: + tm.assert_frame_equal(subset, expected) + else: + assert subset.iloc[0, 0] == 0 + + +@pytest.mark.parametrize( + "dtype", ["int64", "float64"], ids=["single-block", "mixed-block"] +) +def test_subset_chained_getitem_column(dtype, using_copy_on_write): + # Case: creating a subset using multiple, chained getitem calls using views + # still needs to guarantee proper CoW behaviour + df = DataFrame( + {"a": [1, 2, 3], "b": [4, 5, 6], "c": np.array([7, 8, 9], dtype=dtype)} + ) + df_orig = df.copy() + + # modify subset -> don't modify parent + subset = df[:]["a"][0:2] + df._clear_item_cache() + subset.iloc[0] = 0 + if using_copy_on_write: + tm.assert_frame_equal(df, df_orig) + else: + assert df.iloc[0, 0] == 0 + + # modify parent -> don't modify subset + subset = df[:]["a"][0:2] + df._clear_item_cache() + df.iloc[0, 0] = 0 + expected = Series([1, 2], name="a") + if using_copy_on_write: + tm.assert_series_equal(subset, expected) + else: + assert subset.iloc[0] == 0 + + +@pytest.mark.parametrize( + "method", + [ + lambda s: s["a":"c"]["a":"b"], # type: ignore[misc] + lambda s: s.iloc[0:3].iloc[0:2], + lambda s: s.loc["a":"c"].loc["a":"b"], # type: ignore[misc] + lambda s: s.loc["a":"c"] # type: ignore[misc] + .iloc[0:3] + .iloc[0:2] + .loc["a":"b"] # type: ignore[misc] + .iloc[0:1], + ], + ids=["getitem", "iloc", "loc", "long-chain"], +) +def test_subset_chained_getitem_series(method, using_copy_on_write): + # Case: creating a subset using multiple, chained getitem calls using views + # still needs to guarantee proper CoW behaviour + s = Series([1, 2, 3], index=["a", "b", "c"]) + s_orig = s.copy() + + # modify subset -> don't modify parent + subset = method(s) + subset.iloc[0] = 0 + if using_copy_on_write: + tm.assert_series_equal(s, s_orig) + else: + assert s.iloc[0] == 0 + + # modify parent -> don't modify subset + subset = s.iloc[0:3].iloc[0:2] + s.iloc[0] = 0 + expected = Series([1, 2], index=["a", "b"]) + if using_copy_on_write: + tm.assert_series_equal(subset, expected) + else: + assert subset.iloc[0] == 0 + + +def test_subset_chained_single_block_row(using_copy_on_write, using_array_manager): + df = DataFrame({"a": [1, 2, 3], "b": [4, 5, 6], "c": [7, 8, 9]}) + df_orig = df.copy() + + # modify subset -> don't modify parent + subset = df[:].iloc[0].iloc[0:2] + subset.iloc[0] = 0 + if using_copy_on_write or using_array_manager: + tm.assert_frame_equal(df, df_orig) + else: + assert df.iloc[0, 0] == 0 + + # modify parent -> don't modify subset + subset = df[:].iloc[0].iloc[0:2] + df.iloc[0, 0] = 0 + expected = Series([1, 4], index=["a", "b"], name=0) + if using_copy_on_write or using_array_manager: + tm.assert_series_equal(subset, expected) + else: + assert subset.iloc[0] == 0 + + # TODO add more tests modifying the parent diff --git a/pandas/tests/copy_view/test_internals.py b/pandas/tests/copy_view/test_internals.py index 2191fc1b33218..edfa7f843f17f 100644 --- a/pandas/tests/copy_view/test_internals.py +++ b/pandas/tests/copy_view/test_internals.py @@ -1,4 +1,5 @@ import numpy as np +import pytest import pandas.util._test_decorators as td @@ -43,3 +44,21 @@ def test_consolidate(using_copy_on_write): subset.iloc[0, 1] = 0.0 assert df._mgr._has_no_reference(1) assert df.loc[0, "b"] == 0.1 + + +@td.skip_array_manager_invalid_test +def test_clear_parent(using_copy_on_write): + # ensure to clear parent reference if we are no longer viewing data from parent + if not using_copy_on_write: + pytest.skip("test only relevant when using copy-on-write") + + df = DataFrame({"a": [1, 2, 3], "b": [0.1, 0.2, 0.3]}) + subset = df[:] + assert subset._mgr.parent is not None + + # replacing existing columns loses the references to the parent df + subset["a"] = 0 + assert subset._mgr.parent is not None + # when losing the last reference, also the parent should be reset + subset["b"] = 0 + assert subset._mgr.parent is None diff --git a/pandas/tests/copy_view/test_methods.py b/pandas/tests/copy_view/test_methods.py index df723808ce06b..956e2cf98c9b6 100644 --- a/pandas/tests/copy_view/test_methods.py +++ b/pandas/tests/copy_view/test_methods.py @@ -1,4 +1,5 @@ import numpy as np +import pytest from pandas import ( DataFrame, @@ -156,7 +157,7 @@ def test_to_frame(using_copy_on_write): ser = Series([1, 2, 3]) ser_orig = ser.copy() - df = ser.to_frame() + df = ser[:].to_frame() # currently this always returns a "view" assert np.shares_memory(ser.values, get_array(df, 0)) @@ -169,5 +170,47 @@ def test_to_frame(using_copy_on_write): tm.assert_series_equal(ser, ser_orig) else: # but currently select_dtypes() actually returns a view -> mutates parent - ser_orig.iloc[0] = 0 - tm.assert_series_equal(ser, ser_orig) + expected = ser_orig.copy() + expected.iloc[0] = 0 + tm.assert_series_equal(ser, expected) + + # modify original series -> don't modify dataframe + df = ser[:].to_frame() + ser.iloc[0] = 0 + + if using_copy_on_write: + tm.assert_frame_equal(df, ser_orig.to_frame()) + else: + expected = ser_orig.copy().to_frame() + expected.iloc[0, 0] = 0 + tm.assert_frame_equal(df, expected) + + +@pytest.mark.parametrize( + "method, idx", + [ + (lambda df: df.copy(deep=False).copy(deep=False), 0), + (lambda df: df.reset_index().reset_index(), 2), + (lambda df: df.rename(columns=str.upper).rename(columns=str.lower), 0), + (lambda df: df.copy(deep=False).select_dtypes(include="number"), 0), + ], + ids=["shallow-copy", "reset_index", "rename", "select_dtypes"], +) +def test_chained_methods(request, method, idx, using_copy_on_write): + df = DataFrame({"a": [1, 2, 3], "b": [4, 5, 6], "c": [0.1, 0.2, 0.3]}) + df_orig = df.copy() + + # when not using CoW, only the copy() variant actually gives a view + df2_is_view = not using_copy_on_write and request.node.callspec.id == "shallow-copy" + + # modify df2 -> don't modify df + df2 = method(df) + df2.iloc[0, idx] = 0 + if not df2_is_view: + tm.assert_frame_equal(df, df_orig) + + # modify df -> don't modify df2 + df2 = method(df) + df.iloc[0, 0] = 0 + if not df2_is_view: + tm.assert_frame_equal(df2.iloc[:, idx:], df_orig)