diff --git a/.github/workflows/ubuntu.yml b/.github/workflows/ubuntu.yml index c4398efb12c3df..b8268a82d9b701 100644 --- a/.github/workflows/ubuntu.yml +++ b/.github/workflows/ubuntu.yml @@ -52,6 +52,10 @@ jobs: extra_apt: "language-pack-zh-hans" lang: "zh_CN.utf8" lc_all: "zh_CN.utf8" + - name: "Copy-on-Write" + env_file: actions-310.yaml + pattern: "not slow and not network and not single_cpu" + pandas_copy_on_write: "1" - name: "Data Manager" env_file: actions-38.yaml pattern: "not slow and not network and not single_cpu" @@ -84,6 +88,7 @@ jobs: LC_ALL: ${{ matrix.lc_all || '' }} PANDAS_TESTING_MODE: ${{ matrix.pandas_testing_mode || '' }} PANDAS_DATA_MANAGER: ${{ matrix.pandas_data_manager || 'block' }} + PANDAS_COPY_ON_WRITE: ${{ matrix.pandas_copy_on_write || '0' }} TEST_ARGS: ${{ matrix.test_args || '' }} PYTEST_WORKERS: ${{ contains(matrix.pattern, 'not single_cpu') && 'auto' || '1' }} PYTEST_TARGET: ${{ matrix.pytest_target || 'pandas' }} diff --git a/pandas/_libs/internals.pyx b/pandas/_libs/internals.pyx index 159fdbc080fb4b..94ae4a021da4d5 100644 --- a/pandas/_libs/internals.pyx +++ b/pandas/_libs/internals.pyx @@ -1,4 +1,5 @@ from collections import defaultdict +import weakref cimport cython from cpython.slice cimport PySlice_GetIndicesEx @@ -674,8 +675,9 @@ cdef class BlockManager: public list axes public bint _known_consolidated, _is_consolidated public ndarray _blknos, _blklocs + public list refs - def __cinit__(self, blocks=None, axes=None, verify_integrity=True): + def __cinit__(self, blocks=None, axes=None, refs=None, verify_integrity=True): # None as defaults for unpickling GH#42345 if blocks is None: # This adds 1-2 microseconds to DataFrame(np.array([])) @@ -687,6 +689,7 @@ cdef class BlockManager: self.blocks = blocks self.axes = axes.copy() # copy to make sure we are not remotely-mutable + self.refs = refs # Populate known_consolidate, blknos, and blklocs lazily self._known_consolidated = False @@ -795,12 +798,14 @@ cdef class BlockManager: ndarray blknos, blklocs nbs = [] + nrefs = [] for blk in self.blocks: nb = blk.getitem_block_index(slobj) nbs.append(nb) + nrefs.append(weakref.ref(blk)) new_axes = [self.axes[0], self.axes[1]._getitem_slice(slobj)] - mgr = type(self)(tuple(nbs), new_axes, verify_integrity=False) + mgr = type(self)(tuple(nbs), new_axes, nrefs, verify_integrity=False) # We can avoid having to rebuild blklocs/blknos blklocs = self._blklocs @@ -813,7 +818,7 @@ cdef class BlockManager: def get_slice(self, slobj: slice, axis: int = 0) -> BlockManager: if axis == 0: - new_blocks = self._slice_take_blocks_ax0(slobj) + new_blocks, new_refs = self._slice_take_blocks_ax0(slobj) elif axis == 1: return self._get_index_slice(slobj) else: @@ -822,4 +827,4 @@ cdef class BlockManager: new_axes = list(self.axes) new_axes[axis] = new_axes[axis]._getitem_slice(slobj) - return type(self)(tuple(new_blocks), new_axes, verify_integrity=False) + return type(self)(tuple(new_blocks), new_axes, new_refs, verify_integrity=False) diff --git a/pandas/compat/pickle_compat.py b/pandas/compat/pickle_compat.py index c8db82500d0d6d..813e8de72f96ed 100644 --- a/pandas/compat/pickle_compat.py +++ b/pandas/compat/pickle_compat.py @@ -224,7 +224,7 @@ def load_newobj(self): arr = np.array([], dtype="m8[ns]") obj = cls.__new__(cls, arr, arr.dtype) elif cls is BlockManager and not args: - obj = cls.__new__(cls, (), [], False) + obj = cls.__new__(cls, (), [], None, False) else: obj = cls.__new__(cls, *args) diff --git a/pandas/conftest.py b/pandas/conftest.py index 5f7b6d509c233f..b6e1559b9f8cf0 100644 --- a/pandas/conftest.py +++ b/pandas/conftest.py @@ -29,10 +29,7 @@ from decimal import Decimal import operator import os -from typing import ( - Callable, - Literal, -) +from typing import Callable from dateutil.tz import ( tzlocal, @@ -1838,8 +1835,8 @@ def using_array_manager(): @pytest.fixture -def using_copy_on_write() -> Literal[False]: +def using_copy_on_write() -> bool: """ Fixture to check if Copy-on-Write is enabled. """ - return False + return pd.options.mode.copy_on_write and pd.options.mode.data_manager == "block" diff --git a/pandas/core/config_init.py b/pandas/core/config_init.py index 4434ed5a8b5f7e..2579f736a97030 100644 --- a/pandas/core/config_init.py +++ b/pandas/core/config_init.py @@ -540,6 +540,26 @@ def use_inf_as_na_cb(key) -> None: ) +# TODO better name? +copy_on_write_doc = """ +: bool + Use new copy-view behaviour using Copy-on-Write. Defaults to False, + unless overridden by the 'PANDAS_COPY_ON_WRITE' environment variable + (if set to "1" for True, needs to be set before pandas is imported). +""" + + +with cf.config_prefix("mode"): + cf.register_option( + "copy_on_write", + # Get the default from an environment variable, if set, otherwise defaults + # to False. This environment variable can be set for testing. + os.environ.get("PANDAS_COPY_ON_WRITE", "0") == "1", + copy_on_write_doc, + validator=is_bool, + ) + + # user warnings chained_assignment = """ : string diff --git a/pandas/core/frame.py b/pandas/core/frame.py index bcc7a2ae8f83f2..9042c163627264 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -3708,6 +3708,9 @@ def _get_column_array(self, i: int) -> ArrayLike: """ Get the values of the i'th column (ndarray or ExtensionArray, as stored in the Block) + + Warning! The returned array is a view but doesn't handle Copy-on-Write, + so this should be used with caution (for read-only purposes). """ return self._mgr.iget_values(i) @@ -3715,6 +3718,9 @@ def _iter_column_arrays(self) -> Iterator[ArrayLike]: """ Iterate over the arrays of all columns in order. This returns the values as stored in the Block (ndarray or ExtensionArray). + + Warning! The returned array is a view but doesn't handle Copy-on-Write, + so this should be used with caution (for read-only purposes). """ for i in range(len(self.columns)): yield self._get_column_array(i) @@ -5147,7 +5153,7 @@ def set_axis( "labels", [ ("method", None), - ("copy", True), + ("copy", None), ("level", None), ("fill_value", np.nan), ("limit", None), @@ -5372,7 +5378,7 @@ def rename( index: Renamer | None = ..., columns: Renamer | None = ..., axis: Axis | None = ..., - copy: bool = ..., + copy: bool | None = ..., inplace: Literal[True], level: Level = ..., errors: IgnoreRaise = ..., @@ -5387,7 +5393,7 @@ def rename( index: Renamer | None = ..., columns: Renamer | None = ..., axis: Axis | None = ..., - copy: bool = ..., + copy: bool | None = ..., inplace: Literal[False] = ..., level: Level = ..., errors: IgnoreRaise = ..., @@ -5402,7 +5408,7 @@ def rename( index: Renamer | None = ..., columns: Renamer | None = ..., axis: Axis | None = ..., - copy: bool = ..., + copy: bool | None = ..., inplace: bool = ..., level: Level = ..., errors: IgnoreRaise = ..., @@ -5416,7 +5422,7 @@ def rename( index: Renamer | None = None, columns: Renamer | None = None, axis: Axis | None = None, - copy: bool = True, + copy: bool | None = None, inplace: bool = False, level: Level = None, errors: IgnoreRaise = "ignore", @@ -6290,7 +6296,7 @@ class max type if inplace: new_obj = self else: - new_obj = self.copy() + new_obj = self.copy(deep=None) if allow_duplicates is not lib.no_default: allow_duplicates = validate_bool_kwarg(allow_duplicates, "allow_duplicates") diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 342e4eae09257b..8ed3ba02991bcf 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -1046,7 +1046,7 @@ def _rename( index: Renamer | None = None, columns: Renamer | None = None, axis: Axis | None = None, - copy: bool_t = True, + copy: bool_t | None = None, inplace: bool_t = False, level: Level | None = None, errors: str = "ignore", @@ -4161,6 +4161,12 @@ def _check_setitem_copy(self, t="setting", force=False): df.iloc[0:5]['group'] = 'a' """ + if ( + config.get_option("mode.copy_on_write") + and config.get_option("mode.data_manager") == "block" + ): + return + # return early if the check is not needed if not (force or self._is_copy): return @@ -5261,7 +5267,7 @@ def reindex(self: NDFrameT, *args, **kwargs) -> NDFrameT: axes, kwargs = self._construct_axes_from_arguments(args, kwargs) method = missing.clean_reindex_fill_method(kwargs.pop("method", None)) level = kwargs.pop("level", None) - copy = kwargs.pop("copy", True) + copy = kwargs.pop("copy", None) limit = kwargs.pop("limit", None) tolerance = kwargs.pop("tolerance", None) fill_value = kwargs.pop("fill_value", None) @@ -5286,9 +5292,7 @@ def reindex(self: NDFrameT, *args, **kwargs) -> NDFrameT: for axis, ax in axes.items() if ax is not None ): - if copy: - return self.copy() - return self + return self.copy(deep=copy) # check if we are a multi reindex if self._needs_reindex_multi(axes, method, level): @@ -6265,7 +6269,7 @@ def astype( return cast(NDFrameT, result) @final - def copy(self: NDFrameT, deep: bool_t = True) -> NDFrameT: + def copy(self: NDFrameT, deep: bool_t | None = True) -> NDFrameT: """ Make a copy of this object's indices and data. diff --git a/pandas/core/internals/array_manager.py b/pandas/core/internals/array_manager.py index 88f81064b826fe..dcf69dfda1ae83 100644 --- a/pandas/core/internals/array_manager.py +++ b/pandas/core/internals/array_manager.py @@ -527,6 +527,11 @@ def copy(self: T, deep=True) -> T: ------- BlockManager """ + if deep is None: + # ArrayManager does not yet support CoW, so deep=None always means + # deep=True for now + deep = True + # this preserves the notion of view copying of axes if deep: # hit in e.g. tests.io.json.test_pandas @@ -591,6 +596,11 @@ def _reindex_indexer( pandas-indexer with -1's only. """ + if copy is None: + # ArrayManager does not yet support CoW, so deep=None always means + # deep=True for now + copy = True + if indexer is None: if new_axis is self._axes[axis] and not copy: return self diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index 69089cc64e6713..010358d3a21ec6 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -839,10 +839,13 @@ def _slice( return self.values[slicer] - def set_inplace(self, locs, values: ArrayLike) -> None: + def set_inplace(self, locs, values: ArrayLike, copy: bool = False) -> None: """ Modify block values in-place with new item value. + If copy=True, first copy the underlying values in place before modifying + (for Copy-on-Write). + Notes ----- `set_inplace` never creates a new array or new Block, whereas `setitem` @@ -850,6 +853,8 @@ def set_inplace(self, locs, values: ArrayLike) -> None: Caller is responsible for checking values.dtype == self.dtype. """ + if copy: + self.values = self.values.copy() self.values[locs] = values def take_nd( @@ -1665,9 +1670,11 @@ def iget(self, i: int | tuple[int, int] | tuple[slice, int]): raise IndexError(f"{self} only contains one item") return self.values - def set_inplace(self, locs, values: ArrayLike) -> None: + def set_inplace(self, locs, values: ArrayLike, copy: bool = False) -> None: # When an ndarray, we should have locs.tolist() == [0] # When a BlockPlacement we should have list(locs) == [0] + if copy: + self.values = self.values.copy() self.values[:] = values def _maybe_squeeze_arg(self, arg): diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py index 4e84b013b2a113..3084bcea49f05c 100644 --- a/pandas/core/internals/managers.py +++ b/pandas/core/internals/managers.py @@ -12,9 +12,12 @@ cast, ) import warnings +import weakref import numpy as np +from pandas._config import get_option + from pandas._libs import ( algos as libalgos, internals as libinternals, @@ -143,6 +146,7 @@ class BaseBlockManager(DataManager): _blklocs: npt.NDArray[np.intp] blocks: tuple[Block, ...] axes: list[Index] + refs: list[weakref.ref | None] | None @property def ndim(self) -> int: @@ -151,11 +155,16 @@ def ndim(self) -> int: _known_consolidated: bool _is_consolidated: bool - def __init__(self, blocks, axes, verify_integrity: bool = True) -> None: + def __init__(self, blocks, axes, refs=None, verify_integrity: bool = True) -> None: raise NotImplementedError @classmethod - def from_blocks(cls: type_t[T], blocks: list[Block], axes: list[Index]) -> T: + def from_blocks( + cls: type_t[T], + blocks: list[Block], + axes: list[Index], + refs: list[weakref.ref | None] | None = None, + ) -> T: raise NotImplementedError @property @@ -228,6 +237,33 @@ def is_single_block(self) -> bool: def items(self) -> Index: return self.axes[0] + def _has_no_reference(self, i: int) -> bool: + """ + Check for column `i` if it has references. + (whether it references another array or is itself being referenced) + Returns True if the column has no references. + """ + blkno = self.blknos[i] + return self._has_no_reference_block(blkno) + + def _has_no_reference_block(self, blkno: int) -> bool: + """ + Check for block `i` if it has references. + (whether it references another array or is itself being referenced) + Returns True if the block has no references. + """ + # TODO(CoW) include `or self.refs[blkno]() is None` ? + return ( + self.refs is None or self.refs[blkno] is None + ) and weakref.getweakrefcount(self.blocks[blkno]) == 0 + + def _clear_reference_block(self, blkno: int) -> None: + """ + Clear any reference for column `i`. + """ + if self.refs is not None: + self.refs[blkno] = None + def get_dtypes(self): dtypes = np.array([blk.dtype for blk in self.blocks]) return dtypes.take(self.blknos) @@ -240,6 +276,9 @@ def arrays(self) -> list[ArrayLike]: Only for compatibility with ArrayManager for testing convenience. Not to be used in actual code, and return value is not the same as the ArrayManager method (list of 1D arrays vs iterator of 2D ndarrays / 1D EAs). + + Warning! The returned arrays don't handle Copy-on-Write, so this should + be used with caution (only in read-mode). """ return [blk.values for blk in self.blocks] @@ -342,9 +381,23 @@ def setitem(self: T, indexer, value) -> T: if isinstance(indexer, np.ndarray) and indexer.ndim > self.ndim: raise ValueError(f"Cannot set values with ndim > {self.ndim}") + if _using_copy_on_write() and not self._has_no_reference(0): + # if being referenced -> perform Copy-on-Write and clear the reference + # this method is only called if there is a single block -> hardcoded 0 + self = self.copy() + return self.apply("setitem", indexer=indexer, value=value) def putmask(self, mask, new, align: bool = True): + if ( + _using_copy_on_write() + and self.refs is not None + and not all(ref is None for ref in self.refs) + ): + # some reference -> copy full dataframe + # TODO(CoW) this could be optimized to only copy the blocks that would + # get modified + self = self.copy() if align: align_keys = ["new", "mask"] @@ -378,6 +431,12 @@ def fillna(self: T, value, limit, inplace: bool, downcast) -> T: if limit is not None: # Do this validation even if we go through one of the no-op paths limit = libalgos.validate_limit(None, limit=limit) + if inplace: + # TODO(CoW) can be optimized to only copy those blocks that have refs + if _using_copy_on_write() and any( + not self._has_no_reference_block(i) for i in range(len(self.blocks)) + ): + self = self.copy() return self.apply( "fillna", value=value, limit=limit, inplace=inplace, downcast=downcast @@ -527,17 +586,24 @@ def _combine( inv_indexer = lib.get_reverse_indexer(indexer, self.shape[0]) new_blocks: list[Block] = [] + # TODO(CoW) we could optimize here if we know that the passed blocks + # are fully "owned" (eg created from an operation, not coming from + # an existing manager) + new_refs: list[weakref.ref | None] | None = None if copy else [] for b in blocks: - b = b.copy(deep=copy) - b.mgr_locs = BlockPlacement(inv_indexer[b.mgr_locs.indexer]) - new_blocks.append(b) + nb = b.copy(deep=copy) + nb.mgr_locs = BlockPlacement(inv_indexer[nb.mgr_locs.indexer]) + new_blocks.append(nb) + if not copy: + # None has no attribute "append" + new_refs.append(weakref.ref(b)) # type: ignore[union-attr] axes = list(self.axes) if index is not None: axes[-1] = index axes[0] = self.items.take(indexer) - return type(self).from_blocks(new_blocks, axes) + return type(self).from_blocks(new_blocks, axes, new_refs) @property def nblocks(self) -> int: @@ -549,14 +615,22 @@ def copy(self: T, deep=True) -> T: Parameters ---------- - deep : bool or string, default True - If False, return shallow copy (do not copy data) + deep : bool, string or None, default True + If False or None, return a shallow copy (do not copy data) If 'all', copy data and a deep copy of the index Returns ------- BlockManager """ + if deep is None: + if _using_copy_on_write(): + # use shallow copy + deep = False + else: + # preserve deep copy for BlockManager with copy=None + deep = True + # this preserves the notion of view copying of axes if deep: # hit in e.g. tests.io.json.test_pandas @@ -569,8 +643,14 @@ def copy_func(ax): new_axes = list(self.axes) res = self.apply("copy", deep=deep) + new_refs: list[weakref.ref | None] | None + if deep: + new_refs = None + else: + new_refs = [weakref.ref(blk) for blk in self.blocks] res.axes = new_axes + res.refs = new_refs if self.ndim > 1: # Avoid needing to re-compute these @@ -594,7 +674,7 @@ def consolidate(self: T) -> T: if self.is_consolidated(): return self - bm = type(self)(self.blocks, self.axes, verify_integrity=False) + bm = type(self)(self.blocks, self.axes, self.refs, verify_integrity=False) bm._is_consolidated = False bm._consolidate_inplace() return bm @@ -606,7 +686,7 @@ def reindex_indexer( axis: int, fill_value=None, allow_dups: bool = False, - copy: bool = True, + copy: bool | None = True, only_slice: bool = False, *, use_na_proxy: bool = False, @@ -619,7 +699,8 @@ def reindex_indexer( axis : int fill_value : object, default None allow_dups : bool, default False - copy : bool, default True + copy : bool or None, default True + If None, regard as False to get shallow copy. only_slice : bool, default False Whether to take views, not copies, along columns. use_na_proxy : bool, default False @@ -627,6 +708,14 @@ def reindex_indexer( pandas-indexer with -1's only. """ + if copy is None: + if _using_copy_on_write(): + # use shallow copy + copy = False + else: + # preserve deep copy for BlockManager with copy=None + copy = True + if indexer is None: if new_axis is self.axes[axis] and not copy: return self @@ -644,7 +733,7 @@ def reindex_indexer( raise IndexError("Requested axis not found in manager") if axis == 0: - new_blocks = self._slice_take_blocks_ax0( + new_blocks, new_refs = self._slice_take_blocks_ax0( indexer, fill_value=fill_value, only_slice=only_slice, @@ -661,11 +750,12 @@ def reindex_indexer( ) for blk in self.blocks ] + new_refs = None new_axes = list(self.axes) new_axes[axis] = new_axis - new_mgr = type(self).from_blocks(new_blocks, new_axes) + new_mgr = type(self).from_blocks(new_blocks, new_axes, new_refs) if axis == 1: # We can avoid the need to rebuild these new_mgr._blknos = self.blknos.copy() @@ -679,7 +769,7 @@ def _slice_take_blocks_ax0( only_slice: bool = False, *, use_na_proxy: bool = False, - ) -> list[Block]: + ) -> tuple[list[Block], list[weakref.ref | None]]: """ Slice/take blocks along axis=0. @@ -712,9 +802,11 @@ def _slice_take_blocks_ax0( # GH#32959 EABlock would fail since we can't make 0-width # TODO(EA2D): special casing unnecessary with 2D EAs if sllen == 0: - return [] + return [], [] bp = BlockPlacement(slice(0, sllen)) - return [blk.getitem_block_columns(slobj, new_mgr_locs=bp)] + return [blk.getitem_block_columns(slobj, new_mgr_locs=bp)], [ + weakref.ref(blk) + ] elif not allow_fill or self.ndim == 1: if allow_fill and fill_value is None: fill_value = blk.fill_value @@ -730,7 +822,7 @@ def _slice_take_blocks_ax0( ] # We have # all(np.shares_memory(nb.values, blk.values) for nb in blocks) - return blocks + return blocks, [weakref.ref(blk)] * len(blocks) else: bp = BlockPlacement(slice(0, sllen)) return [ @@ -740,7 +832,7 @@ def _slice_take_blocks_ax0( new_mgr_locs=bp, fill_value=fill_value, ) - ] + ], [None] if sl_type == "slice": blknos = self.blknos[slobj] @@ -756,6 +848,7 @@ def _slice_take_blocks_ax0( # When filling blknos, make sure blknos is updated before appending to # blocks list, that way new blkno is exactly len(blocks). blocks = [] + refs: list[weakref.ref | None] = [] group = not only_slice for blkno, mgr_locs in libinternals.get_blkno_placements(blknos, group=group): if blkno == -1: @@ -768,6 +861,7 @@ def _slice_take_blocks_ax0( use_na_proxy=use_na_proxy, ) ) + refs.append(None) else: blk = self.blocks[blkno] @@ -781,18 +875,20 @@ def _slice_take_blocks_ax0( newblk = blk.copy(deep=False) newblk.mgr_locs = BlockPlacement(slice(mgr_loc, mgr_loc + 1)) blocks.append(newblk) + refs.append(weakref.ref(blk)) else: # GH#32779 to avoid the performance penalty of copying, # we may try to only slice taker = blklocs[mgr_locs.indexer] max_len = max(len(mgr_locs), taker.max() + 1) - if only_slice: + if only_slice or _using_copy_on_write(): taker = lib.maybe_indices_to_slice(taker, max_len) if isinstance(taker, slice): nb = blk.getitem_block_columns(taker, new_mgr_locs=mgr_locs) blocks.append(nb) + refs.append(weakref.ref(blk)) elif only_slice: # GH#33597 slice instead of take, so we get # views instead of copies @@ -802,11 +898,13 @@ def _slice_take_blocks_ax0( nb = blk.getitem_block_columns(slc, new_mgr_locs=bp) # We have np.shares_memory(nb.values, blk.values) blocks.append(nb) + refs.append(weakref.ref(blk)) else: nb = blk.take_nd(taker, axis=0, new_mgr_locs=mgr_locs) blocks.append(nb) + refs.append(None) - return blocks + return blocks, refs def _make_na_block( self, placement: BlockPlacement, fill_value=None, use_na_proxy: bool = False @@ -873,6 +971,7 @@ def take( indexer=indexer, axis=axis, allow_dups=True, + copy=None, ) @@ -890,6 +989,7 @@ def __init__( self, blocks: Sequence[Block], axes: Sequence[Index], + refs: list[weakref.ref | None] | None = None, verify_integrity: bool = True, ) -> None: @@ -939,13 +1039,26 @@ def _verify_integrity(self) -> None: f"block items\n# manager items: {len(self.items)}, # " f"tot_items: {tot_items}" ) + if self.refs is not None: + if len(self.refs) != len(self.blocks): + raise AssertionError( + "Number of passed refs must equal the number of blocks: " + f"{len(self.refs)} refs vs {len(self.blocks)} blocks." + "\nIf you see this error, please report a bug at " + "https://github.com/pandas-dev/pandas/issues" + ) @classmethod - def from_blocks(cls, blocks: list[Block], axes: list[Index]) -> BlockManager: + def from_blocks( + cls, + blocks: list[Block], + axes: list[Index], + refs: list[weakref.ref | None] | None = None, + ) -> BlockManager: """ Constructor for BlockManager and SingleBlockManager with same signature. """ - return cls(blocks, axes, verify_integrity=False) + return cls(blocks, axes, refs, verify_integrity=False) # ---------------------------------------------------------------- # Indexing @@ -965,7 +1078,9 @@ def fast_xs(self, loc: int) -> SingleBlockManager: if len(self.blocks) == 1: result = self.blocks[0].iget((slice(None), loc)) block = new_block(result, placement=slice(0, len(result)), ndim=1) - return SingleBlockManager(block, self.axes[0]) + # in the case of a single block, the new block is a view + ref = weakref.ref(self.blocks[0]) + return SingleBlockManager(block, self.axes[0], [ref]) dtype = interleaved_dtype([blk.dtype for blk in self.blocks]) @@ -996,12 +1111,16 @@ def iget(self, i: int) -> SingleBlockManager: # shortcut for select a single-dim from a 2-dim BM bp = BlockPlacement(slice(0, len(values))) nb = type(block)(values, placement=bp, ndim=1) - return SingleBlockManager(nb, self.axes[1]) + return SingleBlockManager(nb, self.axes[1], [weakref.ref(block)]) def iget_values(self, i: int) -> ArrayLike: """ Return the data for column i as the values (ndarray or ExtensionArray). + + Warning! The returned array is a view but doesn't handle Copy-on-Write, + so this should be used with caution. """ + # TODO(CoW) making the arrays read-only might make this safer to use? block = self.blocks[self.blknos[i]] values = block.iget(self.blklocs[i]) return values @@ -1011,6 +1130,9 @@ def column_arrays(self) -> list[np.ndarray]: """ Used in the JSON C code to access column arrays. This optimizes compared to using `iget_values` by converting each + + Warning! This doesn't handle Copy-on-Write, so should be used with + caution (current use case of consuming this in the JSON code is fine). """ # This is an optimized equivalent to # result = [self.iget_values(i) for i in range(len(self.items))] @@ -1102,7 +1224,12 @@ def value_getitem(placement): blk = self.blocks[blkno_l] blk_locs = blklocs[val_locs.indexer] if inplace and blk.should_store(value): - blk.set_inplace(blk_locs, value_getitem(val_locs)) + # Updating inplace -> check if we need to do Copy-on-Write + if _using_copy_on_write() and not self._has_no_reference_block(blkno_l): + blk.set_inplace(blk_locs, value_getitem(val_locs), copy=True) + self._clear_reference_block(blkno_l) + else: + blk.set_inplace(blk_locs, value_getitem(val_locs)) else: unfit_mgr_locs.append(blk.mgr_locs.as_array[blk_locs]) unfit_val_locs.append(val_locs) @@ -1117,9 +1244,11 @@ def value_getitem(placement): ) self.blocks = blocks_tup self._blklocs[nb.mgr_locs.indexer] = np.arange(len(nb)) + # blk.delete gives a copy, so we can remove a possible reference + self._clear_reference_block(blkno_l) if len(removed_blknos): - # Remove blocks & update blknos accordingly + # Remove blocks & update blknos and refs accordingly is_deleted = np.zeros(self.nblocks, dtype=np.bool_) is_deleted[removed_blknos] = True @@ -1130,6 +1259,12 @@ def value_getitem(placement): self.blocks = tuple( blk for i, blk in enumerate(self.blocks) if i not in set(removed_blknos) ) + if self.refs is not None: + self.refs = [ + ref + for i, ref in enumerate(self.refs) + if i not in set(removed_blknos) + ] if unfit_val_locs: unfit_idxr = np.concatenate(unfit_mgr_locs) @@ -1166,6 +1301,10 @@ def value_getitem(placement): self._blklocs[unfit_idxr] = np.arange(unfit_count) self.blocks += tuple(new_blocks) + # TODO(CoW) is this always correct to assume that the new_blocks + # are not referencing anything else? + if self.refs is not None: + self.refs = list(self.refs) + [None] * len(new_blocks) # Newly created block's dtype may already be present. self._known_consolidated = False @@ -1183,14 +1322,20 @@ def _iset_single( # Caller is responsible for verifying value.shape if inplace and blk.should_store(value): + copy = False + if _using_copy_on_write() and not self._has_no_reference_block(blkno): + # perform Copy-on-Write and clear the reference + copy = True + self._clear_reference_block(blkno) iloc = self.blklocs[loc] - blk.set_inplace(slice(iloc, iloc + 1), value) + blk.set_inplace(slice(iloc, iloc + 1), value, copy=copy) return nb = new_block_2d(value, placement=blk._mgr_locs) old_blocks = self.blocks new_blocks = old_blocks[:blkno] + (nb,) + old_blocks[blkno + 1 :] self.blocks = new_blocks + self._clear_reference_block(blkno) return def column_setitem(self, loc: int, idx: int | slice | np.ndarray, value) -> None: @@ -1200,6 +1345,14 @@ def column_setitem(self, loc: int, idx: int | slice | np.ndarray, value) -> None This is a method on the BlockManager level, to avoid creating an intermediate Series at the DataFrame level (`s = df[loc]; s[idx] = value`) """ + if _using_copy_on_write() and not self._has_no_reference(loc): + # otherwise perform Copy-on-Write and clear the reference + blkno = self.blknos[loc] + blocks = list(self.blocks) + blocks[blkno] = blocks[blkno].copy() + self.blocks = tuple(blocks) + self._clear_reference_block(blkno) + col_mgr = self.iget(loc) new_mgr = col_mgr.setitem((idx,), value) self.iset(loc, new_mgr._block.values, inplace=True) @@ -1239,6 +1392,9 @@ def insert(self, loc: int, item: Hashable, value: ArrayLike) -> None: self.axes[0] = new_axis self.blocks += (block,) + # TODO(CoW) do we always "own" the passed `value`? + if self.refs is not None: + self.refs += [None] self._known_consolidated = False @@ -1292,10 +1448,10 @@ def idelete(self, indexer) -> BlockManager: is_deleted[indexer] = True taker = (~is_deleted).nonzero()[0] - nbs = self._slice_take_blocks_ax0(taker, only_slice=True) + nbs, new_refs = self._slice_take_blocks_ax0(taker, only_slice=True) new_columns = self.items[~is_deleted] axes = [new_columns, self.axes[1]] - return type(self)(tuple(nbs), axes, verify_integrity=False) + return type(self)(tuple(nbs), axes, new_refs, verify_integrity=False) # ---------------------------------------------------------------- # Block-wise Operation @@ -1550,6 +1706,7 @@ def as_array( ------- arr : ndarray """ + # TODO(CoW) handle case where resulting array is a view if len(self.blocks) == 0: arr = np.empty(self.shape, dtype=float) return arr.transpose() @@ -1674,7 +1831,10 @@ def _consolidate_inplace(self) -> None: # the DataFrame's _item_cache. The exception is for newly-created # BlockManager objects not yet attached to a DataFrame. if not self.is_consolidated(): - self.blocks = tuple(_consolidate(self.blocks)) + if self.refs is None: + self.blocks = _consolidate(self.blocks) + else: + self.blocks, self.refs = _consolidate_with_refs(self.blocks, self.refs) self._is_consolidated = True self._known_consolidated = True self._rebuild_blknos_and_blklocs() @@ -1696,6 +1856,7 @@ def __init__( self, block: Block, axis: Index, + refs: list[weakref.ref | None] | None = None, verify_integrity: bool = False, fastpath=lib.no_default, ) -> None: @@ -1713,15 +1874,23 @@ def __init__( self.axes = [axis] self.blocks = (block,) + self.refs = refs @classmethod - def from_blocks(cls, blocks: list[Block], axes: list[Index]) -> SingleBlockManager: + def from_blocks( + cls, + blocks: list[Block], + axes: list[Index], + refs: list[weakref.ref | None] | None = None, + ) -> SingleBlockManager: """ Constructor for BlockManager and SingleBlockManager with same signature. """ assert len(blocks) == 1 assert len(axes) == 1 - return cls(blocks[0], axes[0], verify_integrity=False) + if refs is not None: + assert len(refs) == 1 + return cls(blocks[0], axes[0], refs, verify_integrity=False) @classmethod def from_array(cls, array: ArrayLike, index: Index) -> SingleBlockManager: @@ -1740,7 +1909,18 @@ def to_2d_mgr(self, columns: Index) -> BlockManager: bp = BlockPlacement(0) new_blk = type(blk)(arr, placement=bp, ndim=2) axes = [columns, self.axes[0]] - return BlockManager([new_blk], axes=axes, verify_integrity=False) + refs: list[weakref.ref | None] = [weakref.ref(blk)] + return BlockManager([new_blk], axes=axes, refs=refs, verify_integrity=False) + + def _has_no_reference(self, i: int = 0) -> bool: + """ + Check for column `i` if it has references. + (whether it references another array or is itself being referenced) + Returns True if the column has no references. + """ + return (self.refs is None or self.refs[0] is None) and weakref.getweakrefcount( + self.blocks[0] + ) == 0 def __getstate__(self): block_values = [b.values for b in self.blocks] @@ -1810,7 +1990,9 @@ def getitem_mgr(self, indexer: slice | npt.NDArray[np.bool_]) -> SingleBlockMana block = type(blk)(array, placement=bp, ndim=1) new_idx = self.index[indexer] - return type(self)(block, new_idx) + # TODO(CoW) in theory only need to track reference if new_array is a view + ref = weakref.ref(blk) + return type(self)(block, new_idx, [ref]) def get_slice(self, slobj: slice, axis: int = 0) -> SingleBlockManager: # Assertion disabled for performance @@ -1823,7 +2005,7 @@ def get_slice(self, slobj: slice, axis: int = 0) -> SingleBlockManager: bp = BlockPlacement(slice(0, len(array))) block = type(blk)(array, placement=bp, ndim=1) new_index = self.index._getitem_slice(slobj) - return type(self)(block, new_index) + return type(self)(block, new_index, [weakref.ref(blk)]) @property def index(self) -> Index: @@ -1850,15 +2032,30 @@ def array_values(self): def get_numeric_data(self, copy: bool = False): if self._block.is_numeric: - if copy: - return self.copy() - return self + return self.copy(deep=copy) return self.make_empty() @property def _can_hold_na(self) -> bool: return self._block._can_hold_na + def setitem_inplace(self, indexer, value) -> None: + """ + Set values with indexer. + + For Single[Block/Array]Manager, this backs s[indexer] = value + + This is an inplace version of `setitem()`, mutating the manager/values + in place, not returning a new Manager (and Block), and thus never changing + the dtype. + """ + if _using_copy_on_write() and not self._has_no_reference(0): + self.blocks = (self._block.copy(),) + self.refs = None + self._cache.clear() + + super().setitem_inplace(indexer, value) + def idelete(self, indexer) -> SingleBlockManager: """ Delete single location from SingleBlockManager. @@ -1869,6 +2066,8 @@ def idelete(self, indexer) -> SingleBlockManager: self.blocks = (nb,) self.axes[0] = self.axes[0].delete(indexer) self._cache.clear() + # clear reference since delete always results in a new array + self.refs = None return self def fast_xs(self, loc): @@ -1885,6 +2084,9 @@ def set_values(self, values: ArrayLike): Use at your own risk! This does not check if the passed values are valid for the current Block/SingleBlockManager (length, dtype, etc). """ + # TODO(CoW) do we need to handle copy on write here? Currently this is + # only used for FrameColumnApply.series_generator (what if apply is + # mutating inplace?) self.blocks[0].values = values self.blocks[0]._mgr_locs = BlockPlacement(slice(len(values))) @@ -2068,7 +2270,7 @@ def _stack_arrays(tuples, dtype: np.dtype): return stacked, placement -def _consolidate(blocks: tuple[Block, ...]) -> list[Block]: +def _consolidate(blocks: tuple[Block, ...]) -> tuple[Block, ...]: """ Merge blocks having same dtype, exclude non-consolidating blocks """ @@ -2078,19 +2280,44 @@ def _consolidate(blocks: tuple[Block, ...]) -> list[Block]: new_blocks: list[Block] = [] for (_can_consolidate, dtype), group_blocks in grouper: - merged_blocks = _merge_blocks( + merged_blocks, _ = _merge_blocks( + list(group_blocks), dtype=dtype, can_consolidate=_can_consolidate + ) + new_blocks = extend_blocks(merged_blocks, new_blocks) + return tuple(new_blocks) + + +def _consolidate_with_refs( + blocks: tuple[Block, ...], refs +) -> tuple[tuple[Block, ...], list[weakref.ref | None]]: + """ + Merge blocks having same dtype, exclude non-consolidating blocks, handling + refs + """ + gkey = lambda x: x[0]._consolidate_key + grouper = itertools.groupby(sorted(zip(blocks, refs), key=gkey), gkey) + + new_blocks: list[Block] = [] + new_refs: list[weakref.ref | None] = [] + for (_can_consolidate, dtype), group_blocks_refs in grouper: + group_blocks, group_refs = list(zip(*list(group_blocks_refs))) + merged_blocks, consolidated = _merge_blocks( list(group_blocks), dtype=dtype, can_consolidate=_can_consolidate ) new_blocks = extend_blocks(merged_blocks, new_blocks) - return new_blocks + if consolidated: + new_refs.extend([None]) + else: + new_refs.extend(group_refs) + return tuple(new_blocks), new_refs def _merge_blocks( blocks: list[Block], dtype: DtypeObj, can_consolidate: bool -) -> list[Block]: +) -> tuple[list[Block], bool]: if len(blocks) == 1: - return blocks + return blocks, False if can_consolidate: @@ -2116,10 +2343,10 @@ def _merge_blocks( new_mgr_locs = new_mgr_locs[argsort] bp = BlockPlacement(new_mgr_locs) - return [new_block_2d(new_values, placement=bp)] + return [new_block_2d(new_values, placement=bp)], True # can't consolidate --> no merge - return blocks + return blocks, False def _fast_count_smallints(arr: npt.NDArray[np.intp]): @@ -2152,3 +2379,7 @@ def _preprocess_slice_or_indexer( if not allow_fill: indexer = maybe_convert_indices(indexer, length) return "fancy", indexer, len(indexer) + + +def _using_copy_on_write(): + return get_option("mode.copy_on_write") diff --git a/pandas/core/internals/ops.py b/pandas/core/internals/ops.py index 1160d3b2a8e3a2..5febb302a9de9f 100644 --- a/pandas/core/internals/ops.py +++ b/pandas/core/internals/ops.py @@ -36,7 +36,7 @@ def _iter_block_pairs( left_ea = blk_vals.ndim == 1 - rblks = right._slice_take_blocks_ax0(locs.indexer, only_slice=True) + rblks, _ = right._slice_take_blocks_ax0(locs.indexer, only_slice=True) # Assertions are disabled for performance, but should hold: # if left_ea: diff --git a/pandas/core/series.py b/pandas/core/series.py index 13aa12287072ce..e80a798eb18984 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -1301,7 +1301,16 @@ def _maybe_update_cacher( # a copy if ref is None: del self._cacher - elif len(self) == len(ref) and self.name in ref.columns: + # for CoW, we never want to update the parent DataFrame cache + # if the Series changed, and always pop the cached item + elif ( + not ( + get_option("mode.copy_on_write") + and get_option("mode.data_manager") == "block" + ) + and len(self) == len(ref) + and self.name in ref.columns + ): # GH#42530 self.name must be in ref.columns # to ensure column still in dataframe # otherwise, either self or ref has swapped in new arrays diff --git a/pandas/io/stata.py b/pandas/io/stata.py index 22b8f9d020a5f8..a0b9a0d2475332 100644 --- a/pandas/io/stata.py +++ b/pandas/io/stata.py @@ -341,8 +341,10 @@ def convert_delta_safe(base, deltas, unit) -> Series: has_bad_values = False if bad_locs.any(): has_bad_values = True - data_col = Series(dates) - data_col[bad_locs] = 1.0 # Replace with NaT + # reset cache to avoid SettingWithCopy checks (we own the DataFrame and the + # `dates` Series is used to overwrite itself in the DataFramae) + dates._reset_cacher() + dates[bad_locs] = 1.0 # Replace with NaT dates = dates.astype(np.int64) if fmt.startswith(("%tc", "tc")): # Delta ms relative to base diff --git a/pandas/tests/apply/test_frame_apply.py b/pandas/tests/apply/test_frame_apply.py index ff3abaf8192064..d2882f46d25bf6 100644 --- a/pandas/tests/apply/test_frame_apply.py +++ b/pandas/tests/apply/test_frame_apply.py @@ -1440,7 +1440,7 @@ def test_apply_dtype(col): tm.assert_series_equal(result, expected) -def test_apply_mutating(using_array_manager): +def test_apply_mutating(using_array_manager, using_copy_on_write): # GH#35462 case where applied func pins a new BlockManager to a row df = DataFrame({"a": range(100), "b": range(100, 200)}) df_orig = df.copy() @@ -1457,12 +1457,13 @@ def func(row): result = df.apply(func, axis=1) tm.assert_frame_equal(result, expected) - if not using_array_manager: + if using_copy_on_write or using_array_manager: + # INFO(CoW) With copy on write, mutating a viewing row doesn't mutate the parent # INFO(ArrayManager) With BlockManager, the row is a view and mutated in place, # with ArrayManager the row is not a view, and thus not mutated in place - tm.assert_frame_equal(df, result) - else: tm.assert_frame_equal(df, df_orig) + else: + tm.assert_frame_equal(df, result) def test_apply_empty_list_reduce(): diff --git a/pandas/tests/copy_view/test_indexing.py b/pandas/tests/copy_view/test_indexing.py index 6eb7237c4f41c8..d917a3c79aa972 100644 --- a/pandas/tests/copy_view/test_indexing.py +++ b/pandas/tests/copy_view/test_indexing.py @@ -150,7 +150,7 @@ def test_subset_column_slice(using_copy_on_write, using_array_manager, dtype): ids=["slice", "mask", "array"], ) def test_subset_loc_rows_columns( - dtype, row_indexer, column_indexer, using_array_manager + dtype, row_indexer, column_indexer, using_array_manager, using_copy_on_write ): # Case: taking a subset of the rows+columns of a DataFrame using .loc # + afterwards modifying the subset @@ -177,7 +177,7 @@ def test_subset_loc_rows_columns( if ( isinstance(row_indexer, slice) and isinstance(column_indexer, slice) - and (using_array_manager or dtype == "int64") + and (using_array_manager or (dtype == "int64" and not using_copy_on_write)) ): df_orig.iloc[1, 1] = 0 tm.assert_frame_equal(df, df_orig) @@ -197,7 +197,7 @@ def test_subset_loc_rows_columns( ids=["slice", "mask", "array"], ) def test_subset_iloc_rows_columns( - dtype, row_indexer, column_indexer, using_array_manager + dtype, row_indexer, column_indexer, using_array_manager, using_copy_on_write ): # Case: taking a subset of the rows+columns of a DataFrame using .iloc # + afterwards modifying the subset @@ -224,7 +224,7 @@ def test_subset_iloc_rows_columns( if ( isinstance(row_indexer, slice) and isinstance(column_indexer, slice) - and (using_array_manager or dtype == "int64") + and (using_array_manager or (dtype == "int64" and not using_copy_on_write)) ): df_orig.iloc[1, 1] = 0 tm.assert_frame_equal(df, df_orig) diff --git a/pandas/tests/copy_view/test_internals.py b/pandas/tests/copy_view/test_internals.py new file mode 100644 index 00000000000000..2191fc1b332181 --- /dev/null +++ b/pandas/tests/copy_view/test_internals.py @@ -0,0 +1,45 @@ +import numpy as np + +import pandas.util._test_decorators as td + +from pandas import DataFrame +from pandas.tests.copy_view.util import get_array + + +@td.skip_array_manager_invalid_test +def test_consolidate(using_copy_on_write): + + # create unconsolidated DataFrame + df = DataFrame({"a": [1, 2, 3], "b": [0.1, 0.2, 0.3]}) + df["c"] = [4, 5, 6] + + # take a viewing subset + subset = df[:] + + # each block of subset references a block of df + assert subset._mgr.refs is not None and all( + ref is not None for ref in subset._mgr.refs + ) + + # consolidate the two int64 blocks + subset._consolidate_inplace() + + # the float64 block still references the parent one because it still a view + assert subset._mgr.refs[0] is not None + # equivalent of assert np.shares_memory(df["b"].values, subset["b"].values) + # but avoids caching df["b"] + assert np.shares_memory(get_array(df, "b"), get_array(subset, "b")) + + # the new consolidated int64 block does not reference another + assert subset._mgr.refs[1] is None + + # the parent dataframe now also only is linked for the float column + assert df._mgr._has_no_reference(0) + assert not df._mgr._has_no_reference(1) + assert df._mgr._has_no_reference(2) + + # and modifying subset still doesn't modify parent + if using_copy_on_write: + subset.iloc[0, 1] = 0.0 + assert df._mgr._has_no_reference(1) + assert df.loc[0, "b"] == 0.1 diff --git a/pandas/tests/copy_view/test_methods.py b/pandas/tests/copy_view/test_methods.py index 1ed458e95b78e2..cc4c219e6c5d99 100644 --- a/pandas/tests/copy_view/test_methods.py +++ b/pandas/tests/copy_view/test_methods.py @@ -1,6 +1,9 @@ import numpy as np -from pandas import DataFrame +from pandas import ( + DataFrame, + Series, +) import pandas._testing as tm from pandas.tests.copy_view.util import get_array @@ -126,3 +129,47 @@ def test_reindex_columns(using_copy_on_write): if using_copy_on_write: assert np.shares_memory(get_array(df2, "c"), get_array(df, "c")) tm.assert_frame_equal(df, df_orig) + + +def test_select_dtypes(using_copy_on_write): + # Case: selecting columns using `select_dtypes()` returns a new dataframe + # + afterwards modifying the result + df = DataFrame({"a": [1, 2, 3], "b": [4, 5, 6], "c": [0.1, 0.2, 0.3]}) + df_orig = df.copy() + df2 = df.select_dtypes("int64") + df2._mgr._verify_integrity() + + # currently this always returns a "view" + assert np.shares_memory(get_array(df2, "a"), get_array(df, "a")) + + # mutating df2 triggers a copy-on-write for that column/block + df2.iloc[0, 0] = 0 + if using_copy_on_write: + assert not np.shares_memory(get_array(df2, "a"), get_array(df, "a")) + tm.assert_frame_equal(df, df_orig) + else: + # but currently select_dtypes() actually returns a view -> mutates parent + df_orig.iloc[0, 0] = 0 + tm.assert_frame_equal(df, df_orig) + + +def test_to_frame(using_copy_on_write): + # Case: converting a Series to a DataFrame with to_frame + ser = Series([1, 2, 3]) + ser_orig = ser.copy() + + df = ser.to_frame() + + # currently this always returns a "view" + assert np.shares_memory(ser.values, get_array(df, 0)) + + df.iloc[0, 0] = 0 + + if using_copy_on_write: + # mutating df triggers a copy-on-write for that column + assert not np.shares_memory(ser.values, get_array(df, 0)) + tm.assert_series_equal(ser, ser_orig) + else: + # but currently select_dtypes() actually returns a view -> mutates parent + ser_orig.iloc[0] = 0 + tm.assert_series_equal(ser, ser_orig) diff --git a/pandas/tests/copy_view/test_setitem.py b/pandas/tests/copy_view/test_setitem.py index dd429831798066..9e0d350dde0dee 100644 --- a/pandas/tests/copy_view/test_setitem.py +++ b/pandas/tests/copy_view/test_setitem.py @@ -34,8 +34,9 @@ def test_set_column_with_series(using_copy_on_write): df["c"] = ser if using_copy_on_write: - # with CoW we can delay the copy - assert np.shares_memory(df["c"].values, ser.values) + # TODO(CoW) with CoW we can delay the copy + # assert np.shares_memory(df["c"].values, ser.values) + assert not np.shares_memory(df["c"].values, ser.values) else: # the series data is copied assert not np.shares_memory(df["c"].values, ser.values) @@ -78,8 +79,9 @@ def test_set_columns_with_dataframe(using_copy_on_write): df[["c", "d"]] = df2 if using_copy_on_write: - # with CoW we can delay the copy - assert np.shares_memory(df["c"].values, df2["c"].values) + # TODO(CoW) with CoW we can delay the copy + # assert np.shares_memory(df["c"].values, df2["c"].values) + assert not np.shares_memory(df["c"].values, df2["c"].values) else: # the data is copied assert not np.shares_memory(df["c"].values, df2["c"].values) diff --git a/pandas/tests/extension/base/setitem.py b/pandas/tests/extension/base/setitem.py index bb948f2281c64e..8dbf7d47374a67 100644 --- a/pandas/tests/extension/base/setitem.py +++ b/pandas/tests/extension/base/setitem.py @@ -390,6 +390,7 @@ def test_setitem_frame_2d_values(self, data): # Avoiding using_array_manager fixture # https://github.com/pandas-dev/pandas/pull/44514#discussion_r754002410 using_array_manager = isinstance(df._mgr, pd.core.internals.ArrayManager) + using_copy_on_write = pd.options.mode.copy_on_write blk_data = df._mgr.arrays[0] @@ -414,7 +415,7 @@ def test_setitem_frame_2d_values(self, data): with tm.assert_produces_warning(warn, match=msg): df.iloc[:] = df.values self.assert_frame_equal(df, orig) - if not using_array_manager: + if not using_array_manager and not using_copy_on_write: # GH#33457 Check that this setting occurred in-place # FIXME(ArrayManager): this should work there too assert df._mgr.arrays[0] is blk_data diff --git a/pandas/tests/frame/indexing/test_getitem.py b/pandas/tests/frame/indexing/test_getitem.py index 7994c56f8d68b6..a98fa52e1009dd 100644 --- a/pandas/tests/frame/indexing/test_getitem.py +++ b/pandas/tests/frame/indexing/test_getitem.py @@ -357,12 +357,18 @@ def test_getitem_empty_frame_with_boolean(self): df2 = df[df > 0] tm.assert_frame_equal(df, df2) - def test_getitem_returns_view_when_column_is_unique_in_df(self): + def test_getitem_returns_view_when_column_is_unique_in_df( + self, using_copy_on_write + ): # GH#45316 df = DataFrame([[1, 2, 3], [4, 5, 6]], columns=["a", "a", "b"]) + df_orig = df.copy() view = df["b"] view.loc[:] = 100 - expected = DataFrame([[1, 2, 100], [4, 5, 100]], columns=["a", "a", "b"]) + if using_copy_on_write: + expected = df_orig + else: + expected = DataFrame([[1, 2, 100], [4, 5, 100]], columns=["a", "a", "b"]) tm.assert_frame_equal(df, expected) def test_getitem_frozenset_unique_in_column(self): diff --git a/pandas/tests/frame/indexing/test_indexing.py b/pandas/tests/frame/indexing/test_indexing.py index 9027ce81098108..6eecf4c18f1820 100644 --- a/pandas/tests/frame/indexing/test_indexing.py +++ b/pandas/tests/frame/indexing/test_indexing.py @@ -269,7 +269,7 @@ def test_setattr_column(self): df.foobar = 5 assert (df.foobar == 5).all() - def test_setitem(self, float_frame): + def test_setitem(self, float_frame, using_copy_on_write): # not sure what else to do here series = float_frame["A"][::2] float_frame["col5"] = series @@ -305,8 +305,12 @@ def test_setitem(self, float_frame): smaller = float_frame[:2] msg = r"\nA value is trying to be set on a copy of a slice from a DataFrame" - with pytest.raises(SettingWithCopyError, match=msg): + if using_copy_on_write: + # With CoW, adding a new column doesn't raise a warning smaller["col10"] = ["1", "2"] + else: + with pytest.raises(SettingWithCopyError, match=msg): + smaller["col10"] = ["1", "2"] assert smaller["col10"].dtype == np.object_ assert (smaller["col10"] == ["1", "2"]).all() @@ -536,22 +540,29 @@ def test_getitem_setitem_integer_slice_keyerrors(self): df2.loc[3:11] = 0 @td.skip_array_manager_invalid_test # already covered in test_iloc_col_slice_view - def test_fancy_getitem_slice_mixed(self, float_frame, float_string_frame): + def test_fancy_getitem_slice_mixed( + self, float_frame, float_string_frame, using_copy_on_write + ): sliced = float_string_frame.iloc[:, -3:] assert sliced["D"].dtype == np.float64 # get view with single block # setting it triggers setting with copy + original = float_frame.copy() sliced = float_frame.iloc[:, -3:] assert np.shares_memory(sliced["C"]._values, float_frame["C"]._values) msg = r"\nA value is trying to be set on a copy of a slice from a DataFrame" - with pytest.raises(SettingWithCopyError, match=msg): - sliced.loc[:, "C"] = 4.0 + if not using_copy_on_write: + with pytest.raises(SettingWithCopyError, match=msg): + sliced.loc[:, "C"] = 4.0 - assert (float_frame["C"] == 4).all() + assert (float_frame["C"] == 4).all() + else: + sliced.loc[:, "C"] = 4.0 + tm.assert_frame_equal(float_frame, original) def test_getitem_setitem_non_ix_labels(self): df = tm.makeTimeDataFrame() @@ -994,7 +1005,7 @@ def test_iloc_row(self): expected = df.reindex(df.index[[1, 2, 4, 6]]) tm.assert_frame_equal(result, expected) - def test_iloc_row_slice_view(self, using_array_manager): + def test_iloc_row_slice_view(self, using_array_manager, using_copy_on_write): df = DataFrame(np.random.randn(10, 4), index=range(0, 20, 2)) original = df.copy() @@ -1004,14 +1015,17 @@ def test_iloc_row_slice_view(self, using_array_manager): assert np.shares_memory(df[2], subset[2]) + exp_col = original[2].copy() msg = r"\nA value is trying to be set on a copy of a slice from a DataFrame" - with pytest.raises(SettingWithCopyError, match=msg): + if using_copy_on_write: subset.loc[:, 2] = 0.0 + else: + with pytest.raises(SettingWithCopyError, match=msg): + subset.loc[:, 2] = 0.0 - exp_col = original[2].copy() - # TODO(ArrayManager) verify it is expected that the original didn't change - if not using_array_manager: - exp_col._values[4:8] = 0.0 + # TODO(ArrayManager) verify it is expected that the original didn't change + if not using_array_manager: + exp_col._values[4:8] = 0.0 tm.assert_series_equal(df[2], exp_col) def test_iloc_col(self): @@ -1036,14 +1050,13 @@ def test_iloc_col(self): expected = df.reindex(columns=df.columns[[1, 2, 4, 6]]) tm.assert_frame_equal(result, expected) - def test_iloc_col_slice_view(self, using_array_manager): + def test_iloc_col_slice_view(self, using_array_manager, using_copy_on_write): df = DataFrame(np.random.randn(4, 10), columns=range(0, 20, 2)) original = df.copy() subset = df.iloc[:, slice(4, 8)] - if not using_array_manager: + if not using_array_manager and not using_copy_on_write: # verify slice is view - assert np.shares_memory(df[8]._values, subset[8]._values) # and that we are setting a copy @@ -1053,7 +1066,9 @@ def test_iloc_col_slice_view(self, using_array_manager): assert (df[8] == 0).all() else: - # TODO(ArrayManager) verify this is the desired behaviour + if using_copy_on_write: + # verify slice is view + assert np.shares_memory(df[8]._values, subset[8]._values) subset[8] = 0.0 # subset changed assert (subset[8] == 0).all() diff --git a/pandas/tests/frame/indexing/test_setitem.py b/pandas/tests/frame/indexing/test_setitem.py index 53fcfe334b7707..6d2becd7a32d26 100644 --- a/pandas/tests/frame/indexing/test_setitem.py +++ b/pandas/tests/frame/indexing/test_setitem.py @@ -798,7 +798,7 @@ def test_setitem_object_array_of_tzaware_datetimes(self, idx, expected): class TestDataFrameSetItemWithExpansion: - def test_setitem_listlike_views(self): + def test_setitem_listlike_views(self, using_copy_on_write): # GH#38148 df = DataFrame({"a": [1, 2, 3], "b": [4, 4, 6]}) @@ -811,7 +811,10 @@ def test_setitem_listlike_views(self): # edit in place the first column to check view semantics df.iloc[0, 0] = 100 - expected = Series([100, 2, 3], name="a") + if using_copy_on_write: + expected = Series([1, 2, 3], name="a") + else: + expected = Series([100, 2, 3], name="a") tm.assert_series_equal(ser, expected) def test_setitem_string_column_numpy_dtype_raising(self): @@ -821,7 +824,7 @@ def test_setitem_string_column_numpy_dtype_raising(self): expected = DataFrame([[1, 2, 5], [3, 4, 6]], columns=[0, 1, "0 - Name"]) tm.assert_frame_equal(df, expected) - def test_setitem_empty_df_duplicate_columns(self): + def test_setitem_empty_df_duplicate_columns(self, using_copy_on_write): # GH#38521 df = DataFrame(columns=["a", "b", "b"], dtype="float64") df.loc[:, "a"] = list(range(2)) @@ -1131,7 +1134,9 @@ def test_setitem_always_copy(self, float_frame): assert notna(s[5:10]).all() @pytest.mark.parametrize("consolidate", [True, False]) - def test_setitem_partial_column_inplace(self, consolidate, using_array_manager): + def test_setitem_partial_column_inplace( + self, consolidate, using_array_manager, using_copy_on_write + ): # This setting should be in-place, regardless of whether frame is # single-block or multi-block # GH#304 this used to be incorrectly not-inplace, in which case @@ -1156,8 +1161,9 @@ def test_setitem_partial_column_inplace(self, consolidate, using_array_manager): tm.assert_series_equal(df["z"], expected) # check setting occurred in-place - tm.assert_numpy_array_equal(zvals, expected.values) - assert np.shares_memory(zvals, df["z"]._values) + if not using_copy_on_write: + tm.assert_numpy_array_equal(zvals, expected.values) + assert np.shares_memory(zvals, df["z"]._values) def test_setitem_duplicate_columns_not_inplace(self): # GH#39510 diff --git a/pandas/tests/frame/indexing/test_xs.py b/pandas/tests/frame/indexing/test_xs.py index 898722d6d77aeb..5951c1dd6e45e5 100644 --- a/pandas/tests/frame/indexing/test_xs.py +++ b/pandas/tests/frame/indexing/test_xs.py @@ -111,13 +111,17 @@ def test_xs_keep_level(self): result = df.xs([2008, "sat"], level=["year", "day"], drop_level=False) tm.assert_frame_equal(result, expected) - def test_xs_view(self, using_array_manager): + def test_xs_view(self, using_array_manager, using_copy_on_write): # in 0.14 this will return a view if possible a copy otherwise, but # this is numpy dependent dm = DataFrame(np.arange(20.0).reshape(4, 5), index=range(4), columns=range(5)) + df_orig = dm.copy() - if using_array_manager: + if using_copy_on_write: + dm.xs(2)[:] = 20 + tm.assert_frame_equal(dm, df_orig) + elif using_array_manager: # INFO(ArrayManager) with ArrayManager getting a row as a view is # not possible msg = r"\nA value is trying to be set on a copy of a slice from a DataFrame" @@ -176,27 +180,41 @@ def test_xs_level_eq_2(self): result = df.xs("c", level=2) tm.assert_frame_equal(result, expected) - def test_xs_setting_with_copy_error(self, multiindex_dataframe_random_data): + def test_xs_setting_with_copy_error( + self, multiindex_dataframe_random_data, using_copy_on_write + ): # this is a copy in 0.14 df = multiindex_dataframe_random_data + df_orig = df.copy() result = df.xs("two", level="second") - # setting this will give a SettingWithCopyError - # as we are trying to write a view - msg = "A value is trying to be set on a copy of a slice from a DataFrame" - with pytest.raises(SettingWithCopyError, match=msg): + if using_copy_on_write: result[:] = 10 + else: + # setting this will give a SettingWithCopyError + # as we are trying to write a view + msg = "A value is trying to be set on a copy of a slice from a DataFrame" + with pytest.raises(SettingWithCopyError, match=msg): + result[:] = 10 + tm.assert_frame_equal(df, df_orig) - def test_xs_setting_with_copy_error_multiple(self, four_level_index_dataframe): + def test_xs_setting_with_copy_error_multiple( + self, four_level_index_dataframe, using_copy_on_write + ): # this is a copy in 0.14 df = four_level_index_dataframe + df_orig = df.copy() result = df.xs(("a", 4), level=["one", "four"]) - # setting this will give a SettingWithCopyError - # as we are trying to write a view - msg = "A value is trying to be set on a copy of a slice from a DataFrame" - with pytest.raises(SettingWithCopyError, match=msg): + if using_copy_on_write: result[:] = 10 + else: + # setting this will give a SettingWithCopyError + # as we are trying to write a view + msg = "A value is trying to be set on a copy of a slice from a DataFrame" + with pytest.raises(SettingWithCopyError, match=msg): + result[:] = 10 + tm.assert_frame_equal(df, df_orig) @pytest.mark.parametrize("key, level", [("one", "second"), (["one"], ["second"])]) def test_xs_with_duplicates(self, key, level, multiindex_dataframe_random_data): @@ -359,15 +377,20 @@ def test_xs_droplevel_false(self): expected = DataFrame({"a": [1]}) tm.assert_frame_equal(result, expected) - def test_xs_droplevel_false_view(self, using_array_manager): + def test_xs_droplevel_false_view(self, using_array_manager, using_copy_on_write): # GH#37832 df = DataFrame([[1, 2, 3]], columns=Index(["a", "b", "c"])) result = df.xs("a", axis=1, drop_level=False) # check that result still views the same data as df assert np.shares_memory(result.iloc[:, 0]._values, df.iloc[:, 0]._values) - # modifying original df also modifies result when having a single block + df.iloc[0, 0] = 2 - expected = DataFrame({"a": [2]}) + if using_copy_on_write: + # with copy on write the subset is never modified + expected = DataFrame({"a": [1]}) + else: + # modifying original df also modifies result when having a single block + expected = DataFrame({"a": [2]}) tm.assert_frame_equal(result, expected) # with mixed dataframe, modifying the parent doesn't modify result @@ -375,7 +398,10 @@ def test_xs_droplevel_false_view(self, using_array_manager): df = DataFrame([[1, 2.5, "a"]], columns=Index(["a", "b", "c"])) result = df.xs("a", axis=1, drop_level=False) df.iloc[0, 0] = 2 - if using_array_manager: + if using_copy_on_write: + # with copy on write the subset is never modified + expected = DataFrame({"a": [1]}) + elif using_array_manager: # Here the behavior is consistent expected = DataFrame({"a": [2]}) else: diff --git a/pandas/tests/frame/methods/test_fillna.py b/pandas/tests/frame/methods/test_fillna.py index d86c1b2aedcacc..20e59ed72666aa 100644 --- a/pandas/tests/frame/methods/test_fillna.py +++ b/pandas/tests/frame/methods/test_fillna.py @@ -20,13 +20,16 @@ class TestFillNA: @td.skip_array_manager_not_yet_implemented - def test_fillna_on_column_view(self): + def test_fillna_on_column_view(self, using_copy_on_write): # GH#46149 avoid unnecessary copies arr = np.full((40, 50), np.nan) df = DataFrame(arr) df[0].fillna(-1, inplace=True) - assert (arr[:, 0] == -1).all() + if using_copy_on_write: + assert np.isnan(arr[:, 0]).all() + else: + assert (arr[:, 0] == -1).all() # i.e. we didn't create a new 49-column block assert len(df._mgr.arrays) == 1 @@ -676,14 +679,18 @@ def test_fillna_inplace_with_columns_limit_and_value(self): @td.skip_array_manager_invalid_test @pytest.mark.parametrize("val", [-1, {"x": -1, "y": -1}]) - def test_inplace_dict_update_view(self, val): + def test_inplace_dict_update_view(self, val, using_copy_on_write): # GH#47188 df = DataFrame({"x": [np.nan, 2], "y": [np.nan, 2]}) + df_orig = df.copy() result_view = df[:] df.fillna(val, inplace=True) expected = DataFrame({"x": [-1, 2.0], "y": [-1.0, 2]}) tm.assert_frame_equal(df, expected) - tm.assert_frame_equal(result_view, expected) + if using_copy_on_write: + tm.assert_frame_equal(result_view, df_orig) + else: + tm.assert_frame_equal(result_view, expected) def test_single_block_df_with_horizontal_axis(self): # GH 47713 diff --git a/pandas/tests/frame/methods/test_interpolate.py b/pandas/tests/frame/methods/test_interpolate.py index 98f9d2670074de..7d6cf43c530a76 100644 --- a/pandas/tests/frame/methods/test_interpolate.py +++ b/pandas/tests/frame/methods/test_interpolate.py @@ -303,7 +303,10 @@ def test_interp_raise_on_all_object_dtype(self): with pytest.raises(TypeError, match=msg): df.interpolate() - def test_interp_inplace(self): + def test_interp_inplace(self, using_copy_on_write): + # TODO(CoW) inplace keyword (it is still mutating the parent) + if using_copy_on_write: + pytest.skip("CoW: inplace keyword not yet handled") df = DataFrame({"a": [1.0, 2.0, np.nan, 4.0]}) expected = DataFrame({"a": [1.0, 2.0, 3.0, 4.0]}) result = df.copy() diff --git a/pandas/tests/frame/methods/test_rename.py b/pandas/tests/frame/methods/test_rename.py index b1594660caec62..f4443953a0d525 100644 --- a/pandas/tests/frame/methods/test_rename.py +++ b/pandas/tests/frame/methods/test_rename.py @@ -171,16 +171,21 @@ def test_rename_multiindex(self): tm.assert_index_equal(renamed.index, new_index) @td.skip_array_manager_not_yet_implemented # TODO(ArrayManager) setitem copy/view - def test_rename_nocopy(self, float_frame): + def test_rename_nocopy(self, float_frame, using_copy_on_write): renamed = float_frame.rename(columns={"C": "foo"}, copy=False) assert np.shares_memory(renamed["foo"]._values, float_frame["C"]._values) - with tm.assert_produces_warning(None): + # TODO(CoW) this also shouldn't warn in case of CoW, but the heuristic + # checking if the array shares memory doesn't work if CoW happened + with tm.assert_produces_warning(FutureWarning if using_copy_on_write else None): # This loc setitem already happens inplace, so no warning # that this will change in the future renamed.loc[:, "foo"] = 1.0 - assert (float_frame["C"] == 1.0).all() + if using_copy_on_write: + assert not (float_frame["C"] == 1.0).all() + else: + assert (float_frame["C"] == 1.0).all() def test_rename_inplace(self, float_frame): float_frame.rename(columns={"C": "foo"}) diff --git a/pandas/tests/frame/methods/test_to_dict_of_blocks.py b/pandas/tests/frame/methods/test_to_dict_of_blocks.py index c81bed9d93cc4e..eb9b78610a1127 100644 --- a/pandas/tests/frame/methods/test_to_dict_of_blocks.py +++ b/pandas/tests/frame/methods/test_to_dict_of_blocks.py @@ -27,7 +27,7 @@ def test_copy_blocks(self, float_frame): # make sure we did not change the original DataFrame assert not _df[column].equals(df[column]) - def test_no_copy_blocks(self, float_frame): + def test_no_copy_blocks(self, float_frame, using_copy_on_write): # GH#9607 df = DataFrame(float_frame, copy=True) column = df.columns[0] @@ -38,8 +38,11 @@ def test_no_copy_blocks(self, float_frame): if column in _df: _df.loc[:, column] = _df[column] + 1 - # make sure we did change the original DataFrame - assert _df[column].equals(df[column]) + if not using_copy_on_write: + # make sure we did change the original DataFrame + assert _df[column].equals(df[column]) + else: + assert not _df[column].equals(df[column]) def test_to_dict_of_blocks_item_cache(): diff --git a/pandas/tests/frame/methods/test_update.py b/pandas/tests/frame/methods/test_update.py index d3257ac09a0ab0..a35530100a4253 100644 --- a/pandas/tests/frame/methods/test_update.py +++ b/pandas/tests/frame/methods/test_update.py @@ -140,22 +140,29 @@ def test_update_datetime_tz(self): expected = DataFrame([pd.Timestamp("2019", tz="UTC")]) tm.assert_frame_equal(result, expected) - def test_update_with_different_dtype(self): + def test_update_with_different_dtype(self, using_copy_on_write): # GH#3217 df = DataFrame({"a": [1, 3], "b": [np.nan, 2]}) df["c"] = np.nan - df["c"].update(Series(["foo"], index=[0])) + if using_copy_on_write: + df.update({"c": Series(["foo"], index=[0])}) + else: + df["c"].update(Series(["foo"], index=[0])) expected = DataFrame({"a": [1, 3], "b": [np.nan, 2], "c": ["foo", np.nan]}) tm.assert_frame_equal(df, expected) @td.skip_array_manager_invalid_test - def test_update_modify_view(self): + def test_update_modify_view(self, using_copy_on_write): # GH#47188 df = DataFrame({"A": ["1", np.nan], "B": ["100", np.nan]}) df2 = DataFrame({"A": ["a", "x"], "B": ["100", "200"]}) + df2_orig = df2.copy() result_view = df2[:] df2.update(df) expected = DataFrame({"A": ["1", "x"], "B": ["100", "200"]}) tm.assert_frame_equal(df2, expected) - tm.assert_frame_equal(result_view, expected) + if using_copy_on_write: + tm.assert_frame_equal(result_view, df2_orig) + else: + tm.assert_frame_equal(result_view, expected) diff --git a/pandas/tests/frame/test_api.py b/pandas/tests/frame/test_api.py index af092d433a8462..cb97e2bfb6202a 100644 --- a/pandas/tests/frame/test_api.py +++ b/pandas/tests/frame/test_api.py @@ -323,7 +323,9 @@ def test_attrs(self): assert result.attrs == {"version": 1} @pytest.mark.parametrize("allows_duplicate_labels", [True, False, None]) - def test_set_flags(self, allows_duplicate_labels, frame_or_series): + def test_set_flags( + self, allows_duplicate_labels, frame_or_series, using_copy_on_write + ): obj = DataFrame({"A": [1, 2]}) key = (0, 0) if frame_or_series is Series: @@ -345,15 +347,25 @@ def test_set_flags(self, allows_duplicate_labels, frame_or_series): assert obj.flags.allows_duplicate_labels is True # But we didn't copy data + if frame_or_series is Series: + assert np.may_share_memory(obj.values, result.values) + else: + assert np.may_share_memory(obj["A"].values, result["A"].values) + result.iloc[key] = 0 - assert obj.iloc[key] == 0 + if using_copy_on_write: + assert obj.iloc[key] == 1 + else: + assert obj.iloc[key] == 0 + # set back to 1 for test below + result.iloc[key] = 1 # Now we do copy. result = obj.set_flags( copy=True, allows_duplicate_labels=allows_duplicate_labels ) result.iloc[key] = 10 - assert obj.iloc[key] == 0 + assert obj.iloc[key] == 1 def test_constructor_expanddim(self): # GH#33628 accessing _constructor_expanddim should not raise NotImplementedError diff --git a/pandas/tests/frame/test_block_internals.py b/pandas/tests/frame/test_block_internals.py index 8aa0e980b01c48..46c712cf4d4585 100644 --- a/pandas/tests/frame/test_block_internals.py +++ b/pandas/tests/frame/test_block_internals.py @@ -334,7 +334,7 @@ def test_is_mixed_type(self, float_frame, float_string_frame): assert not float_frame._is_mixed_type assert float_string_frame._is_mixed_type - def test_stale_cached_series_bug_473(self): + def test_stale_cached_series_bug_473(self, using_copy_on_write): # this is chained, but ok with option_context("chained_assignment", None): @@ -349,9 +349,12 @@ def test_stale_cached_series_bug_473(self): repr(Y) result = Y.sum() # noqa exp = Y["g"].sum() # noqa - assert pd.isna(Y["g"]["c"]) + if using_copy_on_write: + assert not pd.isna(Y["g"]["c"]) + else: + assert pd.isna(Y["g"]["c"]) - def test_strange_column_corruption_issue(self): + def test_strange_column_corruption_issue(self, using_copy_on_write): # TODO(wesm): Unclear how exactly this is related to internal matters df = DataFrame(index=[0, 1]) df[0] = np.nan @@ -363,7 +366,10 @@ def test_strange_column_corruption_issue(self): if col not in wasCol: wasCol[col] = 1 df[col] = np.nan - df[col][dt] = i + if using_copy_on_write: + df.loc[dt, col] = i + else: + df[col][dt] = i myid = 100 @@ -396,7 +402,7 @@ def test_add_column_with_pandas_array(self): tm.assert_frame_equal(df, df2) -def test_update_inplace_sets_valid_block_values(): +def test_update_inplace_sets_valid_block_values(using_copy_on_write): # https://github.com/pandas-dev/pandas/issues/33457 df = DataFrame({"a": Series([1, 2, None], dtype="category")}) @@ -406,8 +412,9 @@ def test_update_inplace_sets_valid_block_values(): # check we haven't put a Series into any block.values assert isinstance(df._mgr.blocks[0].values, Categorical) - # smoketest for OP bug from GH#35731 - assert df.isnull().sum().sum() == 0 + if not using_copy_on_write: + # smoketest for OP bug from GH#35731 + assert df.isnull().sum().sum() == 0 def test_nonconsolidated_item_cache_take(): diff --git a/pandas/tests/frame/test_constructors.py b/pandas/tests/frame/test_constructors.py index 7d3af7dfa9a42c..b4f027f3a832a0 100644 --- a/pandas/tests/frame/test_constructors.py +++ b/pandas/tests/frame/test_constructors.py @@ -271,15 +271,22 @@ def test_constructor_dtype_copy(self): new_df["col1"] = 200.0 assert orig_df["col1"][0] == 1.0 - def test_constructor_dtype_nocast_view_dataframe(self): + def test_constructor_dtype_nocast_view_dataframe(self, using_copy_on_write): df = DataFrame([[1, 2]]) should_be_view = DataFrame(df, dtype=df[0].dtype) - should_be_view[0][0] = 99 - assert df.values[0, 0] == 99 + if using_copy_on_write: + # INFO(CoW) doesn't mutate original + should_be_view.iloc[0, 0] = 99 + assert df.values[0, 0] == 1 + else: + should_be_view[0][0] = 99 + assert df.values[0, 0] == 99 - def test_constructor_dtype_nocast_view_2d_array(self, using_array_manager): + def test_constructor_dtype_nocast_view_2d_array( + self, using_array_manager, using_copy_on_write + ): df = DataFrame([[1, 2], [3, 4]], dtype="int64") - if not using_array_manager: + if not using_array_manager and not using_copy_on_write: should_be_view = DataFrame(df.values, dtype=df[0].dtype) should_be_view[0][0] = 97 assert df.values[0, 0] == 97 @@ -2520,7 +2527,13 @@ def test_constructor_list_str_na(self, string_dtype): @pytest.mark.parametrize("copy", [False, True]) def test_dict_nocopy( - self, request, copy, any_numeric_ea_dtype, any_numpy_dtype, using_array_manager + self, + request, + copy, + any_numeric_ea_dtype, + any_numpy_dtype, + using_array_manager, + using_copy_on_write, ): if ( using_array_manager @@ -2594,7 +2607,7 @@ def check_views(c_only: bool = False): with tm.assert_produces_warning(FutureWarning, match="will attempt to set"): df.iloc[:, 2] = pd.array([45, 46], dtype=c.dtype) assert df.dtypes.iloc[2] == c.dtype - if not copy: + if not copy and not using_copy_on_write: check_views(True) if copy: @@ -2606,7 +2619,7 @@ def check_views(c_only: bool = False): assert b[0] == b.dtype.type(3) # FIXME(GH#35417): enable after GH#35417 assert c[0] == c_orig[0] # i.e. df.iloc[0, 2]=45 did *not* update c - else: + elif not using_copy_on_write: # TODO: we can call check_views if we stop consolidating # in setitem_with_indexer assert c[0] == 45 # i.e. df.iloc[0, 2]=45 *did* update c diff --git a/pandas/tests/indexes/period/test_partial_slicing.py b/pandas/tests/indexes/period/test_partial_slicing.py index 72e7e458b4e1f6..2cf1cf0f156524 100644 --- a/pandas/tests/indexes/period/test_partial_slicing.py +++ b/pandas/tests/indexes/period/test_partial_slicing.py @@ -12,16 +12,20 @@ class TestPeriodIndex: - def test_getitem_periodindex_duplicates_string_slice(self): + def test_getitem_periodindex_duplicates_string_slice(self, using_copy_on_write): # monotonic idx = PeriodIndex([2000, 2007, 2007, 2009, 2009], freq="A-JUN") ts = Series(np.random.randn(len(idx)), index=idx) + original = ts.copy() result = ts["2007"] expected = ts[1:3] tm.assert_series_equal(result, expected) result[:] = 1 - assert (ts[1:3] == 1).all() + if using_copy_on_write: + tm.assert_series_equal(ts, original) + else: + assert (ts[1:3] == 1).all() # not monotonic idx = PeriodIndex([2000, 2007, 2007, 2009, 2007], freq="A-JUN") diff --git a/pandas/tests/indexing/multiindex/test_chaining_and_caching.py b/pandas/tests/indexing/multiindex/test_chaining_and_caching.py index 479cd9952f75b5..2efb288a73f8d2 100644 --- a/pandas/tests/indexing/multiindex/test_chaining_and_caching.py +++ b/pandas/tests/indexing/multiindex/test_chaining_and_caching.py @@ -12,7 +12,7 @@ import pandas._testing as tm -def test_detect_chained_assignment(): +def test_detect_chained_assignment(using_copy_on_write): # Inplace ops, originally from: # https://stackoverflow.com/questions/20508968/series-fillna-in-a-multiindex-dataframe-does-not-fill-is-this-a-bug a = [12, 23] @@ -29,17 +29,21 @@ def test_detect_chained_assignment(): multiind = MultiIndex.from_tuples(tuples, names=["part", "side"]) zed = DataFrame(events, index=["a", "b"], columns=multiind) - msg = "A value is trying to be set on a copy of a slice from a DataFrame" - with pytest.raises(SettingWithCopyError, match=msg): + if using_copy_on_write: zed["eyes"]["right"].fillna(value=555, inplace=True) + else: + msg = "A value is trying to be set on a copy of a slice from a DataFrame" + with pytest.raises(SettingWithCopyError, match=msg): + zed["eyes"]["right"].fillna(value=555, inplace=True) @td.skip_array_manager_invalid_test # with ArrayManager df.loc[0] is not a view -def test_cache_updating(): +def test_cache_updating(using_copy_on_write): # 5216 # make sure that we don't try to set a dead cache a = np.random.rand(10, 3) df = DataFrame(a, columns=["x", "y", "z"]) + df_original = df.copy() tuples = [(i, j) for i in range(5) for j in range(2)] index = MultiIndex.from_tuples(tuples) df.index = index @@ -48,7 +52,10 @@ def test_cache_updating(): # but actually works, since everything is a view df.loc[0]["z"].iloc[0] = 1.0 result = df.loc[(0, 0), "z"] - assert result == 1 + if using_copy_on_write: + assert result == df_original.loc[0, "z"] + else: + assert result == 1 # correct setting df.loc[(0, 0), "z"] = 2 diff --git a/pandas/tests/indexing/multiindex/test_partial.py b/pandas/tests/indexing/multiindex/test_partial.py index a57b363c0a4482..cface630c66476 100644 --- a/pandas/tests/indexing/multiindex/test_partial.py +++ b/pandas/tests/indexing/multiindex/test_partial.py @@ -122,7 +122,9 @@ def test_getitem_partial_column_select(self): # TODO(ArrayManager) rewrite test to not use .values # exp.loc[2000, 4].values[:] select multiple columns -> .values is not a view @td.skip_array_manager_invalid_test - def test_partial_set(self, multiindex_year_month_day_dataframe_random_data): + def test_partial_set( + self, multiindex_year_month_day_dataframe_random_data, using_copy_on_write + ): # GH #397 ymd = multiindex_year_month_day_dataframe_random_data df = ymd.copy() @@ -132,7 +134,8 @@ def test_partial_set(self, multiindex_year_month_day_dataframe_random_data): tm.assert_frame_equal(df, exp) df["A"].loc[2000, 4] = 1 - exp["A"].loc[2000, 4].values[:] = 1 + if not using_copy_on_write: + exp["A"].loc[2000, 4].values[:] = 1 tm.assert_frame_equal(df, exp) df.loc[2000] = 5 @@ -141,7 +144,10 @@ def test_partial_set(self, multiindex_year_month_day_dataframe_random_data): # this works...for now df["A"].iloc[14] = 5 - assert df["A"].iloc[14] == 5 + if using_copy_on_write: + df["A"].iloc[14] == exp["A"].iloc[14] + else: + assert df["A"].iloc[14] == 5 @pytest.mark.parametrize("dtype", [int, float]) def test_getitem_intkey_leading_level( diff --git a/pandas/tests/indexing/multiindex/test_setitem.py b/pandas/tests/indexing/multiindex/test_setitem.py index 20569061cfa4cb..ac10a6d82dc89c 100644 --- a/pandas/tests/indexing/multiindex/test_setitem.py +++ b/pandas/tests/indexing/multiindex/test_setitem.py @@ -196,7 +196,7 @@ def test_multiindex_assignment(self): df.loc[4, "d"] = arr tm.assert_series_equal(df.loc[4, "d"], Series(arr, index=[8, 10], name="d")) - def test_multiindex_assignment_single_dtype(self, using_array_manager): + def test_multiindex_assignment_single_dtype(self, using_copy_on_write): # GH3777 part 2b # single dtype arr = np.array([0.0, 1.0]) @@ -216,7 +216,8 @@ def test_multiindex_assignment_single_dtype(self, using_array_manager): tm.assert_series_equal(result, exp) # extra check for inplace-ness - tm.assert_numpy_array_equal(view, exp.values) + if not using_copy_on_write: + tm.assert_numpy_array_equal(view, exp.values) # arr + 0.5 cannot be cast losslessly to int, so we upcast df.loc[4, "c"] = arr + 0.5 @@ -405,16 +406,23 @@ def test_setitem_change_dtype(self, multiindex_dataframe_random_data): reindexed = dft.reindex(columns=[("foo", "two")]) tm.assert_series_equal(reindexed["foo", "two"], s > s.median()) - def test_set_column_scalar_with_loc(self, multiindex_dataframe_random_data): + def test_set_column_scalar_with_loc( + self, multiindex_dataframe_random_data, using_copy_on_write + ): frame = multiindex_dataframe_random_data subset = frame.index[[1, 4, 5]] frame.loc[subset] = 99 assert (frame.loc[subset].values == 99).all() + frame_original = frame.copy() col = frame["B"] col[subset] = 97 - assert (frame.loc[subset, "B"] == 97).all() + if using_copy_on_write: + # chained setitem doesn't work with CoW + tm.assert_frame_equal(frame, frame_original) + else: + assert (frame.loc[subset, "B"] == 97).all() def test_nonunique_assignment_1750(self): df = DataFrame( @@ -487,21 +495,32 @@ def test_frame_setitem_view_direct(multiindex_dataframe_random_data): assert (df["foo"].values == 0).all() -def test_frame_setitem_copy_raises(multiindex_dataframe_random_data): +def test_frame_setitem_copy_raises( + multiindex_dataframe_random_data, using_copy_on_write +): # will raise/warn as its chained assignment df = multiindex_dataframe_random_data.T - msg = "A value is trying to be set on a copy of a slice from a DataFrame" - with pytest.raises(SettingWithCopyError, match=msg): + if using_copy_on_write: + # TODO(CoW) it would be nice if this could still warn/raise df["foo"]["one"] = 2 + else: + msg = "A value is trying to be set on a copy of a slice from a DataFrame" + with pytest.raises(SettingWithCopyError, match=msg): + df["foo"]["one"] = 2 -def test_frame_setitem_copy_no_write(multiindex_dataframe_random_data): +def test_frame_setitem_copy_no_write( + multiindex_dataframe_random_data, using_copy_on_write +): frame = multiindex_dataframe_random_data.T expected = frame df = frame.copy() - msg = "A value is trying to be set on a copy of a slice from a DataFrame" - with pytest.raises(SettingWithCopyError, match=msg): + if using_copy_on_write: df["foo"]["one"] = 2 + else: + msg = "A value is trying to be set on a copy of a slice from a DataFrame" + with pytest.raises(SettingWithCopyError, match=msg): + df["foo"]["one"] = 2 result = df tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/indexing/test_chaining_and_caching.py b/pandas/tests/indexing/test_chaining_and_caching.py index adc001695579cb..81914e1b8052f5 100644 --- a/pandas/tests/indexing/test_chaining_and_caching.py +++ b/pandas/tests/indexing/test_chaining_and_caching.py @@ -32,7 +32,7 @@ def random_text(nobs=100): class TestCaching: - def test_slice_consolidate_invalidate_item_cache(self): + def test_slice_consolidate_invalidate_item_cache(self, using_copy_on_write): # this is chained assignment, but will 'work' with option_context("chained_assignment", None): @@ -52,7 +52,11 @@ def test_slice_consolidate_invalidate_item_cache(self): # Assignment to wrong series df["bb"].iloc[0] = 0.17 df._clear_item_cache() - tm.assert_almost_equal(df["bb"][0], 0.17) + if not using_copy_on_write: + tm.assert_almost_equal(df["bb"][0], 0.17) + else: + # with ArrayManager, parent is not mutated with chained assignment + tm.assert_almost_equal(df["bb"][0], 2.2) @pytest.mark.parametrize("do_ref", [True, False]) def test_setitem_cache_updating(self, do_ref): @@ -71,7 +75,7 @@ def test_setitem_cache_updating(self, do_ref): assert df.loc[0, "c"] == 0.0 assert df.loc[7, "c"] == 1.0 - def test_setitem_cache_updating_slices(self): + def test_setitem_cache_updating_slices(self, using_copy_on_write): # GH 7084 # not updating cache on series setting with slices expected = DataFrame( @@ -92,12 +96,17 @@ def test_setitem_cache_updating_slices(self): # try via a chain indexing # this actually works out = DataFrame({"A": [0, 0, 0]}, index=date_range("5/7/2014", "5/9/2014")) + out_original = out.copy() for ix, row in df.iterrows(): v = out[row["C"]][six:eix] + row["D"] out[row["C"]][six:eix] = v - tm.assert_frame_equal(out, expected) - tm.assert_series_equal(out["A"], expected["A"]) + if not using_copy_on_write: + tm.assert_frame_equal(out, expected) + tm.assert_series_equal(out["A"], expected["A"]) + else: + tm.assert_frame_equal(out, out_original) + tm.assert_series_equal(out["A"], out_original["A"]) out = DataFrame({"A": [0, 0, 0]}, index=date_range("5/7/2014", "5/9/2014")) for ix, row in df.iterrows(): @@ -123,7 +132,7 @@ def test_altering_series_clears_parent_cache(self): class TestChaining: - def test_setitem_chained_setfault(self): + def test_setitem_chained_setfault(self, using_copy_on_write): # GH6026 data = ["right", "left", "left", "left", "right", "left", "timeout"] @@ -132,24 +141,38 @@ def test_setitem_chained_setfault(self): df = DataFrame({"response": np.array(data)}) mask = df.response == "timeout" df.response[mask] = "none" - tm.assert_frame_equal(df, DataFrame({"response": mdata})) + if using_copy_on_write: + tm.assert_frame_equal(df, DataFrame({"response": data})) + else: + tm.assert_frame_equal(df, DataFrame({"response": mdata})) recarray = np.rec.fromarrays([data], names=["response"]) df = DataFrame(recarray) mask = df.response == "timeout" df.response[mask] = "none" - tm.assert_frame_equal(df, DataFrame({"response": mdata})) + if using_copy_on_write: + tm.assert_frame_equal(df, DataFrame({"response": data})) + else: + tm.assert_frame_equal(df, DataFrame({"response": mdata})) df = DataFrame({"response": data, "response1": data}) + df_original = df.copy() mask = df.response == "timeout" df.response[mask] = "none" - tm.assert_frame_equal(df, DataFrame({"response": mdata, "response1": data})) + if using_copy_on_write: + tm.assert_frame_equal(df, df_original) + else: + tm.assert_frame_equal(df, DataFrame({"response": mdata, "response1": data})) # GH 6056 expected = DataFrame({"A": [np.nan, "bar", "bah", "foo", "bar"]}) df = DataFrame({"A": np.array(["foo", "bar", "bah", "foo", "bar"])}) df["A"].iloc[0] = np.nan result = df.head() + if using_copy_on_write: + expected = DataFrame({"A": ["foo", "bar", "bah", "foo", "bar"]}) + else: + expected = DataFrame({"A": [np.nan, "bar", "bah", "foo", "bar"]}) tm.assert_frame_equal(result, expected) df = DataFrame({"A": np.array(["foo", "bar", "bah", "foo", "bar"])}) @@ -158,7 +181,7 @@ def test_setitem_chained_setfault(self): tm.assert_frame_equal(result, expected) @pytest.mark.arm_slow - def test_detect_chained_assignment(self): + def test_detect_chained_assignment(self, using_copy_on_write): with option_context("chained_assignment", "raise"): # work with the chain @@ -166,14 +189,20 @@ def test_detect_chained_assignment(self): df = DataFrame( np.arange(4).reshape(2, 2), columns=list("AB"), dtype="int64" ) + df_original = df.copy() assert df._is_copy is None df["A"][0] = -5 df["A"][1] = -6 - tm.assert_frame_equal(df, expected) + if using_copy_on_write: + tm.assert_frame_equal(df, df_original) + else: + tm.assert_frame_equal(df, expected) @pytest.mark.arm_slow - def test_detect_chained_assignment_raises(self, using_array_manager): + def test_detect_chained_assignment_raises( + self, using_array_manager, using_copy_on_write + ): # test with the chaining df = DataFrame( @@ -182,9 +211,14 @@ def test_detect_chained_assignment_raises(self, using_array_manager): "B": np.array(np.arange(2, 4), dtype=np.float64), } ) + df_original = df.copy() assert df._is_copy is None - if not using_array_manager: + if using_copy_on_write: + df["A"][0] = -5 + df["A"][1] = -6 + tm.assert_frame_equal(df, df_original) + elif not using_array_manager: with pytest.raises(SettingWithCopyError, match=msg): df["A"][0] = -5 @@ -192,7 +226,6 @@ def test_detect_chained_assignment_raises(self, using_array_manager): df["A"][1] = np.nan assert df["A"]._is_copy is None - else: # INFO(ArrayManager) for ArrayManager it doesn't matter that it's # a mixed dataframe @@ -203,7 +236,7 @@ def test_detect_chained_assignment_raises(self, using_array_manager): tm.assert_frame_equal(df, expected) @pytest.mark.arm_slow - def test_detect_chained_assignment_fails(self): + def test_detect_chained_assignment_fails(self, using_copy_on_write): # Using a copy (the chain), fails df = DataFrame( @@ -213,11 +246,15 @@ def test_detect_chained_assignment_fails(self): } ) - with pytest.raises(SettingWithCopyError, match=msg): + if using_copy_on_write: + # TODO(CoW) can we still warn here? df.loc[0]["A"] = -5 + else: + with pytest.raises(SettingWithCopyError, match=msg): + df.loc[0]["A"] = -5 @pytest.mark.arm_slow - def test_detect_chained_assignment_doc_example(self): + def test_detect_chained_assignment_doc_example(self, using_copy_on_write): # Doc example df = DataFrame( @@ -228,30 +265,43 @@ def test_detect_chained_assignment_doc_example(self): ) assert df._is_copy is None - with pytest.raises(SettingWithCopyError, match=msg): + if using_copy_on_write: + # TODO(CoW) can we still warn here? indexer = df.a.str.startswith("o") df[indexer]["c"] = 42 + else: + with pytest.raises(SettingWithCopyError, match=msg): + indexer = df.a.str.startswith("o") + df[indexer]["c"] = 42 @pytest.mark.arm_slow - def test_detect_chained_assignment_object_dtype(self, using_array_manager): + def test_detect_chained_assignment_object_dtype( + self, using_array_manager, using_copy_on_write + ): expected = DataFrame({"A": [111, "bbb", "ccc"], "B": [1, 2, 3]}) df = DataFrame({"A": ["aaa", "bbb", "ccc"], "B": [1, 2, 3]}) + df_original = df.copy() - with pytest.raises(SettingWithCopyError, match=msg): - df.loc[0]["A"] = 111 + if not using_copy_on_write: + with pytest.raises(SettingWithCopyError, match=msg): + df.loc[0]["A"] = 111 - if not using_array_manager: + if using_copy_on_write: + # TODO(CoW) can we still warn here? + df["A"][0] = 111 + tm.assert_frame_equal(df, df_original) + elif not using_array_manager: with pytest.raises(SettingWithCopyError, match=msg): df["A"][0] = 111 df.loc[0, "A"] = 111 + tm.assert_frame_equal(df, expected) else: # INFO(ArrayManager) for ArrayManager it doesn't matter that it's # a mixed dataframe df["A"][0] = 111 - - tm.assert_frame_equal(df, expected) + tm.assert_frame_equal(df, expected) @pytest.mark.arm_slow def test_detect_chained_assignment_is_copy_pickle(self): @@ -299,8 +349,9 @@ def test_detect_chained_assignment_implicit_take(self): df["letters"] = df["letters"].apply(str.lower) @pytest.mark.arm_slow - def test_detect_chained_assignment_implicit_take2(self): - + def test_detect_chained_assignment_implicit_take2(self, using_copy_on_write): + if using_copy_on_write: + pytest.skip("_is_copy is not always set for CoW") # Implicitly take 2 df = random_text(100000) indexer = df.letters.apply(lambda x: len(x) > 10) @@ -356,18 +407,26 @@ def test_detect_chained_assignment_false_positives(self): str(df) @pytest.mark.arm_slow - def test_detect_chained_assignment_undefined_column(self): + def test_detect_chained_assignment_undefined_column(self, using_copy_on_write): # from SO: # https://stackoverflow.com/questions/24054495/potential-bug-setting-value-for-undefined-column-using-iloc df = DataFrame(np.arange(0, 9), columns=["count"]) df["group"] = "b" + df_original = df.copy() - with pytest.raises(SettingWithCopyError, match=msg): + if using_copy_on_write: + # TODO(CoW) can we still warn here? df.iloc[0:5]["group"] = "a" + tm.assert_frame_equal(df, df_original) + else: + with pytest.raises(SettingWithCopyError, match=msg): + df.iloc[0:5]["group"] = "a" @pytest.mark.arm_slow - def test_detect_chained_assignment_changing_dtype(self, using_array_manager): + def test_detect_chained_assignment_changing_dtype( + self, using_array_manager, using_copy_on_write + ): # Mixed type setting but same dtype & changing dtype df = DataFrame( @@ -378,32 +437,45 @@ def test_detect_chained_assignment_changing_dtype(self, using_array_manager): "D": ["a", "b", "c", "d", "e"], } ) + df_original = df.copy() - with pytest.raises(SettingWithCopyError, match=msg): + if using_copy_on_write: df.loc[2]["D"] = "foo" - - with pytest.raises(SettingWithCopyError, match=msg): df.loc[2]["C"] = "foo" + df["C"][2] = "foo" + tm.assert_frame_equal(df, df_original) + + if not using_copy_on_write: + with pytest.raises(SettingWithCopyError, match=msg): + df.loc[2]["D"] = "foo" - if not using_array_manager: with pytest.raises(SettingWithCopyError, match=msg): + df.loc[2]["C"] = "foo" + + if not using_array_manager: + with pytest.raises(SettingWithCopyError, match=msg): + df["C"][2] = "foo" + else: + # INFO(ArrayManager) for ArrayManager it doesn't matter if it's + # changing the dtype or not df["C"][2] = "foo" - else: - # INFO(ArrayManager) for ArrayManager it doesn't matter if it's - # changing the dtype or not - df["C"][2] = "foo" - assert df.loc[2, "C"] == "foo" + assert df.loc[2, "C"] == "foo" - def test_setting_with_copy_bug(self): + def test_setting_with_copy_bug(self, using_copy_on_write): # operating on a copy df = DataFrame( {"a": list(range(4)), "b": list("ab.."), "c": ["a", "b", np.nan, "d"]} ) + df_original = df.copy() mask = pd.isna(df.c) - with pytest.raises(SettingWithCopyError, match=msg): + if using_copy_on_write: df[["c"]][mask] = df[["b"]][mask] + tm.assert_frame_equal(df, df_original) + else: + with pytest.raises(SettingWithCopyError, match=msg): + df[["c"]][mask] = df[["b"]][mask] def test_setting_with_copy_bug_no_warning(self): # invalid warning as we are returning a new object @@ -414,8 +486,12 @@ def test_setting_with_copy_bug_no_warning(self): # this should not raise df2["y"] = ["g", "h", "i"] - def test_detect_chained_assignment_warnings_errors(self): + def test_detect_chained_assignment_warnings_errors(self, using_copy_on_write): df = DataFrame({"A": ["aaa", "bbb", "ccc"], "B": [1, 2, 3]}) + if using_copy_on_write: + df.loc[0]["A"] = 111 + return + with option_context("chained_assignment", "warn"): with tm.assert_produces_warning(SettingWithCopyWarning): df.loc[0]["A"] = 111 @@ -425,14 +501,23 @@ def test_detect_chained_assignment_warnings_errors(self): df.loc[0]["A"] = 111 @pytest.mark.parametrize("rhs", [3, DataFrame({0: [1, 2, 3, 4]})]) - def test_detect_chained_assignment_warning_stacklevel(self, rhs): + def test_detect_chained_assignment_warning_stacklevel( + self, rhs, using_copy_on_write + ): # GH#42570 df = DataFrame(np.arange(25).reshape(5, 5)) + df_original = df.copy() chained = df.loc[:3] with option_context("chained_assignment", "warn"): - with tm.assert_produces_warning(SettingWithCopyWarning) as t: - chained[2] = rhs - assert t[0].filename == __file__ + if not using_copy_on_write: + with tm.assert_produces_warning(SettingWithCopyWarning) as t: + chained[2] = rhs + assert t[0].filename == __file__ + else: + # INFO(CoW) no warning, and original dataframe not changed + with tm.assert_produces_warning(None): + chained[2] = rhs + tm.assert_frame_equal(df, df_original) # TODO(ArrayManager) fast_xs with array-like scalars is not yet working @td.skip_array_manager_not_yet_implemented @@ -483,7 +568,7 @@ def test_cache_updating2(self): expected = Series([0, 0, 0, 2, 0], name="f") tm.assert_series_equal(df.f, expected) - def test_iloc_setitem_chained_assignment(self): + def test_iloc_setitem_chained_assignment(self, using_copy_on_write): # GH#3970 with option_context("chained_assignment", None): df = DataFrame({"aa": range(5), "bb": [2.2] * 5}) @@ -497,7 +582,10 @@ def test_iloc_setitem_chained_assignment(self): df.iloc[ck] df["bb"].iloc[0] = 0.15 - assert df["bb"].iloc[0] == 0.15 + if not using_copy_on_write: + assert df["bb"].iloc[0] == 0.15 + else: + assert df["bb"].iloc[0] == 2.2 def test_getitem_loc_assignment_slice_state(self): # GH 13569 diff --git a/pandas/tests/indexing/test_iat.py b/pandas/tests/indexing/test_iat.py index 44bd51ee1b7d1a..916303884df889 100644 --- a/pandas/tests/indexing/test_iat.py +++ b/pandas/tests/indexing/test_iat.py @@ -31,7 +31,7 @@ def test_iat_getitem_series_with_period_index(): assert expected == result -def test_iat_setitem_item_cache_cleared(indexer_ial): +def test_iat_setitem_item_cache_cleared(indexer_ial, using_copy_on_write): # GH#45684 data = {"x": np.arange(8, dtype=np.int64), "y": np.int64(0)} df = DataFrame(data).copy() @@ -44,5 +44,6 @@ def test_iat_setitem_item_cache_cleared(indexer_ial): indexer_ial(df)[7, 1] = 1234 assert df.iat[7, 1] == 1234 - assert ser.iloc[-1] == 1234 + if not using_copy_on_write: + assert ser.iloc[-1] == 1234 assert df.iloc[-1, -1] == 1234 diff --git a/pandas/tests/indexing/test_iloc.py b/pandas/tests/indexing/test_iloc.py index fdf741040407f9..8cc6b6e73aaea5 100644 --- a/pandas/tests/indexing/test_iloc.py +++ b/pandas/tests/indexing/test_iloc.py @@ -123,7 +123,7 @@ def test_iloc_setitem_ea_inplace(self, frame_or_series, box): if frame_or_series is Series: values = obj.values else: - values = obj[0].values + values = obj._mgr.arrays[0] if frame_or_series is Series: obj.iloc[:2] = box(arr[2:]) @@ -843,7 +843,9 @@ def test_iloc_empty_list_indexer_is_ok(self): df.iloc[[]], df.iloc[:0, :], check_index_type=True, check_column_type=True ) - def test_identity_slice_returns_new_object(self, using_array_manager, request): + def test_identity_slice_returns_new_object( + self, using_array_manager, using_copy_on_write, request + ): # GH13873 if using_array_manager: mark = pytest.mark.xfail( @@ -859,8 +861,12 @@ def test_identity_slice_returns_new_object(self, using_array_manager, request): assert np.shares_memory(original_df["a"], sliced_df["a"]) # Setting using .loc[:, "a"] sets inplace so alters both sliced and orig + # depending on CoW original_df.loc[:, "a"] = [4, 4, 4] - assert (sliced_df["a"] == 4).all() + if using_copy_on_write: + assert (sliced_df["a"] == [1, 2, 3]).all() + else: + assert (sliced_df["a"] == 4).all() original_series = Series([1, 2, 3, 4, 5, 6]) sliced_series = original_series.iloc[:] @@ -868,7 +874,11 @@ def test_identity_slice_returns_new_object(self, using_array_manager, request): # should also be a shallow copy original_series[:3] = [7, 8, 9] - assert all(sliced_series[:3] == [7, 8, 9]) + if using_copy_on_write: + # shallow copy not updated (CoW) + assert all(sliced_series[:3] == [1, 2, 3]) + else: + assert all(sliced_series[:3] == [7, 8, 9]) def test_indexing_zerodim_np_array(self): # GH24919 @@ -884,9 +894,10 @@ def test_series_indexing_zerodim_np_array(self): assert result == 1 @td.skip_array_manager_not_yet_implemented - def test_iloc_setitem_categorical_updates_inplace(self): + def test_iloc_setitem_categorical_updates_inplace(self, using_copy_on_write): # Mixed dtype ensures we go through take_split_path in setitem_with_indexer cat = Categorical(["A", "B", "C"]) + cat_original = cat.copy() df = DataFrame({1: cat, 2: [1, 2, 3]}, copy=False) assert tm.shares_memory(df[1], cat) @@ -895,9 +906,13 @@ def test_iloc_setitem_categorical_updates_inplace(self): msg = "will attempt to set the values inplace instead" with tm.assert_produces_warning(FutureWarning, match=msg): df.iloc[:, 0] = cat[::-1] - assert tm.shares_memory(df[1], cat) - expected = Categorical(["C", "B", "A"], categories=["A", "B", "C"]) + if not using_copy_on_write: + assert tm.shares_memory(df[1], cat) + expected = Categorical(["C", "B", "A"], categories=["A", "B", "C"]) + else: + expected = cat_original + tm.assert_categorical_equal(cat, expected) def test_iloc_with_boolean_operation(self): @@ -1395,8 +1410,9 @@ def test_frame_iloc_setitem_callable(self): class TestILocSeries: - def test_iloc(self): + def test_iloc(self, using_copy_on_write): ser = Series(np.random.randn(10), index=list(range(0, 20, 2))) + ser_original = ser.copy() for i in range(len(ser)): result = ser.iloc[i] @@ -1412,7 +1428,10 @@ def test_iloc(self): with tm.assert_produces_warning(None): # GH#45324 make sure we aren't giving a spurious FutureWarning result[:] = 0 - assert (ser.iloc[1:3] == 0).all() + if using_copy_on_write: + tm.assert_series_equal(ser, ser_original) + else: + assert (ser.iloc[1:3] == 0).all() # list of integers result = ser.iloc[[0, 2, 3, 4, 5]] diff --git a/pandas/tests/indexing/test_loc.py b/pandas/tests/indexing/test_loc.py index b1eaf43f0b3685..cf7db65015fa75 100644 --- a/pandas/tests/indexing/test_loc.py +++ b/pandas/tests/indexing/test_loc.py @@ -1066,7 +1066,9 @@ def test_loc_empty_list_indexer_is_ok(self): df.loc[[]], df.iloc[:0, :], check_index_type=True, check_column_type=True ) - def test_identity_slice_returns_new_object(self, using_array_manager, request): + def test_identity_slice_returns_new_object( + self, using_array_manager, request, using_copy_on_write + ): # GH13873 if using_array_manager: mark = pytest.mark.xfail( @@ -1083,8 +1085,12 @@ def test_identity_slice_returns_new_object(self, using_array_manager, request): assert np.shares_memory(original_df["a"]._values, sliced_df["a"]._values) # Setting using .loc[:, "a"] sets inplace so alters both sliced and orig + # depending on CoW original_df.loc[:, "a"] = [4, 4, 4] - assert (sliced_df["a"] == 4).all() + if using_copy_on_write: + assert (sliced_df["a"] == [1, 2, 3]).all() + else: + assert (sliced_df["a"] == 4).all() # These should not return copies assert original_df is original_df.loc[:, :] @@ -1098,7 +1104,10 @@ def test_identity_slice_returns_new_object(self, using_array_manager, request): assert original_series[:] is not original_series original_series[:3] = [7, 8, 9] - assert all(sliced_series[:3] == [7, 8, 9]) + if using_copy_on_write: + assert all(sliced_series[:3] == [1, 2, 3]) + else: + assert all(sliced_series[:3] == [7, 8, 9]) @pytest.mark.xfail(reason="accidental fix reverted - GH37497") def test_loc_copy_vs_view(self): @@ -2558,7 +2567,7 @@ def test_loc_setitem_boolean_and_column(self, float_frame): tm.assert_frame_equal(float_frame, expected) - def test_loc_setitem_ndframe_values_alignment(self): + def test_loc_setitem_ndframe_values_alignment(self, using_copy_on_write): # GH#45501 df = DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}) df.loc[[False, False, True], ["a"]] = DataFrame( @@ -2579,9 +2588,13 @@ def test_loc_setitem_ndframe_values_alignment(self): tm.assert_frame_equal(df, expected) df = DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}) + df_orig = df.copy() ser = df["a"] ser.loc[[False, False, True]] = Series([10, 11, 12], index=[2, 1, 0]) - tm.assert_frame_equal(df, expected) + if using_copy_on_write: + tm.assert_frame_equal(df, df_orig) + else: + tm.assert_frame_equal(df, expected) class TestLocListlike: diff --git a/pandas/tests/internals/test_internals.py b/pandas/tests/internals/test_internals.py index 3c90eee5be9990..b30b27f5bae1a1 100644 --- a/pandas/tests/internals/test_internals.py +++ b/pandas/tests/internals/test_internals.py @@ -744,7 +744,7 @@ def test_reindex_items(self): mgr.iget(3).internal_values(), reindexed.iget(3).internal_values() ) - def test_get_numeric_data(self): + def test_get_numeric_data(self, using_copy_on_write): mgr = create_mgr( "int: int; float: float; complex: complex;" "str: object; bool: bool; obj: object; dt: datetime", @@ -765,10 +765,16 @@ def test_get_numeric_data(self): np.array([100.0, 200.0, 300.0]), inplace=True, ) - tm.assert_almost_equal( - mgr.iget(mgr.items.get_loc("float")).internal_values(), - np.array([100.0, 200.0, 300.0]), - ) + if using_copy_on_write: + tm.assert_almost_equal( + mgr.iget(mgr.items.get_loc("float")).internal_values(), + np.array([1.0, 1.0, 1.0]), + ) + else: + tm.assert_almost_equal( + mgr.iget(mgr.items.get_loc("float")).internal_values(), + np.array([100.0, 200.0, 300.0]), + ) numeric2 = mgr.get_numeric_data(copy=True) tm.assert_index_equal(numeric.items, Index(["int", "float", "complex", "bool"])) @@ -777,12 +783,18 @@ def test_get_numeric_data(self): np.array([1000.0, 2000.0, 3000.0]), inplace=True, ) - tm.assert_almost_equal( - mgr.iget(mgr.items.get_loc("float")).internal_values(), - np.array([100.0, 200.0, 300.0]), - ) + if using_copy_on_write: + tm.assert_almost_equal( + mgr.iget(mgr.items.get_loc("float")).internal_values(), + np.array([1.0, 1.0, 1.0]), + ) + else: + tm.assert_almost_equal( + mgr.iget(mgr.items.get_loc("float")).internal_values(), + np.array([100.0, 200.0, 300.0]), + ) - def test_get_bool_data(self): + def test_get_bool_data(self, using_copy_on_write): msg = "object-dtype columns with all-bool values" mgr = create_mgr( "int: int; float: float; complex: complex;" @@ -800,19 +812,31 @@ def test_get_bool_data(self): ) bools.iset(0, np.array([True, False, True]), inplace=True) - tm.assert_numpy_array_equal( - mgr.iget(mgr.items.get_loc("bool")).internal_values(), - np.array([True, False, True]), - ) + if using_copy_on_write: + tm.assert_numpy_array_equal( + mgr.iget(mgr.items.get_loc("bool")).internal_values(), + np.array([True, True, True]), + ) + else: + tm.assert_numpy_array_equal( + mgr.iget(mgr.items.get_loc("bool")).internal_values(), + np.array([True, False, True]), + ) # Check sharing with tm.assert_produces_warning(FutureWarning, match=msg): bools2 = mgr.get_bool_data(copy=True) bools2.iset(0, np.array([False, True, False])) - tm.assert_numpy_array_equal( - mgr.iget(mgr.items.get_loc("bool")).internal_values(), - np.array([True, False, True]), - ) + if using_copy_on_write: + tm.assert_numpy_array_equal( + mgr.iget(mgr.items.get_loc("bool")).internal_values(), + np.array([True, True, True]), + ) + else: + tm.assert_numpy_array_equal( + mgr.iget(mgr.items.get_loc("bool")).internal_values(), + np.array([True, False, True]), + ) def test_unicode_repr_doesnt_raise(self): repr(create_mgr("b,\u05d0: object")) diff --git a/pandas/tests/reshape/test_from_dummies.py b/pandas/tests/reshape/test_from_dummies.py index c52331e54f95e7..ab804737252888 100644 --- a/pandas/tests/reshape/test_from_dummies.py +++ b/pandas/tests/reshape/test_from_dummies.py @@ -164,7 +164,7 @@ def test_error_with_prefix_default_category_dict_not_complete( def test_error_with_prefix_contains_nan(dummies_basic): - dummies_basic["col2_c"][2] = np.nan + dummies_basic.loc[2, "col2_c"] = np.nan with pytest.raises( ValueError, match=r"Dummy DataFrame contains NA value in column: 'col2_c'" ): @@ -172,7 +172,7 @@ def test_error_with_prefix_contains_nan(dummies_basic): def test_error_with_prefix_contains_non_dummies(dummies_basic): - dummies_basic["col2_c"][2] = "str" + dummies_basic.loc[2, "col2_c"] = "str" with pytest.raises(TypeError, match=r"Passed DataFrame contains non-dummy data"): from_dummies(dummies_basic, sep="_") diff --git a/pandas/tests/series/accessors/test_dt_accessor.py b/pandas/tests/series/accessors/test_dt_accessor.py index 95daf001d8f2e0..11ad7c1dd457ef 100644 --- a/pandas/tests/series/accessors/test_dt_accessor.py +++ b/pandas/tests/series/accessors/test_dt_accessor.py @@ -279,7 +279,7 @@ def test_dt_accessor_ambiguous_freq_conversions(self): expected = Series(exp_values, name="xxx") tm.assert_series_equal(ser, expected) - def test_dt_accessor_not_writeable(self): + def test_dt_accessor_not_writeable(self, using_copy_on_write): # no setting allowed ser = Series(date_range("20130101", periods=5, freq="D"), name="xxx") with pytest.raises(ValueError, match="modifications"): @@ -288,8 +288,12 @@ def test_dt_accessor_not_writeable(self): # trying to set a copy msg = "modifications to a property of a datetimelike.+not supported" with pd.option_context("chained_assignment", "raise"): - with pytest.raises(SettingWithCopyError, match=msg): + if using_copy_on_write: + # TODO(CoW) it would be nice to keep a warning/error for this case ser.dt.hour[0] = 5 + else: + with pytest.raises(SettingWithCopyError, match=msg): + ser.dt.hour[0] = 5 @pytest.mark.parametrize( "method, dates", diff --git a/pandas/tests/series/indexing/test_indexing.py b/pandas/tests/series/indexing/test_indexing.py index 7a3e18c64f366d..6ddffd0d006dc2 100644 --- a/pandas/tests/series/indexing/test_indexing.py +++ b/pandas/tests/series/indexing/test_indexing.py @@ -209,7 +209,8 @@ def test_basic_getitem_setitem_corner(datetime_series): datetime_series[[5, slice(None, None)]] = 2 -def test_slice(string_series, object_series): +def test_slice(string_series, object_series, using_copy_on_write): + original = string_series.copy() numSlice = string_series[10:20] numSliceEnd = string_series[-10:] objSlice = object_series[10:20] @@ -227,7 +228,11 @@ def test_slice(string_series, object_series): sl = string_series[10:20] sl[:] = 0 - assert (string_series[10:20] == 0).all() + if using_copy_on_write: + # Doesn't modify parent (CoW) + tm.assert_series_equal(string_series, original) + else: + assert (string_series[10:20] == 0).all() def test_timedelta_assignment(): @@ -244,7 +249,7 @@ def test_timedelta_assignment(): tm.assert_series_equal(s, expected) -def test_underlying_data_conversion(): +def test_underlying_data_conversion(using_copy_on_write): # GH 4080 df = DataFrame({c: [1, 2, 3] for c in ["a", "b", "c"]}) msg = "The 'inplace' keyword" @@ -253,15 +258,19 @@ def test_underlying_data_conversion(): assert return_value is None s = Series([1], index=[(2, 2, 2)]) df["val"] = 0 + df_original = df.copy() df df["val"].update(s) - expected = DataFrame( - {"a": [1, 2, 3], "b": [1, 2, 3], "c": [1, 2, 3], "val": [0, 1, 0]} - ) - with tm.assert_produces_warning(FutureWarning, match=msg): - return_value = expected.set_index(["a", "b", "c"], inplace=True) - assert return_value is None + if using_copy_on_write: + expected = df_original + else: + expected = DataFrame( + {"a": [1, 2, 3], "b": [1, 2, 3], "c": [1, 2, 3], "val": [0, 1, 0]} + ) + with tm.assert_produces_warning(FutureWarning, match=msg): + return_value = expected.set_index(["a", "b", "c"], inplace=True) + assert return_value is None tm.assert_frame_equal(df, expected) diff --git a/pandas/tests/series/methods/test_copy.py b/pandas/tests/series/methods/test_copy.py index 8aa5c14812dc03..d681c0d02e0a2c 100644 --- a/pandas/tests/series/methods/test_copy.py +++ b/pandas/tests/series/methods/test_copy.py @@ -9,20 +9,28 @@ class TestCopy: - @pytest.mark.parametrize("deep", [None, False, True]) - def test_copy(self, deep): + @pytest.mark.parametrize("deep", ["default", None, False, True]) + def test_copy(self, deep, using_copy_on_write): ser = Series(np.arange(10), dtype="float64") # default deep is True - if deep is None: + if deep == "default": ser2 = ser.copy() else: ser2 = ser.copy(deep=deep) + if using_copy_on_write: + # INFO(CoW) a shallow copy doesn't yet copy the data + # but parent will not be modified (CoW) + if deep is None or deep is False: + assert np.may_share_memory(ser.values, ser2.values) + else: + assert not np.may_share_memory(ser.values, ser2.values) + ser2[::2] = np.NaN - if deep is None or deep is True: + if deep is not False or using_copy_on_write: # Did not modify original Series assert np.isnan(ser2[0]) assert not np.isnan(ser[0]) @@ -31,8 +39,8 @@ def test_copy(self, deep): assert np.isnan(ser2[0]) assert np.isnan(ser[0]) - @pytest.mark.parametrize("deep", [None, False, True]) - def test_copy_tzaware(self, deep): + @pytest.mark.parametrize("deep", ["default", None, False, True]) + def test_copy_tzaware(self, deep, using_copy_on_write): # GH#11794 # copy of tz-aware expected = Series([Timestamp("2012/01/01", tz="UTC")]) @@ -40,15 +48,23 @@ def test_copy_tzaware(self, deep): ser = Series([Timestamp("2012/01/01", tz="UTC")]) - if deep is None: + if deep == "default": ser2 = ser.copy() else: ser2 = ser.copy(deep=deep) + if using_copy_on_write: + # INFO(CoW) a shallow copy doesn't yet copy the data + # but parent will not be modified (CoW) + if deep is None or deep is False: + assert np.may_share_memory(ser.values, ser2.values) + else: + assert not np.may_share_memory(ser.values, ser2.values) + ser2[0] = Timestamp("1999/01/01", tz="UTC") # default deep is True - if deep is None or deep is True: + if deep is not False or using_copy_on_write: # Did not modify original Series tm.assert_series_equal(ser2, expected2) tm.assert_series_equal(ser, expected) diff --git a/pandas/tests/series/methods/test_get_numeric_data.py b/pandas/tests/series/methods/test_get_numeric_data.py index e386f4b5b1dec7..60dd64d7e1948c 100644 --- a/pandas/tests/series/methods/test_get_numeric_data.py +++ b/pandas/tests/series/methods/test_get_numeric_data.py @@ -7,13 +7,20 @@ class TestGetNumericData: - def test_get_numeric_data_preserve_dtype(self): + def test_get_numeric_data_preserve_dtype(self, using_copy_on_write): # get the numeric data obj = Series([1, 2, 3]) result = obj._get_numeric_data() tm.assert_series_equal(result, obj) + # returned object is a shallow copy + result.iloc[0] = 0 + if using_copy_on_write: + assert obj.iloc[0] == 1 + else: + assert obj.iloc[0] == 0 + obj = Series([1, "2", 3.0]) result = obj._get_numeric_data() expected = Series([], dtype=object, index=Index([], dtype=object)) diff --git a/pandas/tests/series/methods/test_rename.py b/pandas/tests/series/methods/test_rename.py index 729c07b8bdde78..d0392929cb0828 100644 --- a/pandas/tests/series/methods/test_rename.py +++ b/pandas/tests/series/methods/test_rename.py @@ -143,10 +143,15 @@ def test_rename_error_arg(self): with pytest.raises(KeyError, match=match): ser.rename({2: 9}, errors="raise") - def test_rename_copy_false(self): + def test_rename_copy_false(self, using_copy_on_write): # GH 46889 ser = Series(["foo", "bar"]) + ser_orig = ser.copy() shallow_copy = ser.rename({1: 9}, copy=False) ser[0] = "foobar" - assert ser[0] == shallow_copy[0] - assert ser[1] == shallow_copy[9] + if using_copy_on_write: + assert ser_orig[0] == shallow_copy[0] + assert ser_orig[1] == shallow_copy[9] + else: + assert ser[0] == shallow_copy[0] + assert ser[1] == shallow_copy[9] diff --git a/pandas/tests/series/methods/test_update.py b/pandas/tests/series/methods/test_update.py index d9d6641d54237d..6403fcf76122a1 100644 --- a/pandas/tests/series/methods/test_update.py +++ b/pandas/tests/series/methods/test_update.py @@ -14,7 +14,7 @@ class TestUpdate: - def test_update(self): + def test_update(self, using_copy_on_write): s = Series([1.5, np.nan, 3.0, 4.0, np.nan]) s2 = Series([np.nan, 3.5, np.nan, 5.0]) s.update(s2) @@ -25,11 +25,15 @@ def test_update(self): # GH 3217 df = DataFrame([{"a": 1}, {"a": 3, "b": 2}]) df["c"] = np.nan + df_orig = df.copy() df["c"].update(Series(["foo"], index=[0])) - expected = DataFrame( - [[1, np.nan, "foo"], [3, 2.0, np.nan]], columns=["a", "b", "c"] - ) + if using_copy_on_write: + expected = df_orig + else: + expected = DataFrame( + [[1, np.nan, "foo"], [3, 2.0, np.nan]], columns=["a", "b", "c"] + ) tm.assert_frame_equal(df, expected) @pytest.mark.parametrize(