diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 76f68fdaa7845..d507e8d3dbac5 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -3967,8 +3967,15 @@ def _set_value( """ try: if takeable: - series = self._ixs(col, axis=1) - series._set_value(index, value, takeable=True) + if isinstance(self._mgr, ArrayManager): + # with CoW, we can't use intermediate series + # with takeable=True, we know that index is positional and + # not a generic hashable label + index = cast(int, index) + self._mgr.column_setitem(col, index, value) + else: + series = self._ixs(col, axis=1) + series._set_value(index, value, takeable=True) return series = self._get_item_cache(col) @@ -4900,7 +4907,7 @@ def set_axis(self, labels, axis: Axis = 0, inplace: bool = False): "labels", [ ("method", None), - ("copy", True), + ("copy", None), ("level", None), ("fill_value", np.nan), ("limit", None), @@ -5084,7 +5091,7 @@ def rename( index: Renamer | None = None, columns: Renamer | None = None, axis: Axis | None = None, - copy: bool = True, + copy: bool | None = None, inplace: bool = False, level: Level | None = None, errors: str = "ignore", @@ -5900,7 +5907,7 @@ class max type if inplace: new_obj = self else: - new_obj = self.copy() + new_obj = self.copy(deep=None) new_index = default_index(len(new_obj)) if level is not None: diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 6dd3df17139c3..d88bea4b538e9 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -996,7 +996,7 @@ def rename( index: Renamer | None = None, columns: Renamer | None = None, axis: Axis | None = None, - copy: bool_t = True, + copy: bool_t | None = None, inplace: bool_t = False, level: Level | None = None, errors: str = "ignore", @@ -3952,6 +3952,8 @@ def _check_setitem_copy(self, t="setting", force=False): df.iloc[0:5]['group'] = 'a' """ + if isinstance(self._mgr, (ArrayManager, SingleArrayManager)): + return # return early if the check is not needed if not (force or 
self._is_copy): return @@ -4906,7 +4908,7 @@ def reindex(self: NDFrameT, *args, **kwargs) -> NDFrameT: axes, kwargs = self._construct_axes_from_arguments(args, kwargs) method = missing.clean_reindex_fill_method(kwargs.pop("method", None)) level = kwargs.pop("level", None) - copy = kwargs.pop("copy", True) + copy = kwargs.pop("copy", None) limit = kwargs.pop("limit", None) tolerance = kwargs.pop("tolerance", None) fill_value = kwargs.pop("fill_value", None) @@ -4931,9 +4933,7 @@ def reindex(self: NDFrameT, *args, **kwargs) -> NDFrameT: for axis, ax in axes.items() if ax is not None ): - if copy: - return self.copy() - return self + return self.copy(deep=copy) # check if we are a multi reindex if self._needs_reindex_multi(axes, method, level): @@ -5895,7 +5895,7 @@ def astype( return cast(NDFrameT, result) @final - def copy(self: NDFrameT, deep: bool_t = True) -> NDFrameT: + def copy(self: NDFrameT, deep: bool_t | None = True) -> NDFrameT: """ Make a copy of this object's indices and data. 
diff --git a/pandas/core/indexing.py b/pandas/core/indexing.py index fc2204724aceb..e6958c9af4ed6 100644 --- a/pandas/core/indexing.py +++ b/pandas/core/indexing.py @@ -1840,6 +1840,17 @@ def _setitem_single_column(self, loc: int, value, plane_indexer): """ pi = plane_indexer + if not hasattr(self.obj._mgr, "blocks"): + # ArrayManager: in this case we cannot rely on getting the column + # as a Series to mutate, but need to operate on the mgr directly + if com.is_null_slice(pi) or com.is_full_slice(pi, len(self.obj)): + arr = self.obj._sanitize_column(value) + self.obj._mgr.iset(loc, arr) + else: + self.obj._mgr.column_setitem(loc, plane_indexer, value) + self.obj._clear_item_cache() + return + ser = self.obj._ixs(loc, axis=1) # perform the equivalent of a setitem on the info axis diff --git a/pandas/core/internals/array_manager.py b/pandas/core/internals/array_manager.py index 1cd9fe65407ba..47abcf68243a2 100644 --- a/pandas/core/internals/array_manager.py +++ b/pandas/core/internals/array_manager.py @@ -10,6 +10,7 @@ Hashable, TypeVar, ) +import weakref import numpy as np @@ -110,6 +111,7 @@ class BaseArrayManager(DataManager): ---------- arrays : Sequence of arrays axes : Sequence of Index + refs : Sequence of weakrefs or None, optional verify_integrity : bool, default True """ @@ -121,11 +123,13 @@ class BaseArrayManager(DataManager): arrays: list[np.ndarray | ExtensionArray] _axes: list[Index] + refs: list[weakref.ref | None] | None def __init__( self, arrays: list[np.ndarray | ExtensionArray], axes: list[Index], + refs: list[weakref.ref | None] | None = None, verify_integrity: bool = True, ): raise NotImplementedError @@ -167,6 +171,24 @@ def set_axis(self, axis: int, new_labels: Index) -> None: axis = self._normalize_axis(axis) self._axes[axis] = new_labels + def _has_no_reference(self, i: int) -> bool: + """ + Check whether column `i` has references. 
+ (whether it references another array or is itself being referenced) + + Returns True if the column has no references. + """ + return (self.refs is None or self.refs[i] is None) and weakref.getweakrefcount( + self.arrays[i] + ) == 0 + + def _clear_reference(self, i: int) -> None: + """ + Clear any reference for column `i`. + """ + if self.refs is not None: + self.refs[i] = None + def get_dtypes(self): return np.array([arr.dtype for arr in self.arrays], dtype="object") @@ -177,6 +199,10 @@ def __setstate__(self, state): self.arrays = state[0] self._axes = state[1] + def __reduce__(self): + # to avoid pickling the refs + return type(self), (self.arrays, self._axes) + def __repr__(self) -> str: output = type(self).__name__ output += f"\nIndex: {self._axes[0]}" @@ -352,12 +378,20 @@ def putmask(self, mask, new, align: bool = True): align_keys = ["mask"] new = extract_array(new, extract_numpy=True) - return self.apply_with_block( + for i in range(len(self.arrays)): + if not self._has_no_reference(i): + # if being referenced -> perform Copy-on-Write and clear the reference + self.arrays[i] = self.arrays[i].copy() + self._clear_reference(i) + + new_mgr = self.apply_with_block( "putmask", align_keys=align_keys, mask=mask, new=new, ) + self.arrays = new_mgr.arrays + return self def diff(self: T, n: int, axis: int) -> T: if axis == 1: @@ -505,14 +539,18 @@ def copy(self: T, deep=True) -> T: Parameters ---------- - deep : bool or string, default True - If False, return shallow copy (do not copy data) + deep : bool, string or None, default True + If False or None, return a shallow copy (do not copy data). If 'all', copy data and a deep copy of the index Returns ------- BlockManager """ + if deep is None: + # use shallow copy + deep = False + # this preserves the notion of view copying of axes if deep: # hit in e.g. 
tests.io.json.test_pandas @@ -526,9 +564,12 @@ def copy_func(ax): if deep: new_arrays = [arr.copy() for arr in self.arrays] + refs = None else: - new_arrays = self.arrays - return type(self)(new_arrays, new_axes) + new_arrays = list(self.arrays) + refs: list[weakref.ref | None] = [weakref.ref(arr) for arr in self.arrays] + + return type(self)(new_arrays, new_axes, refs, verify_integrity=False) def reindex_indexer( self: T, @@ -537,7 +578,7 @@ def reindex_indexer( axis: int, fill_value=None, allow_dups: bool = False, - copy: bool = True, + copy: bool | None = True, # ignored keywords consolidate: bool = True, only_slice: bool = False, @@ -562,7 +603,7 @@ def _reindex_indexer( axis: int, fill_value=None, allow_dups: bool = False, - copy: bool = True, + copy: bool | None = True, use_na_proxy: bool = False, ) -> T: """ @@ -573,11 +614,15 @@ def _reindex_indexer( axis : int fill_value : object, default None allow_dups : bool, default False - copy : bool, default True - + copy : bool or None, default True + If None, regard as False to get shallow copy. pandas-indexer with -1's only. 
""" + if copy is None: + # use shallow copy + copy = False + if indexer is None: if new_axis is self._axes[axis] and not copy: return self @@ -594,18 +639,26 @@ def _reindex_indexer( if axis >= self.ndim: raise IndexError("Requested axis not found in manager") + refs: list[weakref.ref | None] | None = None if axis == 1: new_arrays = [] + refs = [] for i in indexer: if i == -1: arr = self._make_na_array( fill_value=fill_value, use_na_proxy=use_na_proxy ) + ref = None else: + # reusing full column array -> track with reference arr = self.arrays[i] if copy: arr = arr.copy() + ref = None + else: + ref = weakref.ref(arr) new_arrays.append(arr) + refs.append(ref) else: validate_indices(indexer, len(self._axes[0])) @@ -624,11 +677,14 @@ def _reindex_indexer( ) for arr in self.arrays ] + # selecting rows with take always creates a copy -> no need to + # track references to original arrays + refs = None new_axes = list(self._axes) new_axes[axis] = new_axis - return type(self)(new_arrays, new_axes, verify_integrity=False) + return type(self)(new_arrays, new_axes, refs, verify_integrity=False) def take(self: T, indexer, axis: int = 1, verify: bool = True) -> T: """ @@ -650,7 +706,7 @@ def take(self: T, indexer, axis: int = 1, verify: bool = True) -> T: new_labels = self._axes[axis].take(indexer) return self._reindex_indexer( - new_axis=new_labels, indexer=indexer, axis=axis, allow_dups=True + new_axis=new_labels, indexer=indexer, axis=axis, allow_dups=True, copy=None ) def _make_na_array(self, fill_value=None, use_na_proxy=False): @@ -692,12 +748,14 @@ def __init__( self, arrays: list[np.ndarray | ExtensionArray], axes: list[Index], + refs: list[weakref.ref | None] | None = None, verify_integrity: bool = True, ): # Note: we are storing the axes in "_axes" in the (row, columns) order # which contrasts the order how it is stored in BlockManager self._axes = axes self.arrays = arrays + self.refs = refs if verify_integrity: self._axes = [ensure_index(ax) for ax in axes] @@ -728,6 
+786,12 @@ def _verify_integrity(self) -> None: "Passed arrays should be 1-dimensional, got array with " f"{arr.ndim} dimensions instead." ) + if self.refs is not None: + if len(self.refs) != n_columns: + raise ValueError( + "Number of passed refs must equal the size of the column Index: " + f"{len(self.refs)} refs vs {n_columns} columns." + ) # -------------------------------------------------------------------- # Indexing @@ -761,22 +825,30 @@ def fast_xs(self, loc: int) -> ArrayLike: def get_slice(self, slobj: slice, axis: int = 0) -> ArrayManager: axis = self._normalize_axis(axis) + refs: list[weakref.ref | None] if axis == 0: arrays = [arr[slobj] for arr in self.arrays] + # slicing results in views -> track references to original arrays + # TODO possible to optimize this with single ref to the full ArrayManager? + refs = [weakref.ref(arr) for arr in self.arrays] elif axis == 1: arrays = self.arrays[slobj] + # track reference to subset of column arrays + refs = [weakref.ref(arr) for arr in arrays] new_axes = list(self._axes) new_axes[axis] = new_axes[axis]._getitem_slice(slobj) - return type(self)(arrays, new_axes, verify_integrity=False) + return type(self)(arrays, new_axes, refs, verify_integrity=False) def iget(self, i: int) -> SingleArrayManager: """ Return the data as a SingleArrayManager. """ values = self.arrays[i] - return SingleArrayManager([values], [self._axes[0]]) + # getting single column array for Series -> track reference to original + ref = weakref.ref(values) + return SingleArrayManager([values], [self._axes[0]], [ref]) def iget_values(self, i: int) -> ArrayLike: """ @@ -808,6 +880,7 @@ def iset( inplace : bool, default False Whether overwrite existing array as opposed to replacing it. """ + # TODO clear reference for item that is being overwritten # single column -> single integer index if lib.is_integer(loc): @@ -882,6 +955,10 @@ def insert(self, loc: int, item: Hashable, value: ArrayLike) -> None: # TODO is this copy needed? 
arrays = self.arrays.copy() arrays.insert(loc, value) + if self.refs is not None: + # inserted `value` is already a copy, no need to track reference + # TODO can we use CoW here as well? + self.refs.insert(loc, None) self.arrays = arrays self._axes[1] = new_axis @@ -895,8 +972,28 @@ def idelete(self, indexer): self.arrays = [self.arrays[i] for i in np.nonzero(to_keep)[0]] self._axes = [self._axes[0], self._axes[1][to_keep]] + if self.refs is not None: + self.refs = [ref for i, ref in enumerate(self.refs) if to_keep[i]] return self + def column_setitem(self, loc: int, idx: int | slice | np.ndarray, value): + if self._has_no_reference(loc): + # if no reference -> set array (potentially) inplace + arr = self.arrays[loc] + # TODO we should try to avoid this (indexing.py::_setitem_single_column + # does a copy for the BM path as well) + arr = arr.copy() + else: + # otherwise perform Copy-on-Write and clear the reference + arr = self.arrays[loc].copy() + self._clear_reference(loc) + + # create temporary SingleArrayManager without ref to use setitem implementation + mgr = SingleArrayManager([arr], [self._axes[0]]) + new_mgr = mgr.setitem((idx,), value) + # update existing ArrayManager in-place + self.arrays[loc] = new_mgr.arrays[0] + # -------------------------------------------------------------------- # Array-wise Operation @@ -1152,10 +1249,12 @@ def __init__( self, arrays: list[np.ndarray | ExtensionArray], axes: list[Index], + refs: list[weakref.ref | None] | None = None, verify_integrity: bool = True, ): self._axes = axes self.arrays = arrays + self.refs = refs if verify_integrity: assert len(axes) == 1 @@ -1236,6 +1335,7 @@ def fast_xs(self, loc: int) -> ArrayLike: raise NotImplementedError("Use series._values[loc] instead") def get_slice(self, slobj: slice, axis: int = 0) -> SingleArrayManager: + # TODO track reference if axis >= self.ndim: raise IndexError("Requested axis not found in manager") @@ -1246,7 +1346,9 @@ def get_slice(self, slobj: slice, axis: int = 
0) -> SingleArrayManager: def getitem_mgr(self, indexer) -> SingleArrayManager: new_array = self.array[indexer] new_index = self.index[indexer] - return type(self)([new_array], [new_index]) + # TODO in theory only need to track reference if new_array is a view + ref = weakref.ref(self.array) + return type(self)([new_array], [new_index], [ref]) def apply(self, func, **kwargs): if callable(func): @@ -1264,8 +1366,28 @@ def setitem(self, indexer, value): See `setitem_inplace` for a version that works inplace and doesn't return a new Manager. """ + if not self._has_no_reference(0): + # if being referenced -> perform Copy-on-Write and clear the reference + self.arrays[0] = self.arrays[0].copy() + self._clear_reference(0) return self.apply_with_block("setitem", indexer=indexer, value=value) + def setitem_inplace(self, indexer, value) -> None: + """ + Set values with indexer. + + For Single[Block/Array]Manager, this backs s[indexer] = value + + This is an inplace version of `setitem()`, mutating the manager/values + in place, not returning a new Manager (and Block), and thus never changing + the dtype. 
+ """ + if not self._has_no_reference(0): + # if being referenced -> perform Copy-on-Write and clear the reference + self.arrays[0] = self.arrays[0].copy() + self._clear_reference(0) + self.array[indexer] = value + def idelete(self, indexer) -> SingleArrayManager: """ Delete selected locations in-place (new array, same ArrayManager) @@ -1275,6 +1397,8 @@ def idelete(self, indexer) -> SingleArrayManager: self.arrays = [self.arrays[0][to_keep]] self._axes = [self._axes[0][to_keep]] + # clear reference since we are backed by new array + self.refs = None return self def _get_data_subset(self, predicate: Callable) -> SingleArrayManager: diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py index d69709bf9d06c..a7125ad4cafd6 100644 --- a/pandas/core/internals/managers.py +++ b/pandas/core/internals/managers.py @@ -582,6 +582,9 @@ def copy(self: T, deep=True) -> T: ------- BlockManager """ + if deep is None: + # preserve deep copy for BlockManager with copy=None + deep = True # this preserves the notion of view copying of axes if deep: # hit in e.g. tests.io.json.test_pandas @@ -655,6 +658,10 @@ def reindex_indexer( pandas-indexer with -1's only. 
""" + if copy is None: + # preserve deep copy for BlockManager with copy=None + copy = True + if indexer is None: if new_axis is self.axes[axis] and not copy: return self diff --git a/pandas/core/series.py b/pandas/core/series.py index 03fac7cceabb7..defddcfd83639 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -1260,7 +1260,13 @@ def _maybe_update_cacher( # a copy if ref is None: del self._cacher - elif len(self) == len(ref) and self.name in ref.columns: + # for ArrayManager with CoW, we never want to update the parent + # DataFrame cache if the Series changed, and always pop the cached item + elif ( + len(self) == len(ref) + and self.name in ref.columns + and not isinstance(self._mgr, SingleArrayManager) + ): # GH#42530 self.name must be in ref.columns # to ensure column still in dataframe # otherwise, either self or ref has swapped in new arrays diff --git a/pandas/tests/frame/indexing/test_indexing.py b/pandas/tests/frame/indexing/test_indexing.py index 942da38dc5a26..91e92278a44d9 100644 --- a/pandas/tests/frame/indexing/test_indexing.py +++ b/pandas/tests/frame/indexing/test_indexing.py @@ -269,7 +269,7 @@ def test_setattr_column(self): df.foobar = 5 assert (df.foobar == 5).all() - def test_setitem(self, float_frame): + def test_setitem(self, float_frame, using_array_manager): # not sure what else to do here series = float_frame["A"][::2] float_frame["col5"] = series @@ -305,8 +305,12 @@ def test_setitem(self, float_frame): smaller = float_frame[:2] msg = r"\nA value is trying to be set on a copy of a slice from a DataFrame" - with pytest.raises(com.SettingWithCopyError, match=msg): + if using_array_manager: + # With ArrayManager, adding a new column doesn't raise a warning smaller["col10"] = ["1", "2"] + else: + with pytest.raises(com.SettingWithCopyError, match=msg): + smaller["col10"] = ["1", "2"] assert smaller["col10"].dtype == np.object_ assert (smaller["col10"] == ["1", "2"]).all() @@ -1007,14 +1011,18 @@ def 
test_iloc_row_slice_view(self, using_array_manager): assert np.shares_memory(df[2], subset[2]) + exp_col = original[2].copy() + msg = r"\nA value is trying to be set on a copy of a slice from a DataFrame" - with pytest.raises(com.SettingWithCopyError, match=msg): - subset.loc[:, 2] = 0.0 + if using_array_manager: + # INFO(ArrayManager) doesn't modify parent + subset[2] = 0.0 + else: + with pytest.raises(com.SettingWithCopyError, match=msg): + subset.loc[:, 2] = 0.0 - exp_col = original[2].copy() - # TODO(ArrayManager) verify it is expected that the original didn't change - if not using_array_manager: exp_col[4:8] = 0.0 + tm.assert_series_equal(df[2], exp_col) def test_iloc_col(self): diff --git a/pandas/tests/frame/indexing/test_xs.py b/pandas/tests/frame/indexing/test_xs.py index c6938abb57d64..3a1f9d8a527d8 100644 --- a/pandas/tests/frame/indexing/test_xs.py +++ b/pandas/tests/frame/indexing/test_xs.py @@ -116,15 +116,12 @@ def test_xs_view(self, using_array_manager): dm = DataFrame(np.arange(20.0).reshape(4, 5), index=range(4), columns=range(5)) + dm.xs(2)[:] = 20 if using_array_manager: # INFO(ArrayManager) with ArrayManager getting a row as a view is # not possible - msg = r"\nA value is trying to be set on a copy of a slice from a DataFrame" - with pytest.raises(com.SettingWithCopyError, match=msg): - dm.xs(2)[:] = 20 assert not (dm.xs(2) == 20).any() else: - dm.xs(2)[:] = 20 assert (dm.xs(2) == 20).all() @@ -175,27 +172,41 @@ def test_xs_level_eq_2(self): result = df.xs("c", level=2) tm.assert_frame_equal(result, expected) - def test_xs_setting_with_copy_error(self, multiindex_dataframe_random_data): + def test_xs_setting_with_copy_error( + self, multiindex_dataframe_random_data, using_array_manager + ): # this is a copy in 0.14 df = multiindex_dataframe_random_data + df_orig = df.copy() result = df.xs("two", level="second") - # setting this will give a SettingWithCopyError - # as we are trying to write a view - msg = "A value is trying to be set on a copy 
of a slice from a DataFrame" - with pytest.raises(com.SettingWithCopyError, match=msg): + if not using_array_manager: + # setting this will give a SettingWithCopyError + # as we are trying to write a view + msg = "A value is trying to be set on a copy of a slice from a DataFrame" + with pytest.raises(com.SettingWithCopyError, match=msg): + result[:] = 10 + else: result[:] = 10 + tm.assert_frame_equal(df, df_orig) - def test_xs_setting_with_copy_error_multiple(self, four_level_index_dataframe): + def test_xs_setting_with_copy_error_multiple( + self, four_level_index_dataframe, using_array_manager + ): # this is a copy in 0.14 df = four_level_index_dataframe + df_orig = df.copy() result = df.xs(("a", 4), level=["one", "four"]) - # setting this will give a SettingWithCopyError - # as we are trying to write a view - msg = "A value is trying to be set on a copy of a slice from a DataFrame" - with pytest.raises(com.SettingWithCopyError, match=msg): + if not using_array_manager: + # setting this will give a SettingWithCopyError + # as we are trying to write a view + msg = "A value is trying to be set on a copy of a slice from a DataFrame" + with pytest.raises(com.SettingWithCopyError, match=msg): + result[:] = 10 + else: result[:] = 10 + tm.assert_frame_equal(df, df_orig) @pytest.mark.parametrize("key, level", [("one", "second"), (["one"], ["second"])]) def test_xs_with_duplicates(self, key, level, multiindex_dataframe_random_data): diff --git a/pandas/tests/frame/methods/test_combine_first.py b/pandas/tests/frame/methods/test_combine_first.py index fc485f14a4820..941f54c4ccb3e 100644 --- a/pandas/tests/frame/methods/test_combine_first.py +++ b/pandas/tests/frame/methods/test_combine_first.py @@ -66,7 +66,7 @@ def test_combine_first(self, float_frame): assert (combined["A"][:10] == 1).all() # reverse overlap - tail["A"][:10] = 0 + tail.iloc[:10, tail.columns.get_loc("A")] = 0 combined = tail.combine_first(head) assert (combined["A"][:10] == 0).all() diff --git 
a/pandas/tests/frame/methods/test_cov_corr.py b/pandas/tests/frame/methods/test_cov_corr.py index 60d5d8c8ccaca..b551cfc500e7a 100644 --- a/pandas/tests/frame/methods/test_cov_corr.py +++ b/pandas/tests/frame/methods/test_cov_corr.py @@ -27,8 +27,8 @@ def test_cov(self, float_frame, float_string_frame): # with NAs frame = float_frame.copy() - frame["A"][:5] = np.nan - frame["B"][5:10] = np.nan + frame.iloc[:5, frame.columns.get_loc("A")] = np.nan + frame.iloc[5:10, frame.columns.get_loc("B")] = np.nan result = frame.cov(min_periods=len(frame) - 8) expected = frame.cov() expected.loc["A", "B"] = np.nan diff --git a/pandas/tests/frame/methods/test_interpolate.py b/pandas/tests/frame/methods/test_interpolate.py index 93225ff1050a3..e19dd5ae42c2d 100644 --- a/pandas/tests/frame/methods/test_interpolate.py +++ b/pandas/tests/frame/methods/test_interpolate.py @@ -254,6 +254,8 @@ def test_interp_raise_on_all_object_dtype(self): with pytest.raises(TypeError, match=msg): df.interpolate() + # TODO(CoW) inplace method on Series -> OK with not updating parent? + @td.skip_array_manager_not_yet_implemented def test_interp_inplace(self): df = DataFrame({"a": [1.0, 2.0, np.nan, 4.0]}) expected = DataFrame({"a": [1.0, 2.0, 3.0, 4.0]}) diff --git a/pandas/tests/frame/methods/test_update.py b/pandas/tests/frame/methods/test_update.py index 408113e9bc417..add190de73995 100644 --- a/pandas/tests/frame/methods/test_update.py +++ b/pandas/tests/frame/methods/test_update.py @@ -1,6 +1,8 @@ import numpy as np import pytest +import pandas.util._test_decorators as td + import pandas as pd from pandas import ( DataFrame, @@ -138,6 +140,8 @@ def test_update_datetime_tz(self): expected = DataFrame([pd.Timestamp("2019", tz="UTC")]) tm.assert_frame_equal(result, expected) + # TODO(CoW) what should the update method do? -> deprecate this? 
+ @td.skip_array_manager_not_yet_implemented def test_update_with_different_dtype(self): # GH#3217 df = DataFrame({"a": [1, 3], "b": [np.nan, 2]}) diff --git a/pandas/tests/frame/test_arithmetic.py b/pandas/tests/frame/test_arithmetic.py index 0e6b36a484c47..48e561a05225b 100644 --- a/pandas/tests/frame/test_arithmetic.py +++ b/pandas/tests/frame/test_arithmetic.py @@ -1192,7 +1192,8 @@ def test_combineFrame(self, float_frame, mixed_float_frame, mixed_int_frame): frame_copy = float_frame.reindex(float_frame.index[::2]) del frame_copy["D"] - frame_copy["C"][:5] = np.nan + # adding NAs to column "C" + frame_copy.iloc[:5, frame_copy.columns.get_loc("C")] = np.nan added = float_frame + frame_copy diff --git a/pandas/tests/frame/test_constructors.py b/pandas/tests/frame/test_constructors.py index 7347640fc05a7..70e28be2462fc 100644 --- a/pandas/tests/frame/test_constructors.py +++ b/pandas/tests/frame/test_constructors.py @@ -258,11 +258,16 @@ def test_constructor_dtype_copy(self): new_df["col1"] = 200.0 assert orig_df["col1"][0] == 1.0 - def test_constructor_dtype_nocast_view_dataframe(self): + def test_constructor_dtype_nocast_view_dataframe(self, using_array_manager): df = DataFrame([[1, 2]]) should_be_view = DataFrame(df, dtype=df[0].dtype) - should_be_view[0][0] = 99 - assert df.values[0, 0] == 99 + if using_array_manager: + # INFO(ArrayManager) doesn't mutate original + should_be_view.iloc[0, 0] = 99 + assert df.values[0, 0] == 1 + else: + should_be_view[0][0] = 99 + assert df.values[0, 0] == 99 @td.skip_array_manager_invalid_test # TODO(ArrayManager) keep view on 2D array? 
def test_constructor_dtype_nocast_view_2d_array(self): diff --git a/pandas/tests/indexes/period/test_partial_slicing.py b/pandas/tests/indexes/period/test_partial_slicing.py index c565902d080c3..bd477675ce714 100644 --- a/pandas/tests/indexes/period/test_partial_slicing.py +++ b/pandas/tests/indexes/period/test_partial_slicing.py @@ -12,16 +12,20 @@ class TestPeriodIndex: - def test_getitem_periodindex_duplicates_string_slice(self): + def test_getitem_periodindex_duplicates_string_slice(self, using_array_manager): # monotonic idx = PeriodIndex([2000, 2007, 2007, 2009, 2009], freq="A-JUN") ts = Series(np.random.randn(len(idx)), index=idx) + original = ts.copy() result = ts["2007"] expected = ts[1:3] tm.assert_series_equal(result, expected) result[:] = 1 - assert (ts[1:3] == 1).all() + if using_array_manager: + tm.assert_series_equal(ts, original) + else: + assert (ts[1:3] == 1).all() # not monotonic idx = PeriodIndex([2000, 2007, 2007, 2009, 2007], freq="A-JUN") diff --git a/pandas/tests/indexing/multiindex/test_chaining_and_caching.py b/pandas/tests/indexing/multiindex/test_chaining_and_caching.py index 6ccd44e698a8a..d4fe88c835ee1 100644 --- a/pandas/tests/indexing/multiindex/test_chaining_and_caching.py +++ b/pandas/tests/indexing/multiindex/test_chaining_and_caching.py @@ -12,6 +12,7 @@ import pandas.core.common as com +@td.skip_array_manager_not_yet_implemented def test_detect_chained_assignment(): # Inplace ops, originally from: # https://stackoverflow.com/questions/20508968/series-fillna-in-a-multiindex-dataframe-does-not-fill-is-this-a-bug diff --git a/pandas/tests/indexing/multiindex/test_setitem.py b/pandas/tests/indexing/multiindex/test_setitem.py index 2a12d690ff0bd..ceda30e4f357e 100644 --- a/pandas/tests/indexing/multiindex/test_setitem.py +++ b/pandas/tests/indexing/multiindex/test_setitem.py @@ -408,16 +408,24 @@ def test_setitem_change_dtype(self, multiindex_dataframe_random_data): reindexed = dft.reindex(columns=[("foo", "two")]) 
tm.assert_series_equal(reindexed["foo", "two"], s > s.median()) - def test_set_column_scalar_with_loc(self, multiindex_dataframe_random_data): + def test_set_column_scalar_with_loc( + self, multiindex_dataframe_random_data, using_array_manager + ): frame = multiindex_dataframe_random_data subset = frame.index[[1, 4, 5]] frame.loc[subset] = 99 assert (frame.loc[subset].values == 99).all() + frame_original = frame.copy() + col = frame["B"] col[subset] = 97 - assert (frame.loc[subset, "B"] == 97).all() + if using_array_manager: + # chained setitem doesn't work with CoW + tm.assert_frame_equal(frame, frame_original) + else: + assert (frame.loc[subset, "B"] == 97).all() def test_nonunique_assignment_1750(self): df = DataFrame( @@ -490,21 +498,31 @@ def test_frame_setitem_view_direct(multiindex_dataframe_random_data): assert (df["foo"].values == 0).all() -def test_frame_setitem_copy_raises(multiindex_dataframe_random_data): +def test_frame_setitem_copy_raises( + multiindex_dataframe_random_data, using_array_manager +): # will raise/warn as its chained assignment df = multiindex_dataframe_random_data.T - msg = "A value is trying to be set on a copy of a slice from a DataFrame" - with pytest.raises(com.SettingWithCopyError, match=msg): + if using_array_manager: + # TODO(CoW) it would be nice if this could still warn/raise df["foo"]["one"] = 2 + else: + msg = "A value is trying to be set on a copy of a slice from a DataFrame" + with pytest.raises(com.SettingWithCopyError, match=msg): + df["foo"]["one"] = 2 -def test_frame_setitem_copy_no_write(multiindex_dataframe_random_data): +def test_frame_setitem_copy_no_write( + multiindex_dataframe_random_data, using_array_manager +): frame = multiindex_dataframe_random_data.T expected = frame df = frame.copy() - msg = "A value is trying to be set on a copy of a slice from a DataFrame" - with pytest.raises(com.SettingWithCopyError, match=msg): + if using_array_manager: df["foo"]["one"] = 2 + else: + msg = "A value is trying to be set 
on a copy of a slice from a DataFrame" + with pytest.raises(com.SettingWithCopyError, match=msg): + df["foo"]["one"] = 2 - result = df - tm.assert_frame_equal(result, expected) + tm.assert_frame_equal(df, expected) diff --git a/pandas/tests/indexing/test_chaining_and_caching.py b/pandas/tests/indexing/test_chaining_and_caching.py index 90bda69eaf139..f3c06e04c81f6 100644 --- a/pandas/tests/indexing/test_chaining_and_caching.py +++ b/pandas/tests/indexing/test_chaining_and_caching.py @@ -31,7 +31,7 @@ def random_text(nobs=100): class TestCaching: - def test_slice_consolidate_invalidate_item_cache(self): + def test_slice_consolidate_invalidate_item_cache(self, using_array_manager): # this is chained assignment, but will 'work' with option_context("chained_assignment", None): @@ -51,7 +51,11 @@ def test_slice_consolidate_invalidate_item_cache(self): # Assignment to wrong series df["bb"].iloc[0] = 0.17 df._clear_item_cache() - tm.assert_almost_equal(df["bb"][0], 0.17) + if not using_array_manager: + tm.assert_almost_equal(df["bb"][0], 0.17) + else: + # with ArrayManager, parent is not mutated with chained assignment + tm.assert_almost_equal(df["bb"][0], 2.2) @pytest.mark.parametrize("do_ref", [True, False]) def test_setitem_cache_updating(self, do_ref): @@ -70,7 +74,7 @@ def test_setitem_cache_updating(self, do_ref): assert df.loc[0, "c"] == 0.0 assert df.loc[7, "c"] == 1.0 - def test_setitem_cache_updating_slices(self): + def test_setitem_cache_updating_slices(self, using_array_manager): # GH 7084 # not updating cache on series setting with slices expected = DataFrame( @@ -91,12 +95,17 @@ def test_setitem_cache_updating_slices(self): # try via a chain indexing # this actually works out = DataFrame({"A": [0, 0, 0]}, index=date_range("5/7/2014", "5/9/2014")) + out_original = out.copy() for ix, row in df.iterrows(): v = out[row["C"]][six:eix] + row["D"] out[row["C"]][six:eix] = v - tm.assert_frame_equal(out, expected) - tm.assert_series_equal(out["A"], expected["A"]) + 
if not using_array_manager: + tm.assert_frame_equal(out, expected) + tm.assert_series_equal(out["A"], expected["A"]) + else: + tm.assert_frame_equal(out, out_original) + tm.assert_series_equal(out["A"], out_original["A"]) out = DataFrame({"A": [0, 0, 0]}, index=date_range("5/7/2014", "5/9/2014")) for ix, row in df.iterrows(): @@ -122,6 +131,8 @@ def test_altering_series_clears_parent_cache(self): class TestChaining: + # TODO(CoW) fix Series setitem with mask + @td.skip_array_manager_not_yet_implemented def test_setitem_chained_setfault(self): # GH6026 @@ -157,18 +168,22 @@ def test_setitem_chained_setfault(self): tm.assert_frame_equal(result, expected) @pytest.mark.arm_slow - def test_detect_chained_assignment(self): + def test_detect_chained_assignment(self, using_array_manager): pd.set_option("chained_assignment", "raise") # work with the chain expected = DataFrame([[-5, 1], [-6, 3]], columns=list("AB")) df = DataFrame(np.arange(4).reshape(2, 2), columns=list("AB"), dtype="int64") + df_original = df.copy() assert df._is_copy is None df["A"][0] = -5 df["A"][1] = -6 - tm.assert_frame_equal(df, expected) + if using_array_manager: + tm.assert_frame_equal(df, df_original) + else: + tm.assert_frame_equal(df, expected) @pytest.mark.arm_slow def test_detect_chained_assignment_raises(self, using_array_manager): @@ -180,6 +195,7 @@ def test_detect_chained_assignment_raises(self, using_array_manager): "B": np.array(np.arange(2, 4), dtype=np.float64), } ) + df_original = df.copy() assert df._is_copy is None if not using_array_manager: @@ -196,12 +212,10 @@ def test_detect_chained_assignment_raises(self, using_array_manager): # a mixed dataframe df["A"][0] = -5 df["A"][1] = -6 - expected = DataFrame([[-5, 2], [-6, 3]], columns=list("AB")) - expected["B"] = expected["B"].astype("float64") - tm.assert_frame_equal(df, expected) + tm.assert_frame_equal(df, df_original) @pytest.mark.arm_slow - def test_detect_chained_assignment_fails(self): + def 
test_detect_chained_assignment_fails(self, using_array_manager): # Using a copy (the chain), fails df = DataFrame( @@ -211,11 +225,15 @@ def test_detect_chained_assignment_fails(self): } ) - with pytest.raises(com.SettingWithCopyError, match=msg): + if using_array_manager: + # TODO(CoW) can we still warn here? df.loc[0]["A"] = -5 + else: + with pytest.raises(com.SettingWithCopyError, match=msg): + df.loc[0]["A"] = -5 @pytest.mark.arm_slow - def test_detect_chained_assignment_doc_example(self): + def test_detect_chained_assignment_doc_example(self, using_array_manager): # Doc example df = DataFrame( @@ -226,30 +244,36 @@ def test_detect_chained_assignment_doc_example(self): ) assert df._is_copy is None - with pytest.raises(com.SettingWithCopyError, match=msg): + if using_array_manager: + # TODO(CoW) can we still warn here? indexer = df.a.str.startswith("o") df[indexer]["c"] = 42 + else: + with pytest.raises(com.SettingWithCopyError, match=msg): + indexer = df.a.str.startswith("o") + df[indexer]["c"] = 42 @pytest.mark.arm_slow def test_detect_chained_assignment_object_dtype(self, using_array_manager): expected = DataFrame({"A": [111, "bbb", "ccc"], "B": [1, 2, 3]}) df = DataFrame({"A": ["aaa", "bbb", "ccc"], "B": [1, 2, 3]}) - - with pytest.raises(com.SettingWithCopyError, match=msg): - df.loc[0]["A"] = 111 + df_original = df.copy() if not using_array_manager: + with pytest.raises(com.SettingWithCopyError, match=msg): + df.loc[0]["A"] = 111 + with pytest.raises(com.SettingWithCopyError, match=msg): df["A"][0] = 111 df.loc[0, "A"] = 111 + tm.assert_frame_equal(df, expected) else: - # INFO(ArrayManager) for ArrayManager it doesn't matter that it's - # a mixed dataframe + # TODO(CoW) can we still warn here? 
df["A"][0] = 111 - - tm.assert_frame_equal(df, expected) + df.loc[0]["A"] = 111 + tm.assert_frame_equal(df, df_original) @pytest.mark.arm_slow def test_detect_chained_assignment_is_copy_pickle(self): @@ -297,6 +321,7 @@ def test_detect_chained_assignment_implicit_take(self): df["letters"] = df["letters"].apply(str.lower) @pytest.mark.arm_slow + @td.skip_array_manager_invalid_test # _is_copy is not always set for AM def test_detect_chained_assignment_implicit_take2(self): # Implicitly take 2 @@ -354,15 +379,21 @@ def test_detect_chained_assignment_false_positives(self): str(df) @pytest.mark.arm_slow - def test_detect_chained_assignment_undefined_column(self): + def test_detect_chained_assignment_undefined_column(self, using_array_manager): # from SO: # https://stackoverflow.com/questions/24054495/potential-bug-setting-value-for-undefined-column-using-iloc df = DataFrame(np.arange(0, 9), columns=["count"]) df["group"] = "b" + df_original = df.copy() - with pytest.raises(com.SettingWithCopyError, match=msg): + if using_array_manager: + # TODO(CoW) can we still warn here? 
df.iloc[0:5]["group"] = "a" + tm.assert_frame_equal(df, df_original) + else: + with pytest.raises(com.SettingWithCopyError, match=msg): + df.iloc[0:5]["group"] = "a" @pytest.mark.arm_slow def test_detect_chained_assignment_changing_dtype(self, using_array_manager): @@ -376,32 +407,39 @@ def test_detect_chained_assignment_changing_dtype(self, using_array_manager): "D": ["a", "b", "c", "d", "e"], } ) + df_original = df.copy() - with pytest.raises(com.SettingWithCopyError, match=msg): - df.loc[2]["D"] = "foo" + if not using_array_manager: + with pytest.raises(com.SettingWithCopyError, match=msg): + df.loc[2]["D"] = "foo" - with pytest.raises(com.SettingWithCopyError, match=msg): - df.loc[2]["C"] = "foo" + with pytest.raises(com.SettingWithCopyError, match=msg): + df.loc[2]["C"] = "foo" - if not using_array_manager: with pytest.raises(com.SettingWithCopyError, match=msg): df["C"][2] = "foo" else: - # INFO(ArrayManager) for ArrayManager it doesn't matter if it's - # changing the dtype or not + # TODO(CoW) can we still warn here? 
+ # df.loc[2]["D"] = "foo" + # df.loc[2]["C"] = "foo" df["C"][2] = "foo" - assert df.loc[2, "C"] == "foo" + tm.assert_frame_equal(df, df_original) - def test_setting_with_copy_bug(self): + def test_setting_with_copy_bug(self, using_array_manager): # operating on a copy df = DataFrame( {"a": list(range(4)), "b": list("ab.."), "c": ["a", "b", np.nan, "d"]} ) + df_original = df.copy() mask = pd.isna(df.c) - with pytest.raises(com.SettingWithCopyError, match=msg): + if not using_array_manager: + with pytest.raises(com.SettingWithCopyError, match=msg): + df[["c"]][mask] = df[["b"]][mask] + else: df[["c"]][mask] = df[["b"]][mask] + tm.assert_frame_equal(df, df_original) def test_setting_with_copy_bug_no_warning(self): # invalid warning as we are returning a new object @@ -412,6 +450,7 @@ def test_setting_with_copy_bug_no_warning(self): # this should not raise df2["y"] = ["g", "h", "i"] + @td.skip_array_manager_not_yet_implemented # TODO(CoW) can we still warn/raise? def test_detect_chained_assignment_warnings_errors(self): df = DataFrame({"A": ["aaa", "bbb", "ccc"], "B": [1, 2, 3]}) with option_context("chained_assignment", "warn"): @@ -422,28 +461,44 @@ def test_detect_chained_assignment_warnings_errors(self): with pytest.raises(com.SettingWithCopyError, match=msg): df.loc[0]["A"] = 111 - def test_detect_chained_assignment_warnings_filter_and_dupe_cols(self): + def test_detect_chained_assignment_warnings_filter_and_dupe_cols( + self, using_array_manager + ): # xref gh-13017. 
with option_context("chained_assignment", "warn"): df = DataFrame([[1, 2, 3], [4, 5, 6], [7, 8, -9]], columns=["a", "a", "c"]) + df_original = df.copy() - with tm.assert_produces_warning(com.SettingWithCopyWarning): + warn = None if using_array_manager else com.SettingWithCopyWarning + with tm.assert_produces_warning(warn): df.c.loc[df.c > 0] = None expected = DataFrame( [[1, 2, 3], [4, 5, 6], [7, 8, -9]], columns=["a", "a", "c"] ) - tm.assert_frame_equal(df, expected) + if using_array_manager: + tm.assert_frame_equal(df, df_original) + else: + tm.assert_frame_equal(df, expected) @pytest.mark.parametrize("rhs", [3, DataFrame({0: [1, 2, 3, 4]})]) - def test_detect_chained_assignment_warning_stacklevel(self, rhs): + def test_detect_chained_assignment_warning_stacklevel( + self, rhs, using_array_manager + ): # GH#42570 df = DataFrame(np.arange(25).reshape(5, 5)) + df_original = df.copy() chained = df.loc[:3] with option_context("chained_assignment", "warn"): - with tm.assert_produces_warning(com.SettingWithCopyWarning) as t: - chained[2] = rhs - assert t[0].filename == __file__ + if not using_array_manager: + with tm.assert_produces_warning(com.SettingWithCopyWarning) as t: + chained[2] = rhs + assert t[0].filename == __file__ + else: + # INFO(CoW) no warning, and original dataframe not changed + with tm.assert_produces_warning(None): + chained[2] = rhs + tm.assert_frame_equal(df, df_original) # TODO(ArrayManager) fast_xs with array-like scalars is not yet working @td.skip_array_manager_not_yet_implemented @@ -494,7 +549,7 @@ def test_cache_updating2(self): expected = Series([0, 0, 0, 2, 0], name="f") tm.assert_series_equal(df.f, expected) - def test_iloc_setitem_chained_assignment(self): + def test_iloc_setitem_chained_assignment(self, using_array_manager): # GH#3970 with option_context("chained_assignment", None): df = DataFrame({"aa": range(5), "bb": [2.2] * 5}) @@ -508,7 +563,10 @@ def test_iloc_setitem_chained_assignment(self): df_tmp = df.iloc[ck] # noqa 
df["bb"].iloc[0] = 0.15 - assert df["bb"].iloc[0] == 0.15 + if not using_array_manager: + assert df["bb"].iloc[0] == 0.15 + else: + assert df["bb"].iloc[0] == 2.2 def test_getitem_loc_assignment_slice_state(self): # GH 13569 diff --git a/pandas/tests/indexing/test_copy_on_write.py b/pandas/tests/indexing/test_copy_on_write.py new file mode 100644 index 0000000000000..ce5c6e007a3d5 --- /dev/null +++ b/pandas/tests/indexing/test_copy_on_write.py @@ -0,0 +1,598 @@ +import numpy as np +import pytest + +import pandas as pd +import pandas._testing as tm +import pandas.core.common as com + + +def test_copy(using_array_manager): + df = pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6], "c": [0.1, 0.2, 0.3]}) + df_copy = df.copy() + + # the deep copy doesn't share memory + assert not np.may_share_memory(df_copy["a"].values, df["a"].values) + if using_array_manager: + assert df_copy._mgr.refs is None + + # mutating copy doesn't mutate original + df_copy.iloc[0, 0] = 0 + assert df.iloc[0, 0] == 1 + + # copy of df + copy of subset + # normal copy -> refs are removed, no mutation of parent + # deep=None -> refs are still generated / kept + # copy=False -> refs are kept? But if we then edit it + + # df = ... + # subset = df[1:3] + # subset_shallow_copy = subset.copy(deep=False) + # -> expected behaviour: mutating subset_shallow_copy should mutate subset + # but not mutate parent df + # - if we keep refs -> we copy on setitem -> subset is not mutated + # - if we remove refs -> we don't copy on setitem, but then also parent df + # is mutated + # -> disallow taking a shallow copy of a DataFrame that is referencing other arrays? 
+ + +def test_copy_shallow(using_array_manager): + df = pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6], "c": [0.1, 0.2, 0.3]}) + df_copy = df.copy(deep=False) + + # the shallow copy still shares memory + assert np.may_share_memory(df_copy["a"].values, df["a"].values) + if using_array_manager: + assert df_copy._mgr.refs is not None + + if using_array_manager: + # mutating shallow copy doesn't mutate original + df_copy.iloc[0, 0] = 0 + assert df.iloc[0, 0] == 1 + # mutating triggered a copy-on-write -> no longer shares memory + assert not np.may_share_memory(df_copy["a"].values, df["a"].values) + # but still shares memory for the other columns + assert np.may_share_memory(df_copy["b"].values, df["b"].values) + else: + # mutating shallow copy does mutate original + df_copy.iloc[0, 0] = 0 + assert df.iloc[0, 0] == 0 + # and still shares memory + assert np.may_share_memory(df_copy["a"].values, df["a"].values) + + +# ----------------------------------------------------------------------------- +# DataFrame methods returning new DataFrame using shallow copy + + +def test_reset_index(using_array_manager): + # Case: resetting the index (i.e. 
adding a new column) + mutating the + # resulting dataframe + df = pd.DataFrame( + {"a": [1, 2, 3], "b": [4, 5, 6], "c": [0.1, 0.2, 0.3]}, index=[10, 11, 12] + ) + df_orig = df.copy() + df2 = df.reset_index() + + if using_array_manager: + # still shares memory (df2 is a shallow copy) + assert np.may_share_memory(df2["b"].values, df["b"].values) + assert np.may_share_memory(df2["c"].values, df["c"].values) + # mutating df2 triggers a copy-on-write for that column + df2.iloc[0, 2] = 0 + assert not np.may_share_memory(df2["b"].values, df["b"].values) + if using_array_manager: + assert np.may_share_memory(df2["c"].values, df["c"].values) + tm.assert_frame_equal(df, df_orig) + + +def test_rename_columns(using_array_manager): + # Case: renaming columns returns a new dataframe + # + afterwards modifying the result + df = pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6], "c": [0.1, 0.2, 0.3]}) + df_orig = df.copy() + df2 = df.rename(columns=str.upper) + + if using_array_manager: + assert np.may_share_memory(df2["A"].values, df["a"].values) + df2.iloc[0, 0] = 0 + assert not np.may_share_memory(df2["A"].values, df["a"].values) + if using_array_manager: + assert np.may_share_memory(df2["C"].values, df["c"].values) + expected = pd.DataFrame({"A": [0, 2, 3], "B": [4, 5, 6], "C": [0.1, 0.2, 0.3]}) + tm.assert_frame_equal(df2, expected) + tm.assert_frame_equal(df, df_orig) + + +def test_rename_columns_modify_parent(using_array_manager): + # Case: renaming columns returns a new dataframe + # + afterwards modifying the original (parent) dataframe + df = pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6], "c": [0.1, 0.2, 0.3]}) + df2 = df.rename(columns=str.upper) + df2_orig = df2.copy() + + if using_array_manager: + assert np.may_share_memory(df2["A"].values, df["a"].values) + df.iloc[0, 0] = 0 + assert not np.may_share_memory(df2["A"].values, df["a"].values) + if using_array_manager: + assert np.may_share_memory(df2["C"].values, df["c"].values) + expected = pd.DataFrame({"a": [0, 2, 3], 
"b": [4, 5, 6], "c": [0.1, 0.2, 0.3]}) + tm.assert_frame_equal(df, expected) + tm.assert_frame_equal(df2, df2_orig) + + +def test_reindex_columns(using_array_manager): + # Case: reindexing the column returns a new dataframe + # + afterwards modifying the result + df = pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6], "c": [0.1, 0.2, 0.3]}) + df_orig = df.copy() + df2 = df.reindex(columns=["a", "c"]) + + if using_array_manager: + # still shares memory (df2 is a shallow copy) + assert np.may_share_memory(df2["a"].values, df["a"].values) + else: + assert not np.may_share_memory(df2["a"].values, df["a"].values) + # mutating df2 triggers a copy-on-write for that column + df2.iloc[0, 0] = 0 + assert not np.may_share_memory(df2["a"].values, df["a"].values) + if using_array_manager: + assert np.may_share_memory(df2["c"].values, df["c"].values) + tm.assert_frame_equal(df, df_orig) + + +# ----------------------------------------------------------------------------- +# Indexing operations taking subset + modifying the subset/parent + + +def test_subset_column_selection(using_array_manager): + # Case: taking a subset of the columns of a DataFrame + # + afterwards modifying the subset + df = pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6], "c": [0.1, 0.2, 0.3]}) + df_orig = df.copy() + + subset = df[["a", "c"]] + if using_array_manager: + # the subset shares memory ... + assert np.may_share_memory(subset["a"].values, df["a"].values) + # ... 
but uses CoW when being modified + subset.iloc[0, 0] = 0 + else: + with pd.option_context("chained_assignment", "warn"): + with tm.assert_produces_warning(com.SettingWithCopyWarning): + subset.iloc[0, 0] = 0 + + assert not np.may_share_memory(subset["a"].values, df["a"].values) + + expected = pd.DataFrame({"a": [0, 2, 3], "c": [0.1, 0.2, 0.3]}) + tm.assert_frame_equal(subset, expected) + tm.assert_frame_equal(df, df_orig) + + +def test_subset_column_selection_modify_parent(using_array_manager): + # Case: taking a subset of the columns of a DataFrame + # + afterwards modifying the parent + df = pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6], "c": [0.1, 0.2, 0.3]}) + + subset = df[["a", "c"]] + if using_array_manager: + # the subset shares memory ... + assert np.may_share_memory(subset["a"].values, df["a"].values) + # ... but the parent uses CoW when it is modified + df.iloc[0, 0] = 0 + + assert not np.may_share_memory(subset["a"].values, df["a"].values) + if using_array_manager: + assert np.may_share_memory(subset["c"].values, df["c"].values) + + expected = pd.DataFrame({"a": [1, 2, 3], "c": [0.1, 0.2, 0.3]}) + tm.assert_frame_equal(subset, expected) + + +def test_subset_row_slice(using_array_manager): + # Case: taking a subset of the rows of a DataFrame using a slice + # + afterwards modifying the subset + df = pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6], "c": [0.1, 0.2, 0.3]}) + df_orig = df.copy() + + subset = df[1:3] + subset._mgr._verify_integrity() + + assert np.may_share_memory(subset["a"].values, df["a"].values) + + if using_array_manager: + subset.iloc[0, 0] = 0 + assert not np.may_share_memory(subset["a"].values, df["a"].values) + + else: + with pd.option_context("chained_assignment", "warn"): + with tm.assert_produces_warning(com.SettingWithCopyWarning): + subset.iloc[0, 0] = 0 + + expected = pd.DataFrame( + {"a": [0, 3], "b": [5, 6], "c": [0.2, 0.3]}, index=range(1, 3) + ) + tm.assert_frame_equal(subset, expected) + if using_array_manager: + # original 
parent dataframe is not modified (CoW) + tm.assert_frame_equal(df, df_orig) + else: + # original parent dataframe is actually updated + df_orig.iloc[1, 0] = 0 + tm.assert_frame_equal(df, df_orig) + + +def test_subset_column_slice(using_array_manager): + # Case: taking a subset of the columns of a DataFrame using a slice + # + afterwards modifying the subset + df = pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6], "c": [0.1, 0.2, 0.3]}) + df_orig = df.copy() + + subset = df.iloc[:, 1:] + subset._mgr._verify_integrity() + + if using_array_manager: + assert np.may_share_memory(subset["b"].values, df["b"].values) + + subset.iloc[0, 0] = 0 + assert not np.may_share_memory(subset["b"].values, df["b"].values) + + else: + subset.iloc[0, 0] = 0 + + expected = pd.DataFrame({"b": [0, 5, 6], "c": [0.1, 0.2, 0.3]}) + tm.assert_frame_equal(subset, expected) + # original parent dataframe is not modified (also not for BlockManager case) + tm.assert_frame_equal(df, df_orig) + + +@pytest.mark.parametrize( + "indexer", + [slice(0, 2), np.array([True, True, False]), np.array([0, 1])], + ids=["slice", "mask", "array"], +) +def test_subset_set_with_row_indexer(indexer_si, indexer, using_array_manager): + # Case: setting values with a row indexer on a viewing subset + # subset[indexer] = value and subset.iloc[indexer] = value + df = pd.DataFrame({"a": [1, 2, 3, 4], "b": [4, 5, 6, 7], "c": [0.1, 0.2, 0.3, 0.4]}) + df_orig = df.copy() + subset = df[1:4] + + if ( + indexer_si is tm.setitem + and isinstance(indexer, np.ndarray) + and indexer.dtype == "int" + ): + pytest.skip("setitem with labels selects on columns") + + if using_array_manager: + indexer_si(subset)[indexer] = 0 + else: + with pd.option_context("chained_assignment", "warn"): + with tm.assert_produces_warning(com.SettingWithCopyWarning): + indexer_si(subset)[indexer] = 0 + + expected = pd.DataFrame( + {"a": [0, 0, 4], "b": [0, 0, 7], "c": [0.0, 0.0, 0.4]}, index=range(1, 4) + ) + tm.assert_frame_equal(subset, expected) + if 
using_array_manager: + # original parent dataframe is not modified (CoW) + tm.assert_frame_equal(df, df_orig) + else: + # original parent dataframe is actually updated + df_orig[1:3] = 0 + tm.assert_frame_equal(df, df_orig) + + +def test_subset_set_with_mask(using_array_manager): + # Case: setting values with a mask on a viewing subset: subset[mask] = value + df = pd.DataFrame({"a": [1, 2, 3, 4], "b": [4, 5, 6, 7], "c": [0.1, 0.2, 0.3, 0.4]}) + df_orig = df.copy() + subset = df[1:4] + + mask = subset > 3 + + if using_array_manager: + subset[mask] = 0 + else: + with pd.option_context("chained_assignment", "warn"): + with tm.assert_produces_warning(com.SettingWithCopyWarning): + subset[mask] = 0 + + expected = pd.DataFrame( + {"a": [2, 3, 0], "b": [0, 0, 0], "c": [0.20, 0.3, 0.4]}, index=range(1, 4) + ) + tm.assert_frame_equal(subset, expected) + if using_array_manager: + # original parent dataframe is not modified (CoW) + tm.assert_frame_equal(df, df_orig) + else: + # original parent dataframe is actually updated + df_orig.loc[3, "a"] = 0 + df_orig.loc[1:3, "b"] = 0 + tm.assert_frame_equal(df, df_orig) + + +def test_subset_set_column(using_array_manager): + # Case: setting a single column on a viewing subset -> subset[col] = value + df = pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6], "c": [0.1, 0.2, 0.3]}) + df_orig = df.copy() + subset = df[1:3] + + if using_array_manager: + subset["a"] = np.array([10, 11], dtype="int64") + else: + with pd.option_context("chained_assignment", "warn"): + with tm.assert_produces_warning(com.SettingWithCopyWarning): + subset["a"] = np.array([10, 11], dtype="int64") + + expected = pd.DataFrame( + {"a": [10, 11], "b": [5, 6], "c": [0.2, 0.3]}, index=range(1, 3) + ) + tm.assert_frame_equal(subset, expected) + tm.assert_frame_equal(df, df_orig) + + +def test_subset_set_columns_single_block(using_array_manager): + # Case: setting multiple columns on a viewing subset + # -> subset[[col1, col2]] = value + df = pd.DataFrame({"a": [1, 2, 3], 
"b": [4, 5, 6], "c": [7, 8, 9]}) + df_orig = df.copy() + subset = df[1:3] + + if using_array_manager: + subset[["a", "c"]] = 0 + else: + with pd.option_context("chained_assignment", "warn"): + with tm.assert_produces_warning(com.SettingWithCopyWarning): + subset[["a", "c"]] = 0 + + expected = pd.DataFrame({"a": [0, 0], "b": [5, 6], "c": [0, 0]}, index=range(1, 3)) + tm.assert_frame_equal(subset, expected) + tm.assert_frame_equal(df, df_orig) + + +def test_subset_set_columns_mixed_block(using_array_manager): + # Case: setting multiple columns on a viewing subset + # -> subset[[col1, col2]] = value + df = pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6], "c": [0.1, 0.2, 0.3]}) + df_orig = df.copy() + subset = df[1:3] + + if using_array_manager: + subset[["a", "c"]] = 0 + else: + with pd.option_context("chained_assignment", "warn"): + with tm.assert_produces_warning(com.SettingWithCopyWarning): + subset[["a", "c"]] = 0 + + expected = pd.DataFrame({"a": [0, 0], "b": [5, 6], "c": [0, 0]}, index=range(1, 3)) + tm.assert_frame_equal(subset, expected) + tm.assert_frame_equal(df, df_orig) + + +@pytest.mark.parametrize( + "indexer", + [slice("a", "b"), np.array([True, True, False]), ["a", "b"]], + ids=["slice", "mask", "array"], +) +def test_subset_set_with_column_indexer(indexer, using_array_manager): + # Case: setting multiple columns with a column indexer on a viewing subset + # -> subset.loc[:, [col1, col2]] = value + df = pd.DataFrame({"a": [1, 2, 3], "b": [0.1, 0.2, 0.3], "c": [4, 5, 6]}) + df_orig = df.copy() + subset = df[1:3] + + if using_array_manager: + subset.loc[:, indexer] = 0 + else: + with pd.option_context("chained_assignment", "warn"): + with tm.assert_produces_warning(com.SettingWithCopyWarning): + subset.loc[:, indexer] = 0 + + expected = pd.DataFrame( + {"a": [0, 0], "b": [0.0, 0.0], "c": [5, 6]}, index=range(1, 3) + ) + # TODO full row slice .loc[:, idx] update inplace instead of overwrite? 
+ expected["b"] = expected["b"].astype("int64") + tm.assert_frame_equal(subset, expected) + if using_array_manager: + tm.assert_frame_equal(df, df_orig) + else: + # In the mixed case with BlockManager, only one of the two columns is + # mutated in the parent frame .. + df_orig.loc[1:2, ["a"]] = 0 + tm.assert_frame_equal(df, df_orig) + + +# TODO add more tests modifying the parent + +# ----------------------------------------------------------------------------- +# Series -- Indexing operations taking subset + modifying the subset/parent + + +def test_series_getitem_slice(using_array_manager): + # Case: taking a slice of a Series + afterwards modifying the subset + s = pd.Series([1, 2, 3], index=["a", "b", "c"]) + s_orig = s.copy() + + subset = s[:] + assert np.may_share_memory(subset.values, s.values) + + subset.iloc[0] = 0 + + if using_array_manager: + assert not np.may_share_memory(subset.values, s.values) + + expected = pd.Series([0, 2, 3], index=["a", "b", "c"]) + tm.assert_series_equal(subset, expected) + + if using_array_manager: + # original parent series is not modified (CoW) + tm.assert_series_equal(s, s_orig) + else: + # original parent series is actually updated + assert s.iloc[0] == 0 + + +@pytest.mark.parametrize( + "indexer", + [slice(0, 2), np.array([True, True, False]), np.array([0, 1])], + ids=["slice", "mask", "array"], +) +def test_series_subset_set_with_indexer(indexer_si, indexer, using_array_manager): + # Case: setting values in a viewing Series with an indexer + s = pd.Series([1, 2, 3], index=["a", "b", "c"]) + s_orig = s.copy() + subset = s[:] + + # if ( + # indexer_si is tm.setitem + # and isinstance(indexer, np.ndarray) + # and indexer.dtype == "int" + # ): + # pytest.skip("setitem with labels selects on columns") + + expected = pd.Series([0, 0, 3], index=["a", "b", "c"]) + indexer_si(subset)[indexer] = 0 + tm.assert_series_equal(subset, expected) + + if using_array_manager: + tm.assert_series_equal(s, s_orig) + else: + 
tm.assert_series_equal(s, expected) + + expected = pd.DataFrame( + {"a": [0, 0, 4], "b": [0, 0, 7], "c": [0.0, 0.0, 0.4]}, index=range(1, 4) + ) + + +# ----------------------------------------------------------------------------- +# del operator + + +def test_del_frame(using_array_manager): + # Case: deleting a column with `del` on a viewing child dataframe should + # not modify parent + update the references + df = pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6], "c": [0.1, 0.2, 0.3]}) + df_orig = df.copy() + df2 = df[:] + + assert np.may_share_memory(df["a"].values, df2["a"].values) + + del df2["b"] + + assert np.may_share_memory(df["a"].values, df2["a"].values) + tm.assert_frame_equal(df, df_orig) + tm.assert_frame_equal(df2, df_orig[["a", "c"]]) + df2._mgr._verify_integrity() + + # TODO in theory modifying column "b" of the parent wouldn't need a CoW + # but the weakref is still alive and so we still perform CoW + + if using_array_manager: + # modifying child after deleting a column still doesn't update parent + df2.loc[0, "a"] = 100 + tm.assert_frame_equal(df, df_orig) + + +def test_del_series(using_array_manager): + s = pd.Series([1, 2, 3], index=["a", "b", "c"]) + s_orig = s.copy() + s2 = s[:] + + assert np.may_share_memory(s.values, s2.values) + + del s2["a"] + + assert not np.may_share_memory(s.values, s2.values) + tm.assert_series_equal(s, s_orig) + tm.assert_series_equal(s2, s_orig[["b", "c"]]) + + # modifying s2 doesn't need copy on write (due to `del`, s2 is backed by new array) + values = s2.values + s2.loc["b"] = 100 + assert values[0] == 100 + + +# ----------------------------------------------------------------------------- +# Accessing column as Series + + +def test_column_as_series(using_array_manager): + # Case: selecting a single column now also uses Copy-on-Write + df = pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6], "c": [0.1, 0.2, 0.3]}) + df_orig = df.copy() + + s = df["a"] + + assert np.may_share_memory(s.values, df["a"].values) + + if 
using_array_manager: + s[0] = 0 + else: + with pd.option_context("chained_assignment", "warn"): + with tm.assert_produces_warning(com.SettingWithCopyWarning): + s[0] = 0 + + if using_array_manager: + # assert not np.may_share_memory(s.values, df["a"].values) + tm.assert_frame_equal(df, df_orig) + else: + df_orig.iloc[0, 0] = 0 + tm.assert_frame_equal(df, df_orig) + + +def test_column_as_series_set_with_upcast(using_array_manager): + # Case: selecting a single column now also uses Copy-on-Write -> when + # setting a value causes an upcast, we don't need to update the parent + # DataFrame through the cache mechanism + df = pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6], "c": [0.1, 0.2, 0.3]}) + df_orig = df.copy() + + s = df["a"] + if using_array_manager: + s[0] = "foo" + else: + with pd.option_context("chained_assignment", "warn"): + with tm.assert_produces_warning(com.SettingWithCopyWarning): + s[0] = "foo" + + expected = pd.Series(["foo", 2, 3], dtype=object, name="a") + tm.assert_series_equal(s, expected) + if using_array_manager: + tm.assert_frame_equal(df, df_orig) + # ensure cached series on getitem is not the changed series + tm.assert_series_equal(df["a"], df_orig["a"]) + else: + df_orig["a"] = expected + tm.assert_frame_equal(df, df_orig) + + +# TODO add tests for other indexing methods on the Series + + +def test_dataframe_add_column_from_series(): + # Case: adding a new column to a DataFrame from an existing column/series + # -> always already takes a copy on assignment + # (no change in behaviour here) + # TODO can we achieve the same behaviour with Copy-on-Write? 
+ df = pd.DataFrame({"a": [1, 2, 3], "b": [0.1, 0.2, 0.3]}) + + s = pd.Series([10, 11, 12]) + df["new"] = s + assert not np.may_share_memory(df["new"].values, s.values) + + # editing series -> doesn't modify column in frame + s[0] = 0 + expected = pd.DataFrame({"a": [1, 2, 3], "b": [0.1, 0.2, 0.3], "new": [10, 11, 12]}) + tm.assert_frame_equal(df, expected) + + # editing column in frame -> doesn't modify series + df.loc[2, "new"] = 100 + expected = pd.Series([0, 11, 12]) + tm.assert_series_equal(s, expected) + + +# TODO add tests for constructors diff --git a/pandas/tests/indexing/test_iloc.py b/pandas/tests/indexing/test_iloc.py index e088f1ce87a6a..94b82c7bf0e9e 100644 --- a/pandas/tests/indexing/test_iloc.py +++ b/pandas/tests/indexing/test_iloc.py @@ -80,10 +80,13 @@ def test_iloc_setitem_fullcol_categorical(self, indexer, key, using_array_manage indexer(df)[key, 0] = cat overwrite = isinstance(key, slice) and key == slice(None) + if using_array_manager and indexer == tm.loc: + # TODO(ArrayManager) with loc, slice(3) gets converted into slice(0, 3) + # which is then considered as "full" slice and does overwrite. 
For iloc + # this conversion is not done, and so it doesn't overwrite + overwrite = overwrite or (isinstance(key, slice) and key == slice(3)) - if overwrite or using_array_manager: - # TODO(ArrayManager) we always overwrite because ArrayManager takes - # the "split" path, which still overwrites + if overwrite: # TODO: GH#39986 this probably shouldn't behave differently expected = DataFrame({0: cat}) assert not np.shares_memory(df.values, orig_vals) @@ -104,6 +107,8 @@ def test_iloc_setitem_fullcol_categorical(self, indexer, key, using_array_manage orig_vals = df.values indexer(df)[key, 0] = cat expected = DataFrame({0: cat, 1: range(3)}) + if using_array_manager and not overwrite: + expected[0] = expected[0].astype(object) tm.assert_frame_equal(df, expected) @pytest.mark.parametrize("box", [array, Series]) @@ -853,7 +858,11 @@ def test_identity_slice_returns_new_object(self, using_array_manager, request): # should also be a shallow copy original_series[:3] = [7, 8, 9] - assert all(sliced_series[:3] == [7, 8, 9]) + if using_array_manager: + # shallow copy not updated (CoW) + assert all(sliced_series[:3] == [1, 2, 3]) + else: + assert all(sliced_series[:3] == [7, 8, 9]) def test_indexing_zerodim_np_array(self): # GH24919 @@ -1337,8 +1346,9 @@ def test_frame_iloc_setitem_callable(self): class TestILocSeries: - def test_iloc(self): + def test_iloc(self, using_array_manager): ser = Series(np.random.randn(10), index=list(range(0, 20, 2))) + ser_original = ser.copy() for i in range(len(ser)): result = ser.iloc[i] @@ -1352,7 +1362,10 @@ def test_iloc(self): # test slice is a view result[:] = 0 - assert (ser[1:3] == 0).all() + if using_array_manager: + tm.assert_series_equal(ser, ser_original) + else: + assert (ser[1:3] == 0).all() # list of integers result = ser.iloc[[0, 2, 3, 4, 5]] diff --git a/pandas/tests/indexing/test_loc.py b/pandas/tests/indexing/test_loc.py index 2a9ee81b7a23a..3a301608b9c06 100644 --- a/pandas/tests/indexing/test_loc.py +++ 
b/pandas/tests/indexing/test_loc.py @@ -669,29 +669,33 @@ def test_loc_setitem_frame_with_reindex(self, using_array_manager): # setting integer values into a float dataframe with loc is inplace, # so we retain float dtype ser = Series([2, 3, 1], index=[3, 5, 4], dtype=float) - if using_array_manager: - # TODO(ArrayManager) with "split" path, we still overwrite the column - # and therefore don't take the dtype of the underlying object into account - ser = Series([2, 3, 1], index=[3, 5, 4], dtype="int64") expected = DataFrame({"A": ser}) tm.assert_frame_equal(df, expected) - def test_loc_setitem_frame_with_reindex_mixed(self): + def test_loc_setitem_frame_with_reindex_mixed(self, using_array_manager): # GH#40480 df = DataFrame(index=[3, 5, 4], columns=["A", "B"], dtype=float) df["B"] = "string" df.loc[[4, 3, 5], "A"] = np.array([1, 2, 3], dtype="int64") - ser = Series([2, 3, 1], index=[3, 5, 4], dtype="int64") + ser = Series([2, 3, 1], index=[3, 5, 4], dtype=float) + if not using_array_manager: + # when using BlockManager, this takes the "split" path, which + # still overwrites the column + ser = Series([2, 3, 1], index=[3, 5, 4], dtype="int64") expected = DataFrame({"A": ser}) expected["B"] = "string" tm.assert_frame_equal(df, expected) - def test_loc_setitem_frame_with_inverted_slice(self): + def test_loc_setitem_frame_with_inverted_slice(self, using_array_manager): # GH#40480 df = DataFrame(index=[1, 2, 3], columns=["A", "B"], dtype=float) df["B"] = "string" df.loc[slice(3, 0, -1), "A"] = np.array([1, 2, 3], dtype="int64") - expected = DataFrame({"A": [3, 2, 1], "B": "string"}, index=[1, 2, 3]) + expected = DataFrame({"A": [3.0, 2.0, 1.0], "B": "string"}, index=[1, 2, 3]) + if not using_array_manager: + # when using BlockManager, this takes the "split" path, which + # still overwrites the column + expected["A"] = expected["A"].astype("int64") tm.assert_frame_equal(df, expected) # TODO(ArrayManager) "split" path overwrites column and therefore don't take @@ -1065,7 
+1069,10 @@ def test_identity_slice_returns_new_object(self, using_array_manager, request): assert original_series[:] is not original_series original_series[:3] = [7, 8, 9] - assert all(sliced_series[:3] == [7, 8, 9]) + if using_array_manager: + assert all(sliced_series[:3] == [1, 2, 3]) + else: + assert all(sliced_series[:3] == [7, 8, 9]) @pytest.mark.xfail(reason="accidental fix reverted - GH37497") def test_loc_copy_vs_view(self): diff --git a/pandas/tests/indexing/test_partial.py b/pandas/tests/indexing/test_partial.py index 95a9fd227c685..bf5a51333c982 100644 --- a/pandas/tests/indexing/test_partial.py +++ b/pandas/tests/indexing/test_partial.py @@ -278,7 +278,13 @@ def test_partial_setting_frame(self): with pytest.raises(IndexError, match=msg): df.iloc[4, 2] = 5.0 - msg = "index 2 is out of bounds for axis 0 with size 2" + msg = "|".join( + [ + "index 2 is out of bounds for axis 0 with size 2", + # TODO(ArrayManager) improve error message + "list index out of range", + ] + ) with pytest.raises(IndexError, match=msg): df.iat[4, 2] = 5.0 diff --git a/pandas/tests/series/accessors/test_dt_accessor.py b/pandas/tests/series/accessors/test_dt_accessor.py index 48a3ebd25c239..ed1a4371456d7 100644 --- a/pandas/tests/series/accessors/test_dt_accessor.py +++ b/pandas/tests/series/accessors/test_dt_accessor.py @@ -279,17 +279,22 @@ def test_dt_accessor_ambiguous_freq_conversions(self): expected = Series(exp_values, name="xxx") tm.assert_series_equal(ser, expected) - def test_dt_accessor_not_writeable(self): + def test_dt_accessor_not_writeable(self, using_array_manager): # no setting allowed ser = Series(date_range("20130101", periods=5, freq="D"), name="xxx") with pytest.raises(ValueError, match="modifications"): ser.dt.hour = 5 # trying to set a copy - msg = "modifications to a property of a datetimelike.+not supported" - with pd.option_context("chained_assignment", "raise"): - with pytest.raises(com.SettingWithCopyError, match=msg): + if using_array_manager: + # 
TODO(CoW) it would be nice to keep a warning/error for this case + with pd.option_context("chained_assignment", "raise"): ser.dt.hour[0] = 5 + else: + msg = "modifications to a property of a datetimelike.+not supported" + with pd.option_context("chained_assignment", "raise"): + with pytest.raises(com.SettingWithCopyError, match=msg): + ser.dt.hour[0] = 5 @pytest.mark.parametrize( "method, dates", diff --git a/pandas/tests/series/indexing/test_indexing.py b/pandas/tests/series/indexing/test_indexing.py index 8a34882b1e5d4..1f20d15189384 100644 --- a/pandas/tests/series/indexing/test_indexing.py +++ b/pandas/tests/series/indexing/test_indexing.py @@ -5,6 +5,8 @@ import numpy as np import pytest +import pandas.util._test_decorators as td + from pandas import ( DataFrame, IndexSlice, @@ -204,7 +206,8 @@ def test_basic_getitem_setitem_corner(datetime_series): datetime_series[[5, slice(None, None)]] = 2 -def test_slice(string_series, object_series): +def test_slice(string_series, object_series, using_array_manager): + original = string_series.copy() numSlice = string_series[10:20] numSliceEnd = string_series[-10:] objSlice = object_series[10:20] @@ -222,7 +225,11 @@ def test_slice(string_series, object_series): sl = string_series[10:20] sl[:] = 0 - assert (string_series[10:20] == 0).all() + if using_array_manager: + # Doesn't modify parent (CoW) + tm.assert_series_equal(string_series, original) + else: + assert (string_series[10:20] == 0).all() def test_timedelta_assignment(): @@ -239,6 +246,8 @@ def test_timedelta_assignment(): tm.assert_series_equal(s, expected) +# TODO(CoW) what should the update method do? -> deprecate this? 
+@td.skip_array_manager_not_yet_implemented def test_underlying_data_conversion(): # GH 4080 df = DataFrame({c: [1, 2, 3] for c in ["a", "b", "c"]}) diff --git a/pandas/tests/series/methods/test_copy.py b/pandas/tests/series/methods/test_copy.py index 8aa5c14812dc0..26966592d563b 100644 --- a/pandas/tests/series/methods/test_copy.py +++ b/pandas/tests/series/methods/test_copy.py @@ -9,20 +9,28 @@ class TestCopy: - @pytest.mark.parametrize("deep", [None, False, True]) - def test_copy(self, deep): + @pytest.mark.parametrize("deep", ["default", None, False, True]) + def test_copy(self, deep, using_array_manager): ser = Series(np.arange(10), dtype="float64") # default deep is True - if deep is None: + if deep == "default": ser2 = ser.copy() else: ser2 = ser.copy(deep=deep) + if using_array_manager: + # INFO(ArrayManager) a shallow copy doesn't yet copy the data + # but parent will not be modified (CoW) + if deep is None or deep is False: + assert np.may_share_memory(ser.values, ser2.values) + else: + assert not np.may_share_memory(ser.values, ser2.values) + ser2[::2] = np.NaN - if deep is None or deep is True: + if deep == "default" or deep is None or deep is True or using_array_manager: # Did not modify original Series assert np.isnan(ser2[0]) assert not np.isnan(ser[0]) @@ -31,8 +39,8 @@ def test_copy(self, deep): assert np.isnan(ser2[0]) assert np.isnan(ser[0]) - @pytest.mark.parametrize("deep", [None, False, True]) - def test_copy_tzaware(self, deep): + @pytest.mark.parametrize("deep", ["default", None, False, True]) + def test_copy_tzaware(self, deep, using_array_manager): # GH#11794 # copy of tz-aware expected = Series([Timestamp("2012/01/01", tz="UTC")]) @@ -40,15 +48,23 @@ def test_copy_tzaware(self, deep): ser = Series([Timestamp("2012/01/01", tz="UTC")]) - if deep is None: + if deep == "default": ser2 = ser.copy() else: ser2 = ser.copy(deep=deep) + if using_array_manager: + # INFO(ArrayManager) a shallow copy doesn't yet copy the data + # but parent will not 
be modified (CoW) + if deep is None or deep is False: + assert np.may_share_memory(ser.values, ser2.values) + else: + assert not np.may_share_memory(ser.values, ser2.values) + ser2[0] = Timestamp("1999/01/01", tz="UTC") # default deep is True - if deep is None or deep is True: + if deep == "default" or deep is None or deep is True or using_array_manager: # Did not modify original Series tm.assert_series_equal(ser2, expected2) tm.assert_series_equal(ser, expected) diff --git a/pandas/tests/series/methods/test_update.py b/pandas/tests/series/methods/test_update.py index d9d6641d54237..17d05711ffefe 100644 --- a/pandas/tests/series/methods/test_update.py +++ b/pandas/tests/series/methods/test_update.py @@ -14,6 +14,9 @@ class TestUpdate: + + # TODO(CoW) what should the update method do? -> deprecate this? + @td.skip_array_manager_not_yet_implemented def test_update(self): s = Series([1.5, np.nan, 3.0, 4.0, np.nan]) s2 = Series([np.nan, 3.5, np.nan, 5.0])