From 462526e71525ca4f45ae517a8ad9eac3d7a16c11 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Thu, 29 Apr 2021 10:59:38 +0200 Subject: [PATCH 01/21] POC for Copy-on-Write --- pandas/core/frame.py | 4 +- pandas/core/generic.py | 6 +- pandas/core/indexing.py | 6 + pandas/core/internals/array_manager.py | 64 +++- pandas/core/internals/managers.py | 10 +- pandas/core/series.py | 5 +- pandas/tests/indexing/test_copy_on_write.py | 404 ++++++++++++++++++++ 7 files changed, 484 insertions(+), 15 deletions(-) create mode 100644 pandas/tests/indexing/test_copy_on_write.py diff --git a/pandas/core/frame.py b/pandas/core/frame.py index cde9364b55cb2..46a25913e1371 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -4930,7 +4930,7 @@ def rename( index: Renamer | None = None, columns: Renamer | None = None, axis: Axis | None = None, - copy: bool = True, + copy: bool | None = None, inplace: bool = False, level: Level | None = None, errors: str = "ignore", @@ -5764,7 +5764,7 @@ class max type if inplace: new_obj = self else: - new_obj = self.copy() + new_obj = self.copy(deep=None) new_index = ibase.default_index(len(new_obj)) if level is not None: diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 9d44ea947e562..f6a5fef294544 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -992,7 +992,7 @@ def rename( index: Renamer | None = None, columns: Renamer | None = None, axis: Axis | None = None, - copy: bool_t = True, + copy: bool_t | None = None, inplace: bool_t = False, level: Level | None = None, errors: str = "ignore", @@ -3911,6 +3911,8 @@ def _check_setitem_copy(self, stacklevel=4, t="setting", force=False): df.iloc[0:5]['group'] = 'a' """ + if isinstance(self._mgr, (ArrayManager, SingleArrayManager)): + return # return early if the check is not needed if not (force or self._is_copy): return @@ -5833,7 +5835,7 @@ def astype( return result @final - def copy(self: FrameOrSeries, deep: bool_t = True) -> FrameOrSeries: + def copy(self: FrameOrSeries, deep: bool_t | None = True) -> FrameOrSeries: """ Make a copy of this object's indices and data. diff --git a/pandas/core/indexing.py b/pandas/core/indexing.py index 4f55459040bc0..e6ae642fcb495 100644 --- a/pandas/core/indexing.py +++ b/pandas/core/indexing.py @@ -1869,6 +1869,12 @@ def _setitem_single_column(self, loc: int, value, plane_indexer): """ pi = plane_indexer + if not getattr(self.obj._mgr, "blocks", False): + # ArrayManager + self.obj._mgr.column_setitem(loc, plane_indexer, value) + self.obj._clear_item_cache() + return + ser = self.obj._ixs(loc, axis=1) # perform the equivalent of a setitem on the info axis diff --git a/pandas/core/internals/array_manager.py b/pandas/core/internals/array_manager.py index 31e32b053367b..e4b95208ecad6 100644 --- a/pandas/core/internals/array_manager.py +++ b/pandas/core/internals/array_manager.py @@ -9,6 +9,7 @@ Callable, TypeVar, ) +import weakref import numpy as np @@ -121,11 +122,13 @@ class BaseArrayManager(DataManager): arrays: list[np.ndarray | ExtensionArray] _axes: list[Index] + refs: list | None def __init__( self, arrays: list[np.ndarray | ExtensionArray], axes: list[Index], + refs: list | None, verify_integrity: bool = True, ): raise NotImplementedError @@ -176,6 +179,15 @@ def is_consolidated(self) -> bool: def _consolidate_inplace(self) -> None: pass + def _has_no_reference(self, i: int): + return (self.refs is None or self.refs[i] is None) and weakref.getweakrefcount( + self.arrays[i] + ) == 0 + + def _clear_reference(self, i: int): + if self.refs is not None: + self.refs[i] = None + def get_dtypes(self): return np.array([arr.dtype for arr in self.arrays], dtype="object") @@ -516,6 +528,12 @@ def copy(self: T, deep=True) -> T: ------- BlockManager """ + if deep is not True: + refs = [weakref.ref(arr) for arr in self.arrays] + return type(self)( + list(self.arrays), list(self._axes), refs, verify_integrity=False + ) + # this preserves the notion of view copying of axes if deep: # hit in e.g. tests.io.json.test_pandas @@ -530,7 +548,11 @@ def copy_func(ax): if deep: new_arrays = [arr.copy() for arr in self.arrays] else: - new_arrays = self.arrays + new_arrays = list(self.arrays) + if deep: + refs = None + else: + refs = list(self.refs) if isinstance(self.refs, list) else self.refs return type(self)(new_arrays, new_axes) def reindex_indexer( @@ -599,14 +621,18 @@ def _reindex_indexer( if axis == 1: new_arrays = [] + refs = [] for i in indexer: if i == -1: arr = self._make_na_array( fill_value=fill_value, use_na_proxy=use_na_proxy ) + ref = None else: arr = self.arrays[i] + ref = weakref.ref(arr) new_arrays.append(arr) + refs.append(ref) else: validate_indices(indexer, len(self._axes[0])) @@ -625,11 +651,12 @@ def _reindex_indexer( ) for arr in self.arrays ] + refs = None new_axes = list(self._axes) new_axes[axis] = new_axis - return type(self)(new_arrays, new_axes, verify_integrity=False) + return type(self)(new_arrays, new_axes, refs, verify_integrity=False) def take(self: T, indexer, axis: int = 1, verify: bool = True) -> T: """ @@ -693,12 +720,14 @@ def __init__( self, arrays: list[np.ndarray | ExtensionArray], axes: list[Index], + refs: list | None = None, verify_integrity: bool = True, ): # Note: we are storing the axes in "_axes" in the (row, columns) order # which contrasts the order how it is stored in BlockManager self._axes = axes self.arrays = arrays + self.refs = refs if verify_integrity: self._axes = [ensure_index(ax) for ax in axes] @@ -768,15 +797,19 @@ def get_slice(self, slobj: slice, axis: int = 0) -> ArrayManager: new_axes = list(self._axes) new_axes[axis] = new_axes[axis]._getitem_slice(slobj) + # TODO possible optimization is to have `refs` to be a weakref to the + # full ArrayManager to indicate it's referencing + refs = [weakref.ref(arr) for arr in self.arrays] - return type(self)(arrays, new_axes, verify_integrity=False) + return type(self)(arrays, new_axes, refs, verify_integrity=False) def iget(self, i: int) -> SingleArrayManager: """ Return the data as a SingleArrayManager. """ values = self.arrays[i] - return SingleArrayManager([values], [self._axes[0]]) + ref = weakref.ref(values) + return SingleArrayManager([values], [self._axes[0]], [ref]) def iget_values(self, i: int) -> ArrayLike: """ @@ -882,6 +915,8 @@ def insert(self, loc: int, item: Hashable, value: ArrayLike) -> None: # TODO is this copy needed? arrays = self.arrays.copy() arrays.insert(loc, value) + if self.refs is not None: + self.refs.insert(loc, None) self.arrays = arrays self._axes[1] = new_axis @@ -897,6 +932,15 @@ def idelete(self, indexer): self._axes = [self._axes[0], self._axes[1][to_keep]] return self + def column_setitem(self, loc: int, idx: int | slice | np.ndarray, value): + if self._has_no_reference(loc): + self.arrays[loc][idx] = value + else: + arr = self.arrays[loc].copy() + arr[idx] = value + self.arrays[loc] = arr + self._clear_reference(loc) + # -------------------------------------------------------------------- # Array-wise Operation @@ -1179,10 +1223,12 @@ def __init__( self, arrays: list[np.ndarray | ExtensionArray], axes: list[Index], + refs: list | None = None, verify_integrity: bool = True, ): self._axes = axes self.arrays = arrays + self.refs = refs if verify_integrity: assert len(axes) == 1 @@ -1286,8 +1332,14 @@ def apply(self, func, **kwargs): new_array = getattr(self.array, func)(**kwargs) return type(self)([new_array], self._axes) - def setitem(self, indexer, value): - return self.apply_with_block("setitem", indexer=indexer, value=value) + def setitem(self, indexer, value, inplace=False): + if not self._has_no_reference(0): + self.arrays[0] = self.arrays[0].copy() + self._clear_reference(0) + if inplace: + self.arrays[0][indexer] = value + else: + return self.apply_with_block("setitem", indexer=indexer, value=value) def idelete(self, indexer) -> SingleArrayManager: """ diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py index 323aa45874d96..a7a980b280e4c 100644 --- a/pandas/core/internals/managers.py +++ b/pandas/core/internals/managers.py @@ -351,8 +351,12 @@ def where(self: T, other, cond, align: bool, errors: str) -> T: errors=errors, ) - def setitem(self: T, indexer, value) -> T: - return self.apply("setitem", indexer=indexer, value=value) + def setitem(self: T, indexer, value, inplace=False) -> T: + if inplace: + assert self.ndim == 1 + self._block.values[indexer] = value + else: + return self.apply("setitem", indexer=indexer, value=value) def putmask(self, mask, new, align: bool = True): @@ -562,6 +566,8 @@ def copy(self: T, deep=True) -> T: ------- BlockManager """ + if deep is None: + deep = True # this preserves the notion of view copying of axes if deep: # hit in e.g. tests.io.json.test_pandas diff --git a/pandas/core/series.py b/pandas/core/series.py index 2f45a2adbdec7..e78f54c361fed 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -1064,10 +1064,9 @@ def __setitem__(self, key, value) -> None: try: self._set_with_engine(key, value) except (KeyError, ValueError): - values = self._values if is_integer(key) and self.index.inferred_type != "integer": # positional setter - values[key] = value + self._mgr.setitem(key, value, inplace=True) else: # GH#12862 adding a new key to the Series self.loc[key] = value @@ -1099,7 +1098,7 @@ def _set_with_engine(self, key, value) -> None: # error: Argument 1 to "validate_numeric_casting" has incompatible type # "Union[dtype, ExtensionDtype]"; expected "dtype" validate_numeric_casting(self.dtype, value) # type: ignore[arg-type] - self._values[loc] = value + self._mgr.setitem(loc, value, inplace=True) def _set_with(self, key, value): # other: fancy integer or otherwise diff --git a/pandas/tests/indexing/test_copy_on_write.py b/pandas/tests/indexing/test_copy_on_write.py new file mode 100644 index 0000000000000..034d53c9e31c6 --- /dev/null +++ b/pandas/tests/indexing/test_copy_on_write.py @@ -0,0 +1,404 @@ +import numpy as np +import pytest + +import pandas as pd +import pandas._testing as tm +import pandas.core.common as com + + +def test_copy(using_array_manager): + df = pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6], "c": [0.1, 0.2, 0.3]}) + df_copy = df.copy() + + # the deep copy doesn't share memory + assert not np.may_share_memory(df_copy["a"].values, df["a"].values) + if using_array_manager: + assert df_copy._mgr.refs is None + + # mutating copy doesn't mutate original + df_copy.iloc[0, 0] = 0 + assert df.iloc[0, 0] == 1 + + # copy of df + copy of subset + # normal copy -> refs are removed, no mutation of parent + # deep=None -> refs are still generated / kept + # copy=False -> refs are kept? But if we then edit it + + # df = ... + # subset = df[1:3] + # subset_shallow_copy = subset.copy(deep=False) + # -> expected behaviour: mutating subset_shallow_copy should mutate subset + # but not mutate parent df + # - if we keep refs -> we copy on setitem -> subset is not mutated + # - if we remove refs -> we don't copy on setitem, but then also parent df + # is mutated + # -> disallow taking a shallow copy of a DataFrame that referencing other arrays? + + +def test_copy_shallow(using_array_manager): + df = pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6], "c": [0.1, 0.2, 0.3]}) + df_copy = df.copy(deep=False) + + # the shallow copy still shares memory + assert np.may_share_memory(df_copy["a"].values, df["a"].values) + if using_array_manager: + assert df_copy._mgr.refs is not None + + if using_array_manager: + # mutating shallow copy doesn't mutate original + df_copy.iloc[0, 0] = 0 + assert df.iloc[0, 0] == 1 + # mutating triggered a copy-on-write -> no longer shares memory + assert not np.may_share_memory(df_copy["a"].values, df["a"].values) + # but still shares memory for the other columns + assert np.may_share_memory(df_copy["b"].values, df["b"].values) + else: + # mutating shallow copy does mutate original + df_copy.iloc[0, 0] = 0 + assert df.iloc[0, 0] == 0 + # and still shares memory + assert np.may_share_memory(df_copy["a"].values, df["a"].values) + + +# ----------------------------------------------------------------------------- +# DataFrame methods returning new DataFrame using shallow copy + + +def test_reset_index(using_array_manager): + # Case: resetting the index (i.e. adding a new column) + mutating the + # resulting dataframe + df = pd.DataFrame( + {"a": [1, 2, 3], "b": [4, 5, 6], "c": [0.1, 0.2, 0.3]}, index=[10, 11, 12] + ) + df_orig = df.copy() + df2 = df.reset_index() + + if using_array_manager: + # still shares memory (df2 is a shallow copy) + assert np.may_share_memory(df2["b"].values, df["b"].values) + assert np.may_share_memory(df2["c"].values, df["c"].values) + # mutating df2 triggers a copy-on-write for that column + df2.iloc[0, 2] = 0 + assert not np.may_share_memory(df2["b"].values, df["b"].values) + if using_array_manager: + assert np.may_share_memory(df2["c"].values, df["c"].values) + tm.assert_frame_equal(df, df_orig) + + +def test_rename_columns(using_array_manager): + # Case: renaming columns returns a new dataframe + # + afterwards modifying the result + df = pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6], "c": [0.1, 0.2, 0.3]}) + df_orig = df.copy() + df2 = df.rename(columns=str.upper) + + if using_array_manager: + assert np.may_share_memory(df2["A"].values, df["a"].values) + df2.iloc[0, 0] = 0 + assert not np.may_share_memory(df2["A"].values, df["a"].values) + if using_array_manager: + assert np.may_share_memory(df2["C"].values, df["c"].values) + expected = pd.DataFrame({"A": [0, 2, 3], "B": [4, 5, 6], "C": [0.1, 0.2, 0.3]}) + tm.assert_frame_equal(df2, expected) + tm.assert_frame_equal(df, df_orig) + + +def test_rename_columns_modify_parent(using_array_manager): + # Case: renaming columns returns a new dataframe + # + afterwards modifying the original (parent) dataframe + df = pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6], "c": [0.1, 0.2, 0.3]}) + df2 = df.rename(columns=str.upper) + df2_orig = df2.copy() + + if using_array_manager: + assert np.may_share_memory(df2["A"].values, df["a"].values) + df.iloc[0, 0] = 0 + assert not np.may_share_memory(df2["A"].values, df["a"].values) + if using_array_manager: + assert np.may_share_memory(df2["C"].values, df["c"].values) + expected = pd.DataFrame({"a": [0, 2, 3], "b": [4, 5, 6], "c": [0.1, 0.2, 0.3]}) + tm.assert_frame_equal(df, expected) + tm.assert_frame_equal(df2, df2_orig) + + +# ----------------------------------------------------------------------------- +# Indexing operations taking subset + modifying the subset/parent + + +def test_subset_column_selection(using_array_manager): + # Case: taking a subset of the columns of a DataFrame + # + afterwards modifying the subset + df = pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6], "c": [0.1, 0.2, 0.3]}) + df_orig = df.copy() + + subset = df[["a", "c"]] + if using_array_manager: + # the subset shares memory ... + assert np.may_share_memory(subset["a"].values, df["a"].values) + # ... but uses CoW when being modified + subset.iloc[0, 0] = 0 + else: + with pd.option_context("chained_assignment", "warn"): + with tm.assert_produces_warning(com.SettingWithCopyWarning): + subset.iloc[0, 0] = 0 + + assert not np.may_share_memory(subset["a"].values, df["a"].values) + + expected = pd.DataFrame({"a": [0, 2, 3], "c": [0.1, 0.2, 0.3]}) + tm.assert_frame_equal(subset, expected) + tm.assert_frame_equal(df, df_orig) + + +def test_subset_column_selection_modify_parent(using_array_manager): + # Case: taking a subset of the columns of a DataFrame + # + afterwards modifying the parent + df = pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6], "c": [0.1, 0.2, 0.3]}) + + subset = df[["a", "c"]] + if using_array_manager: + # the subset shares memory ... + assert np.may_share_memory(subset["a"].values, df["a"].values) + # ... but parent uses CoW parent when it is modified + df.iloc[0, 0] = 0 + + assert not np.may_share_memory(subset["a"].values, df["a"].values) + if using_array_manager: + assert np.may_share_memory(subset["c"].values, df["c"].values) + + expected = pd.DataFrame({"a": [1, 2, 3], "c": [0.1, 0.2, 0.3]}) + tm.assert_frame_equal(subset, expected) + + +def test_subset_row_slice(using_array_manager): + # Case: taking a subset of the rows of a DataFrame using a slice + # + afterwards modifying the subset + df = pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6], "c": [0.1, 0.2, 0.3]}) + df_orig = df.copy() + + subset = df[1:3] + assert np.may_share_memory(subset["a"].values, df["a"].values) + + if using_array_manager: + subset.iloc[0, 0] = 0 + assert not np.may_share_memory(subset["a"].values, df["a"].values) + + else: + with pd.option_context("chained_assignment", "warn"): + with tm.assert_produces_warning(com.SettingWithCopyWarning): + subset.iloc[0, 0] = 0 + + expected = pd.DataFrame( + {"a": [0, 3], "b": [5, 6], "c": [0.2, 0.3]}, index=range(1, 3) + ) + tm.assert_frame_equal(subset, expected) + if using_array_manager: + # original parent dataframe is not modified (CoW) + tm.assert_frame_equal(df, df_orig) + else: + # original parent dataframe is actually updated + df_orig.iloc[1, 0] = 0 + tm.assert_frame_equal(df, df_orig) + + +@pytest.mark.parametrize( + "indexer", + [slice(0, 2), np.array([True, True, False]), np.array([0, 1])], + ids=["slice", "mask", "array"], +) +def test_subset_set_with_row_indexer(indexer_si, indexer, using_array_manager): + # Case: setting values with a row indexer on a viewing subset + # subset[indexer] = value and subset.iloc[indexer] = value + df = pd.DataFrame({"a": [1, 2, 3, 4], "b": [4, 5, 6, 7], "c": [0.1, 0.2, 0.3, 0.4]}) + df_orig = df.copy() + subset = df[1:4] + + if ( + indexer_si is tm.setitem + and isinstance(indexer, np.ndarray) + and indexer.dtype == "int" + ): + pytest.skip("setitem with labels selects on columns") + + if using_array_manager: + indexer_si(subset)[indexer] = 0 + else: + with pd.option_context("chained_assignment", "warn"): + with tm.assert_produces_warning(com.SettingWithCopyWarning): + indexer_si(subset)[indexer] = 0 + + expected = pd.DataFrame( + {"a": [0, 0, 4], "b": [0, 0, 7], "c": [0.0, 0.0, 0.4]}, index=range(1, 4) + ) + tm.assert_frame_equal(subset, expected) + if using_array_manager: + # original parent dataframe is not modified (CoW) + tm.assert_frame_equal(df, df_orig) + else: + # original parent dataframe is actually updated + df_orig[1:3] = 0 + tm.assert_frame_equal(df, df_orig) + + +def test_subset_set_column(using_array_manager): + # Case: setting a single column on a viewing subset -> subset[col] = value + df = pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6], "c": [0.1, 0.2, 0.3]}) + df_orig = df.copy() + subset = df[1:3] + + if using_array_manager: + subset["a"] = np.array([10, 11]) + else: + with pd.option_context("chained_assignment", "warn"): + with tm.assert_produces_warning(com.SettingWithCopyWarning): + subset["a"] = np.array([10, 11]) + + expected = pd.DataFrame( + {"a": [10, 11], "b": [5, 6], "c": [0.2, 0.3]}, index=range(1, 3) + ) + tm.assert_frame_equal(subset, expected) + if using_array_manager: + tm.assert_frame_equal(df, df_orig) + else: + df_orig.loc[1:2, "a"] = np.array([10, 11]) + tm.assert_frame_equal(df, df_orig) + + +def test_subset_set_columns_single_block(using_array_manager): + # Case: setting multiple columns on a viewing subset + # -> subset[[col1, col2]] = value + df = pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6], "c": [7, 8, 9]}) + df_orig = df.copy() + subset = df[1:3] + + if using_array_manager: + subset[["a", "c"]] = 0 + else: + with pd.option_context("chained_assignment", "warn"): + with tm.assert_produces_warning(com.SettingWithCopyWarning): + subset[["a", "c"]] = 0 + + expected = pd.DataFrame({"a": [0, 0], "b": [5, 6], "c": [0, 0]}, index=range(1, 3)) + tm.assert_frame_equal(subset, expected) + if using_array_manager: + tm.assert_frame_equal(df, df_orig) + else: + df_orig.loc[1:2, ["a", "c"]] = 0 + tm.assert_frame_equal(df, df_orig) + + +def test_subset_set_columns_mixed_block(using_array_manager): + # Case: setting multiple columns on a viewing subset + # -> subset[[col1, col2]] = value + df = pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6], "c": [0.1, 0.2, 0.3]}) + df_orig = df.copy() + subset = df[1:3] + + if using_array_manager: + subset[["a", "c"]] = 0 + else: + with pd.option_context("chained_assignment", "warn"): + with tm.assert_produces_warning(com.SettingWithCopyWarning): + subset[["a", "c"]] = 0 + + expected = pd.DataFrame({"a": [0, 0], "b": [5, 6], "c": [0, 0]}, index=range(1, 3)) + tm.assert_frame_equal(subset, expected) + if using_array_manager: + tm.assert_frame_equal(df, df_orig) + else: + # In the mixed case with BlockManager, only one of the two columns is + # mutated in the parent frame .. + df_orig.loc[1:2, ["a"]] = 0 + tm.assert_frame_equal(df, df_orig) + + +@pytest.mark.parametrize( + "indexer", + [slice("a", "b"), np.array([True, True, False]), ["a", "b"]], + ids=["slice", "mask", "array"], +) +def test_subset_set_with_column_indexer(indexer, using_array_manager): + # Case: setting multiple columns with a column indexer on a viewing subset + # -> subset.loc[:, [col1, col2]] = value + df = pd.DataFrame({"a": [1, 2, 3], "b": [0.1, 0.2, 0.3], "c": [4, 5, 6]}) + df_orig = df.copy() + subset = df[1:3] + + if using_array_manager: + subset.loc[:, indexer] = 0 + else: + with pd.option_context("chained_assignment", "warn"): + with tm.assert_produces_warning(com.SettingWithCopyWarning): + subset.loc[:, indexer] = 0 + + expected = pd.DataFrame( + {"a": [0, 0], "b": [0.0, 0.0], "c": [5, 6]}, index=range(1, 3) + ) + if not using_array_manager: + expected["b"] = expected["b"].astype("int64") + tm.assert_frame_equal(subset, expected) + if using_array_manager: + tm.assert_frame_equal(df, df_orig) + else: + # In the mixed case with BlockManager, only one of the two columns is + # mutated in the parent frame .. + df_orig.loc[1:2, ["a"]] = 0 + tm.assert_frame_equal(df, df_orig) + + +# TODO add more tests modifying the parent + + +# ----------------------------------------------------------------------------- +# Accessing column as Series + + +def test_column_as_series(using_array_manager): + # Case: selecting a single column now also uses Copy-on-Write + df = pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6], "c": [0.1, 0.2, 0.3]}) + df_orig = df.copy() + + s = df["a"] + + assert np.may_share_memory(s.values, df["a"].values) + + if using_array_manager: + s[0] = 0 + else: + with pd.option_context("chained_assignment", "warn"): + with tm.assert_produces_warning(com.SettingWithCopyWarning): + s[0] = 0 + + if using_array_manager: + # assert not np.may_share_memory(s.values, df["a"].values) + tm.assert_frame_equal(df, df_orig) + else: + df_orig.iloc[0, 0] = 0 + tm.assert_frame_equal(df, df_orig) + + +# TODO add tests for other indexing methods on the Series + + +def test_dataframe_add_column_from_series(): + # Case: adding a new column to a DataFrame from an existing column/series + # -> always already takes a copy on assignment + # (no change in behaviour here) + # TODO can we achieve the same behaviour with Copy-on-Write? + df = pd.DataFrame({"a": [1, 2, 3], "b": [0.1, 0.2, 0.3]}) + + s = pd.Series([10, 11, 12]) + df["new"] = s + assert not np.may_share_memory(df["new"].values, s.values) + + # editing series -> doesn't modify column in frame + s[0] = 0 + expected = pd.DataFrame({"a": [1, 2, 3], "b": [0.1, 0.2, 0.3], "new": [10, 11, 12]}) + tm.assert_frame_equal(df, expected) + + # editing column in frame -> doesn't modify series + df.loc[2, "new"] = 100 + expected = pd.Series([0, 11, 12]) + tm.assert_series_equal(s, expected) + + +# TODO add tests for constructors From 41ee2b77013d031813de45ccb0fbaaee679ff01b Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Tue, 8 Jun 2021 20:37:41 +0200 Subject: [PATCH 02/21] clean-up ArrayManager.copy implementation --- pandas/core/internals/array_manager.py | 17 +++++++---------- 1 file changed, 7 insertions(+), 10 deletions(-) diff --git a/pandas/core/internals/array_manager.py b/pandas/core/internals/array_manager.py index e4b95208ecad6..6c081ad0fb092 100644 --- a/pandas/core/internals/array_manager.py +++ b/pandas/core/internals/array_manager.py @@ -528,11 +528,9 @@ def copy(self: T, deep=True) -> T: ------- BlockManager """ - if deep is not True: - refs = [weakref.ref(arr) for arr in self.arrays] - return type(self)( - list(self.arrays), list(self._axes), refs, verify_integrity=False - ) + if deep is None: + # use shallow copy + deep = False # this preserves the notion of view copying of axes if deep: @@ -547,13 +545,12 @@ def copy_func(ax): if deep: new_arrays = [arr.copy() for arr in self.arrays] - else: - new_arrays = list(self.arrays) - if deep: refs = None else: - refs = list(self.refs) if isinstance(self.refs, list) else self.refs - return type(self)(new_arrays, new_axes) + new_arrays = list(self.arrays) + refs = [weakref.ref(arr) for arr in self.arrays] + + return type(self)(new_arrays, new_axes, refs, verify_integrity=False) def reindex_indexer( self: T, From 7a8dffc9a30b0dae3096969bd3102a118e9ac5dd Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Tue, 8 Jun 2021 21:34:18 +0200 Subject: [PATCH 03/21] add some comments / docstrings --- pandas/core/internals/array_manager.py | 35 +++++++++++++++++++++----- pandas/core/internals/managers.py | 1 + 2 files changed, 30 insertions(+), 6 deletions(-) diff --git a/pandas/core/internals/array_manager.py b/pandas/core/internals/array_manager.py index 6c081ad0fb092..08adc3fb48f30 100644 --- a/pandas/core/internals/array_manager.py +++ b/pandas/core/internals/array_manager.py @@ -111,6 +111,7 @@ class BaseArrayManager(DataManager): ---------- arrays : Sequence of arrays axes : Sequence of Index + refs : Sequence of weakrefs or None, optional verify_integrity : bool, default True """ @@ -122,13 +123,13 @@ class BaseArrayManager(DataManager): arrays: list[np.ndarray | ExtensionArray] _axes: list[Index] - refs: list | None + refs: list[weakref.ref | None] | None def __init__( self, arrays: list[np.ndarray | ExtensionArray], axes: list[Index], - refs: list | None, + refs: list[weakref.ref | None] | None, verify_integrity: bool = True, ): raise NotImplementedError @@ -180,11 +181,20 @@ def _consolidate_inplace(self) -> None: pass def _has_no_reference(self, i: int): + """ + Check for column `i` if has references. + (whether it references another array or is itself being referenced) + + Returns True if the columns has no references. + """ return (self.refs is None or self.refs[i] is None) and weakref.getweakrefcount( self.arrays[i] ) == 0 def _clear_reference(self, i: int): + """ + Clear any reference for column `i`. + """ if self.refs is not None: self.refs[i] = None @@ -626,6 +636,7 @@ def _reindex_indexer( ) ref = None else: + # reusing full column array -> track with reference arr = self.arrays[i] ref = weakref.ref(arr) new_arrays.append(arr) @@ -648,6 +659,8 @@ def _reindex_indexer( ) for arr in self.arrays ] + # selecting rows with take always creates a copy -> no need to + # track references to original arrays refs = None new_axes = list(self._axes) @@ -717,7 +730,7 @@ def __init__( self, arrays: list[np.ndarray | ExtensionArray], axes: list[Index], - refs: list | None = None, + refs: list[weakref.ref | None] | None = None, verify_integrity: bool = True, ): # Note: we are storing the axes in "_axes" in the (row, columns) order @@ -794,8 +807,8 @@ def get_slice(self, slobj: slice, axis: int = 0) -> ArrayManager: new_axes = list(self._axes) new_axes[axis] = new_axes[axis]._getitem_slice(slobj) - # TODO possible optimization is to have `refs` to be a weakref to the - # full ArrayManager to indicate it's referencing + # slicing results in views -> track references to original arrays + # TODO possible to optimizate this with single ref to the full ArrayManager? refs = [weakref.ref(arr) for arr in self.arrays] return type(self)(arrays, new_axes, refs, verify_integrity=False) @@ -805,6 +818,7 @@ def iget(self, i: int) -> SingleArrayManager: Return the data as a SingleArrayManager. """ values = self.arrays[i] + # getting single column array for Series -> track reference to original ref = weakref.ref(values) return SingleArrayManager([values], [self._axes[0]], [ref]) @@ -834,6 +848,7 @@ def iset(self, loc: int | slice | np.ndarray, value: ArrayLike): Positional location (already bounds checked) value : np.ndarray or ExtensionArray """ + # TODO clear reference for item that is being overwritten # single column -> single integer index if lib.is_integer(loc): @@ -913,6 +928,8 @@ def insert(self, loc: int, item: Hashable, value: ArrayLike) -> None: arrays = self.arrays.copy() arrays.insert(loc, value) if self.refs is not None: + # inserted `value` is already a copy, no need to track reference + # TODO can we use CoW here as well? self.refs.insert(loc, None) self.arrays = arrays @@ -922,6 +939,7 @@ def idelete(self, indexer): """ Delete selected locations in-place (new block and array, same BlockManager) """ + # TODO update refs to_keep = np.ones(self.shape[0], dtype=np.bool_) to_keep[indexer] = False @@ -931,8 +949,10 @@ def idelete(self, indexer): def column_setitem(self, loc: int, idx: int | slice | np.ndarray, value): if self._has_no_reference(loc): + # if no reference -> set array inplace self.arrays[loc][idx] = value else: + # otherwise perform Copy-on-Write and clear the reference arr = self.arrays[loc].copy() arr[idx] = value self.arrays[loc] = arr @@ -1220,7 +1240,7 @@ def __init__( self, arrays: list[np.ndarray | ExtensionArray], axes: list[Index], - refs: list | None = None, + refs: list[weakref.ref | None] | None = None, verify_integrity: bool = True, ): self._axes = axes @@ -1310,6 +1330,7 @@ def fast_xs(self, loc: int) -> ArrayLike: raise NotImplementedError("Use series._values[loc] instead") def get_slice(self, slobj: slice, axis: int = 0) -> SingleArrayManager: + # TODO track reference if axis >= self.ndim: raise IndexError("Requested axis not found in manager") @@ -1331,6 +1352,7 @@ def apply(self, func, **kwargs): def setitem(self, indexer, value, inplace=False): if not self._has_no_reference(0): + # if being referenced -> perform Copy-on-Write and clear the reference self.arrays[0] = self.arrays[0].copy() self._clear_reference(0) if inplace: @@ -1342,6 +1364,7 @@ def idelete(self, indexer) -> SingleArrayManager: """ Delete selected locations in-place (new array, same ArrayManager) """ + # TODO clear reference to_keep = np.ones(self.shape[0], dtype=np.bool_) to_keep[indexer] = False diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py index a7a980b280e4c..f3ebf8f050821 100644 --- a/pandas/core/internals/managers.py +++ b/pandas/core/internals/managers.py @@ -567,6 +567,7 @@ def copy(self: T, deep=True) -> T: BlockManager """ if deep is None: + # preserve deep copy for BlockManager with copy=None deep = True # this preserves the notion of view copying of axes if deep: From 96b6d71855b7b566cb7effd40ed176f3035fbc53 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Fri, 25 Jun 2021 09:53:59 +0200 Subject: [PATCH 04/21] fix series slice + del operator --- pandas/core/indexing.py | 2 +- pandas/core/internals/array_manager.py | 15 ++++- pandas/tests/indexing/test_copy_on_write.py | 75 +++++++++++++++++++++ 3 files changed, 88 insertions(+), 4 deletions(-) diff --git a/pandas/core/indexing.py b/pandas/core/indexing.py index ddaad65bdaff8..01cb064b7aa47 100644 --- a/pandas/core/indexing.py +++ b/pandas/core/indexing.py @@ -1894,7 +1894,7 @@ def _setitem_single_column(self, loc: int, value, plane_indexer): """ pi = plane_indexer - if not getattr(self.obj._mgr, "blocks", False): + if not hasattr(self.obj._mgr, "blocks"): # ArrayManager self.obj._mgr.column_setitem(loc, plane_indexer, value) self.obj._clear_item_cache() diff --git a/pandas/core/internals/array_manager.py b/pandas/core/internals/array_manager.py index 1fc2ca5251127..0e58996aebd42 100644 --- a/pandas/core/internals/array_manager.py +++ b/pandas/core/internals/array_manager.py @@ -767,6 +767,12 @@ def _verify_integrity(self) -> None: "Passed arrays should be 1-dimensional, got array with " f"{arr.ndim} dimensions instead." ) + if self.refs is not None: + if not len(self.refs) == n_columns: + raise ValueError( + "Number of passed refs must equal the size of the column Index: " + f"{len(self.refs)} refs vs {n_columns} columns." + ) # -------------------------------------------------------------------- # Indexing @@ -937,12 +943,12 @@ def idelete(self, indexer): """ Delete selected locations in-place (new block and array, same BlockManager) """ - # TODO update refs to_keep = np.ones(self.shape[0], dtype=np.bool_) to_keep[indexer] = False self.arrays = [self.arrays[i] for i in np.nonzero(to_keep)[0]] self._axes = [self._axes[0], self._axes[1][to_keep]] + self.refs = [ref for i, ref in enumerate(self.refs) if to_keep[i]] return self def column_setitem(self, loc: int, idx: int | slice | np.ndarray, value): @@ -1339,7 +1345,9 @@ def get_slice(self, slobj: slice, axis: int = 0) -> SingleArrayManager: def getitem_mgr(self, indexer) -> SingleArrayManager: new_array = self.array[indexer] new_index = self.index[indexer] - return type(self)([new_array], [new_index]) + # TODO in theory only need to track reference if new_array is a view + ref = weakref.ref(self.array) + return type(self)([new_array], [new_index], [ref]) def apply(self, func, **kwargs): if callable(func): @@ -1362,12 +1370,13 @@ def idelete(self, indexer) -> SingleArrayManager: """ Delete selected locations in-place (new array, same ArrayManager) """ - # TODO clear reference to_keep = np.ones(self.shape[0], dtype=np.bool_) to_keep[indexer] = False self.arrays = [self.arrays[0][to_keep]] self._axes = [self._axes[0][to_keep]] + # clear reference since we are backed by new array + self.refs = None return self def _get_data_subset(self, predicate: Callable) -> SingleArrayManager: diff --git a/pandas/tests/indexing/test_copy_on_write.py b/pandas/tests/indexing/test_copy_on_write.py index 034d53c9e31c6..7994905f9125b 100644 --- a/pandas/tests/indexing/test_copy_on_write.py +++ b/pandas/tests/indexing/test_copy_on_write.py @@ -347,6 +347,81 @@ def test_subset_set_with_column_indexer(indexer, using_array_manager): # TODO add more tests modifying the parent +# ----------------------------------------------------------------------------- +# Series -- Indexing operations taking subset + modifying the subset/parent + + +def test_series_getitem_slice(using_array_manager): + # Case: taking a slice of a Series + afterwards modifying the subset + s = pd.Series([1, 2, 3], index=["a", "b", "c"]) + s_orig = s.copy() + + subset = s[:] + assert np.may_share_memory(subset.values, s.values) + + subset.iloc[0] = 0 + + if using_array_manager: + assert not np.may_share_memory(subset.values, s.values) + + expected = pd.Series([0, 2, 3], index=["a", "b", "c"]) + tm.assert_series_equal(subset, expected) + + if using_array_manager: + # original parent series is not modified (CoW) + tm.assert_series_equal(s, s_orig) + else: + # original parent series is actually updated + assert s.iloc[0] == 0 + + +# ----------------------------------------------------------------------------- +# del operator + + +def test_del_frame(using_array_manager): + # Case: deleting a column with `del` on a viewing child dataframe should + # not modify parent + update the references + df = pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6], "c": [0.1, 0.2, 0.3]}) + df_orig = df.copy() + df2 = df[:] + + assert np.may_share_memory(df["a"].values, df2["a"].values) + + del df2["b"] + + assert np.may_share_memory(df["a"].values, df2["a"].values) + tm.assert_frame_equal(df, df_orig) + tm.assert_frame_equal(df2, df_orig[["a", "c"]]) + df2._mgr._verify_integrity() + + # TODO in theory modifying column "b" of the parent wouldn't need a CoW + # but the weakref is still alive and so we still perform CoW + + if using_array_manager: + # modifying child after deleting a column still doesn't update parent + df2.loc[0, "a"] = 100 + tm.assert_frame_equal(df, df_orig) + + +def test_del_series(using_array_manager): + s = pd.Series([1, 2, 3], index=["a", "b", "c"]) + s_orig = s.copy() + s2 = s[:] + + assert np.may_share_memory(s.values, s2.values) + + del s2["a"] + + assert not np.may_share_memory(s.values, s2.values) + tm.assert_series_equal(s, s_orig) + tm.assert_series_equal(s2, s_orig[["b", "c"]]) + + # modifying s2 doesn't need copy on write (due to `del`, s2 is backed by new array) + values = s2.values + s2.loc["b"] = 100 + assert values[0] == 100 + # ----------------------------------------------------------------------------- # Accessing column as Series From 17cedb97e92a290e30fb4b80802c91f7c18e53b5 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Fri, 25 Jun 2021 13:19:19 +0200 Subject: [PATCH 05/21] fix pickle and column_setitem with dtype changes --- pandas/core/internals/array_manager.py | 19 ++++++++++++++----- 1 file changed, 14 insertions(+), 5 deletions(-) diff --git a/pandas/core/internals/array_manager.py b/pandas/core/internals/array_manager.py index 0e58996aebd42..f169b32a0f086 100644 --- a/pandas/core/internals/array_manager.py +++ b/pandas/core/internals/array_manager.py @@ -203,6 +203,10 @@ def get_dtypes(self): # TODO setstate getstate + def __reduce__(self): + # to avoid pickling the refs + return type(self), (self.arrays, self._axes) + def __repr__(self) -> str: output = type(self).__name__ output += f"\nIndex: {self._axes[0]}" @@ -948,20 +952,25 @@ def idelete(self, indexer): self.arrays = [self.arrays[i] for i in np.nonzero(to_keep)[0]] self._axes = [self._axes[0], self._axes[1][to_keep]] - self.refs = [ref for i, ref in enumerate(self.refs) if to_keep[i]] + if self.refs is not None: + self.refs = [ref for i, ref in enumerate(self.refs) if to_keep[i]] return self def column_setitem(self, loc: int, idx: int | slice | np.ndarray, value): if self._has_no_reference(loc): - # if no reference -> set array inplace - self.arrays[loc][idx] = value + # if no reference -> set array (potentially) inplace + arr = self.arrays[loc] else: # otherwise perform Copy-on-Write and clear the reference arr = self.arrays[loc].copy() - arr[idx] = value - self.arrays[loc] = arr self._clear_reference(loc) + # create temporary SingleArrayManager without ref to use setitem implementation + mgr = SingleArrayManager([arr], [self._axes[0]]) + new_mgr = mgr.setitem(idx, value) + # update existing ArrayManager in-place + self.arrays[loc] = new_mgr.arrays[0] + # -------------------------------------------------------------------- # Array-wise Operation From f4614c2648a26b8b80982418e0d2bf1e9c1e4047 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Fri, 25 Jun 2021 19:56:35 +0200 Subject: [PATCH 06/21] fix setitem on single column for full slice + update frame tests --- pandas/core/indexing.py | 6 +++- pandas/core/internals/array_manager.py | 5 ++- pandas/tests/frame/indexing/test_indexing.py | 20 +++++++---- pandas/tests/frame/indexing/test_xs.py | 38 +++++++++++++------- pandas/tests/frame/test_constructors.py | 11 ++++-- 5 files changed, 56 insertions(+), 24 deletions(-) diff --git a/pandas/core/indexing.py b/pandas/core/indexing.py index 01cb064b7aa47..49da809babf50 100644 --- a/pandas/core/indexing.py +++ b/pandas/core/indexing.py @@ -1896,7 +1896,11 @@ def _setitem_single_column(self, loc: int, value, plane_indexer): if not hasattr(self.obj._mgr, "blocks"): # ArrayManager - self.obj._mgr.column_setitem(loc, plane_indexer, value) + if com.is_null_slice(pi) or com.is_full_slice(pi, len(self.obj)): + arr = self.obj._sanitize_column(value) + self.obj._mgr.iset(loc, arr) + else: + self.obj._mgr.column_setitem(loc, plane_indexer, value) self.obj._clear_item_cache() return diff --git a/pandas/core/internals/array_manager.py b/pandas/core/internals/array_manager.py index f169b32a0f086..356e4ced64c06 100644 --- a/pandas/core/internals/array_manager.py +++ b/pandas/core/internals/array_manager.py @@ -960,6 +960,9 @@ def column_setitem(self, loc: int, idx: int | slice | np.ndarray, value): if self._has_no_reference(loc): # if no reference -> set array (potentially) inplace arr = self.arrays[loc] + # TODO we should try to avoid this (indexing.py::_setitem_single_column + # does a copy for the BM path as well) + arr = arr.copy() else: # otherwise perform Copy-on-Write and clear the reference arr = self.arrays[loc].copy() @@ -967,7 +970,7 @@ def column_setitem(self, loc: int, idx: int | slice | np.ndarray, value): # create temporary SingleArrayManager without ref to use setitem implementation mgr = SingleArrayManager([arr], [self._axes[0]]) - new_mgr = mgr.setitem(idx, value) + new_mgr = mgr.setitem((idx,), value) # update existing ArrayManager in-place self.arrays[loc] = new_mgr.arrays[0] diff --git a/pandas/tests/frame/indexing/test_indexing.py b/pandas/tests/frame/indexing/test_indexing.py index e2121fa2318eb..5a48252ab7d18 100644 --- a/pandas/tests/frame/indexing/test_indexing.py +++ b/pandas/tests/frame/indexing/test_indexing.py @@ -269,7 +269,7 @@ def test_setattr_column(self): df.foobar = 5 assert (df.foobar == 5).all() - def test_setitem(self, float_frame): + def test_setitem(self, float_frame, using_array_manager): # not sure what else to do here series = float_frame["A"][::2] float_frame["col5"] = series @@ -305,8 +305,12 @@ def test_setitem(self, float_frame): smaller = float_frame[:2] msg = r"\nA value is trying to be set on a copy of a slice from a DataFrame" - with pytest.raises(com.SettingWithCopyError, match=msg): + if using_array_manager: + # With ArrayManager, adding a new column doesn't raise a warning smaller["col10"] = ["1", "2"] + else: + with pytest.raises(com.SettingWithCopyError, match=msg): + smaller["col10"] = ["1", "2"] assert smaller["col10"].dtype == np.object_ assert (smaller["col10"] == ["1", "2"]).all() @@ -1004,14 +1008,18 @@ def test_iloc_row_slice_view(self, using_array_manager): # setting it makes it raise/warn subset = df.iloc[slice(4, 8)] + exp_col = original[2].copy() + msg = r"\nA value is trying to be set on a copy of a slice from a DataFrame" - with pytest.raises(com.SettingWithCopyError, match=msg): + if using_array_manager: + # INFO(ArrayManager) doesn't modify parent subset[2] = 0.0 + else: + with pytest.raises(com.SettingWithCopyError, match=msg): + subset[2] = 0.0 - exp_col = original[2].copy() - # TODO(ArrayManager) verify it is expected that the original didn't change - if not using_array_manager: exp_col[4:8] = 0.0 + tm.assert_series_equal(df[2], exp_col) def test_iloc_col(self): diff --git a/pandas/tests/frame/indexing/test_xs.py b/pandas/tests/frame/indexing/test_xs.py index ccd989e2de411..51127a6b0d05a 100644 --- a/pandas/tests/frame/indexing/test_xs.py +++ b/pandas/tests/frame/indexing/test_xs.py @@ -119,9 +119,7 @@ def test_xs_view(self, using_array_manager): if using_array_manager: # INFO(ArrayManager) with ArrayManager getting a row as a view is # not possible - msg = r"\nA value is trying to be set on a copy of a slice from a DataFrame" - with pytest.raises(com.SettingWithCopyError, match=msg): - dm.xs(2)[:] = 20 + dm.xs(2)[:] = 20 assert not (dm.xs(2) == 20).any() else: dm.xs(2)[:] = 20 @@ -158,27 +156,41 @@ def test_xs_level_eq_2(self): result = df.xs("c", level=2) tm.assert_frame_equal(result, expected) - def test_xs_setting_with_copy_error(self, multiindex_dataframe_random_data): + def test_xs_setting_with_copy_error( + self, multiindex_dataframe_random_data, using_array_manager + ): # this is a copy in 0.14 df = multiindex_dataframe_random_data + df_orig = df.copy() result = df.xs("two", level="second") - # setting this will give a SettingWithCopyError - # as we are trying to write a view - msg = "A value is trying to be set on a copy of a slice from a DataFrame" - with pytest.raises(com.SettingWithCopyError, match=msg): + if not using_array_manager: + # setting this will give a SettingWithCopyError + # as we are trying to write a view + msg = "A value is trying to be set on a copy of a slice from a DataFrame" + with pytest.raises(com.SettingWithCopyError, match=msg): + result[:] = 10 + else: result[:] = 10 + tm.assert_frame_equal(df, df_orig) - def test_xs_setting_with_copy_error_multiple(self, four_level_index_dataframe): + def test_xs_setting_with_copy_error_multiple( + self, four_level_index_dataframe, using_array_manager + ): # this is a copy in 0.14 df = four_level_index_dataframe + df_orig = df.copy() result = df.xs(("a", 4), level=["one", "four"]) - # setting this will give a SettingWithCopyError - # as we are trying to write a view - msg = "A value is trying to be set on a copy of a slice from a DataFrame" - with pytest.raises(com.SettingWithCopyError, match=msg): + if not using_array_manager: + # setting this will give a SettingWithCopyError + # as we are trying to write a view + msg = "A value is trying to be set on a copy of a slice from a DataFrame" + with pytest.raises(com.SettingWithCopyError, match=msg): + result[:] = 10 + else: result[:] = 10 + tm.assert_frame_equal(df, df_orig) @pytest.mark.parametrize("key, level", [("one", "second"), (["one"], ["second"])]) def test_xs_with_duplicates(self, key, level, multiindex_dataframe_random_data): diff --git a/pandas/tests/frame/test_constructors.py b/pandas/tests/frame/test_constructors.py index 1d286e379da86..98139459fd4ac 100644 --- a/pandas/tests/frame/test_constructors.py +++ b/pandas/tests/frame/test_constructors.py @@ -240,11 +240,16 @@ def test_constructor_dtype_copy(self): new_df["col1"] = 200.0 assert orig_df["col1"][0] == 1.0 - def test_constructor_dtype_nocast_view_dataframe(self): + def test_constructor_dtype_nocast_view_dataframe(self, using_array_manager): df = DataFrame([[1, 2]]) should_be_view = DataFrame(df, dtype=df[0].dtype) - should_be_view[0][0] = 99 - assert df.values[0, 0] == 99 + if using_array_manager: + # INFO(ArrayManager) doesn't mutate original + should_be_view.iloc[0, 0] = 99 + assert df.values[0, 0] == 1 + else: + should_be_view[0][0] = 99 + assert df.values[0, 0] == 99 @td.skip_array_manager_invalid_test # TODO(ArrayManager) keep view on 2D array? def test_constructor_dtype_nocast_view_2d_array(self): From 7f183de69600357278149fe83335fabf927deced Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Sat, 26 Jun 2021 14:29:15 +0200 Subject: [PATCH 07/21] fix ref tracking in slicing columns --- pandas/core/internals/array_manager.py | 8 +++--- pandas/tests/indexing/test_copy_on_write.py | 30 +++++++++++++++++++-- 2 files changed, 33 insertions(+), 5 deletions(-) diff --git a/pandas/core/internals/array_manager.py b/pandas/core/internals/array_manager.py index 356e4ced64c06..59eeb04c7f644 100644 --- a/pandas/core/internals/array_manager.py +++ b/pandas/core/internals/array_manager.py @@ -812,14 +812,16 @@ def get_slice(self, slobj: slice, axis: int = 0) -> ArrayManager: if axis == 0: arrays = [arr[slobj] for arr in self.arrays] + # slicing results in views -> track references to original arrays + # TODO possible to optimizate this with single ref to the full ArrayManager? + refs = [weakref.ref(arr) for arr in self.arrays] elif axis == 1: arrays = self.arrays[slobj] + # track reference to subset of column arrays + refs = [weakref.ref(arr) for arr in arrays] new_axes = list(self._axes) new_axes[axis] = new_axes[axis]._getitem_slice(slobj) - # slicing results in views -> track references to original arrays - # TODO possible to optimizate this with single ref to the full ArrayManager? - refs = [weakref.ref(arr) for arr in self.arrays] return type(self)(arrays, new_axes, refs, verify_integrity=False) diff --git a/pandas/tests/indexing/test_copy_on_write.py b/pandas/tests/indexing/test_copy_on_write.py index 7994905f9125b..ce68b24ca718b 100644 --- a/pandas/tests/indexing/test_copy_on_write.py +++ b/pandas/tests/indexing/test_copy_on_write.py @@ -176,6 +176,8 @@ def test_subset_row_slice(using_array_manager): df_orig = df.copy() subset = df[1:3] + subset._mgr._verify_integrity() + assert np.may_share_memory(subset["a"].values, df["a"].values) if using_array_manager: @@ -200,6 +202,30 @@ def test_subset_row_slice(using_array_manager): tm.assert_frame_equal(df, df_orig) +def test_subset_column_slice(using_array_manager): + # Case: taking a subset of the columns of a DataFrame using a slice + # + afterwards modifying the subset + df = pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6], "c": [0.1, 0.2, 0.3]}) + df_orig = df.copy() + + subset = df.iloc[:, 1:] + subset._mgr._verify_integrity() + + if using_array_manager: + assert np.may_share_memory(subset["b"].values, df["b"].values) + + subset.iloc[0, 0] = 0 + assert not np.may_share_memory(subset["b"].values, df["b"].values) + + else: + subset.iloc[0, 0] = 0 + + expected = pd.DataFrame({"b": [0, 5, 6], "c": [0.1, 0.2, 0.3]}) + tm.assert_frame_equal(subset, expected) + # original parent dataframe is not modified (also not for BlockManager case) + tm.assert_frame_equal(df, df_orig) + + @pytest.mark.parametrize( "indexer", [slice(0, 2), np.array([True, True, False]), np.array([0, 1])], @@ -333,8 +359,8 @@ def test_subset_set_with_column_indexer(indexer, using_array_manager): expected = pd.DataFrame( {"a": [0, 0], "b": [0.0, 0.0], "c": [5, 6]}, index=range(1, 3) ) - if not using_array_manager: - expected["b"] = expected["b"].astype("int64") + # TODO full row slice .loc[:, idx] update inplace instead of overwrite? + expected["b"] = expected["b"].astype("int64") tm.assert_frame_equal(subset, expected) if using_array_manager: tm.assert_frame_equal(df, df_orig) From 693bc4f7cb1a6362fe87af596280d83a514f2f43 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Mon, 28 Jun 2021 09:41:52 +0200 Subject: [PATCH 08/21] fix series setitem -> don't update parent cache --- pandas/core/series.py | 6 ++++- pandas/tests/indexing/test_copy_on_write.py | 26 +++++++++++++++++++++ 2 files changed, 31 insertions(+), 1 deletion(-) diff --git a/pandas/core/series.py b/pandas/core/series.py index 5b8c1cd27c189..11d1af61d032e 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -1227,7 +1227,11 @@ def _maybe_update_cacher( if ref is None: del self._cacher else: - if len(self) == len(ref): + # for ArrayManager with CoW, we never want to update the parent + # DataFrame cache if the Series changed, and always pop the cached item + if len(self) == len(ref) and not isinstance( + self._mgr, SingleArrayManager + ): # otherwise, either self or ref has swapped in new arrays ref._maybe_cache_changed(cacher[0], self) else: diff --git a/pandas/tests/indexing/test_copy_on_write.py b/pandas/tests/indexing/test_copy_on_write.py index ce68b24ca718b..780e9d11b107c 100644 --- a/pandas/tests/indexing/test_copy_on_write.py +++ b/pandas/tests/indexing/test_copy_on_write.py @@ -477,6 +477,32 @@ def test_column_as_series(using_array_manager): tm.assert_frame_equal(df, df_orig) +def test_column_as_series_set_with_upcast(using_array_manager): + # Case: selecting a single column now also uses Copy-on-Write -> when + # setting a value causes an upcast, we don't need to update the parent + # DataFrame through the cache mechanism + df = pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6], "c": [0.1, 0.2, 0.3]}) + df_orig = df.copy() + + s = df["a"] + if using_array_manager: + s[0] = "foo" + else: + with pd.option_context("chained_assignment", "warn"): + with tm.assert_produces_warning(com.SettingWithCopyWarning): + s[0] = "foo" + + expected = pd.Series(["foo", 2, 3], dtype=object, name="a") + tm.assert_series_equal(s, expected) + if using_array_manager: + tm.assert_frame_equal(df, df_orig) + # ensure cached series on getitem is not the changed series + tm.assert_series_equal(df["a"], df_orig["a"]) + else: + df_orig["a"] = expected + tm.assert_frame_equal(df, df_orig) + + # TODO add tests for other indexing methods on the Series From a154591e6976729724c2ac1e26d269803ceff97f Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Mon, 28 Jun 2021 18:26:21 +0200 Subject: [PATCH 09/21] update indexing tests with new behaviour to get them passing --- pandas/core/internals/array_manager.py | 1 + pandas/tests/indexes/period/test_period.py | 8 +- .../multiindex/test_chaining_and_caching.py | 1 + .../tests/indexing/multiindex/test_setitem.py | 38 ++++-- .../indexing/test_chaining_and_caching.py | 126 ++++++++++++------ pandas/tests/indexing/test_iloc.py | 25 +++- pandas/tests/indexing/test_loc.py | 25 ++-- pandas/tests/indexing/test_scalar.py | 10 +- .../series/accessors/test_dt_accessor.py | 13 +- pandas/tests/series/indexing/test_indexing.py | 9 +- pandas/tests/series/methods/test_copy.py | 32 +++-- 11 files changed, 208 insertions(+), 80 deletions(-) diff --git a/pandas/core/internals/array_manager.py b/pandas/core/internals/array_manager.py index 59eeb04c7f644..2330788c997f5 100644 --- a/pandas/core/internals/array_manager.py +++ b/pandas/core/internals/array_manager.py @@ -378,6 +378,7 @@ def where(self: T, other, cond, align: bool, errors: str) -> T: # return self.apply_with_block("setitem", indexer=indexer, value=value) def putmask(self, mask, new, align: bool = True): + # TODO(CoW) check references to copy if needed if align: align_keys = ["new", "mask"] else: diff --git a/pandas/tests/indexes/period/test_period.py b/pandas/tests/indexes/period/test_period.py index 83c82c18f3d1e..519d21c044011 100644 --- a/pandas/tests/indexes/period/test_period.py +++ b/pandas/tests/indexes/period/test_period.py @@ -255,16 +255,20 @@ def test_is_(self): assert not index.is_(index - 2) assert not index.is_(index - 0) - def test_index_duplicate_periods(self): + def test_index_duplicate_periods(self, using_array_manager): # monotonic idx = PeriodIndex([2000, 2007, 2007, 2009, 2009], freq="A-JUN") ts = Series(np.random.randn(len(idx)), index=idx) + original = ts.copy() result = ts["2007"] expected = ts[1:3] tm.assert_series_equal(result, expected) result[:] = 1 - assert (ts[1:3] == 1).all() + if using_array_manager: + tm.assert_series_equal(ts, original) + else: + assert (ts[1:3] == 1).all() # not monotonic idx = PeriodIndex([2000, 2007, 2007, 2009, 2007], freq="A-JUN") diff --git a/pandas/tests/indexing/multiindex/test_chaining_and_caching.py b/pandas/tests/indexing/multiindex/test_chaining_and_caching.py index 6ccd44e698a8a..d4fe88c835ee1 100644 --- a/pandas/tests/indexing/multiindex/test_chaining_and_caching.py +++ b/pandas/tests/indexing/multiindex/test_chaining_and_caching.py @@ -12,6 +12,7 @@ import pandas.core.common as com +@td.skip_array_manager_not_yet_implemented def test_detect_chained_assignment(): # Inplace ops, originally from: # https://stackoverflow.com/questions/20508968/series-fillna-in-a-multiindex-dataframe-does-not-fill-is-this-a-bug diff --git a/pandas/tests/indexing/multiindex/test_setitem.py b/pandas/tests/indexing/multiindex/test_setitem.py index 5d0aeba4aebbc..c10bb6af195d2 100644 --- a/pandas/tests/indexing/multiindex/test_setitem.py +++ b/pandas/tests/indexing/multiindex/test_setitem.py @@ -390,16 +390,24 @@ def test_setitem_change_dtype(self, multiindex_dataframe_random_data): reindexed = dft.reindex(columns=[("foo", "two")]) tm.assert_series_equal(reindexed["foo", "two"], s > s.median()) - def test_set_column_scalar_with_loc(self, multiindex_dataframe_random_data): + def test_set_column_scalar_with_loc( + self, multiindex_dataframe_random_data, using_array_manager + ): frame = multiindex_dataframe_random_data subset = frame.index[[1, 4, 5]] frame.loc[subset] = 99 assert (frame.loc[subset].values == 99).all() + frame_original = frame.copy() + col = frame["B"] col[subset] = 97 - assert (frame.loc[subset, "B"] == 97).all() + if using_array_manager: + # chained setitem doesn't work with CoW + tm.assert_frame_equal(frame, frame_original) + else: + assert (frame.loc[subset, "B"] == 97).all() def test_nonunique_assignment_1750(self): df = DataFrame( @@ -472,21 +480,31 @@ def test_frame_setitem_view_direct(multiindex_dataframe_random_data): assert (df["foo"].values == 0).all() -def test_frame_setitem_copy_raises(multiindex_dataframe_random_data): +def test_frame_setitem_copy_raises( + multiindex_dataframe_random_data, using_array_manager +): # will raise/warn as its chained assignment df = multiindex_dataframe_random_data.T - msg = "A value is trying to be set on a copy of a slice from a DataFrame" - with pytest.raises(com.SettingWithCopyError, match=msg): + if using_array_manager: + # TODO(CoW) it would be nice if this could still warn/raise df["foo"]["one"] = 2 + else: + msg = "A value is trying to be set on a copy of a slice from a DataFrame" + with pytest.raises(com.SettingWithCopyError, match=msg): + df["foo"]["one"] = 2 -def test_frame_setitem_copy_no_write(multiindex_dataframe_random_data): +def test_frame_setitem_copy_no_write( + multiindex_dataframe_random_data, using_array_manager +): frame = multiindex_dataframe_random_data.T expected = frame df = frame.copy() - msg = "A value is trying to be set on a copy of a slice from a DataFrame" - with pytest.raises(com.SettingWithCopyError, match=msg): + if using_array_manager: df["foo"]["one"] = 2 + else: + msg = "A value is trying to be set on a copy of a slice from a DataFrame" + with pytest.raises(com.SettingWithCopyError, match=msg): + df["foo"]["one"] = 2 - result = df - tm.assert_frame_equal(result, expected) + tm.assert_frame_equal(df, expected) diff --git a/pandas/tests/indexing/test_chaining_and_caching.py b/pandas/tests/indexing/test_chaining_and_caching.py index a38c652953fab..8780ba2b7c194 100644 --- a/pandas/tests/indexing/test_chaining_and_caching.py +++ b/pandas/tests/indexing/test_chaining_and_caching.py @@ -31,7 +31,7 @@ def random_text(nobs=100): class TestCaching: - def test_slice_consolidate_invalidate_item_cache(self): + def test_slice_consolidate_invalidate_item_cache(self, using_array_manager): # this is chained assignment, but will 'work' with option_context("chained_assignment", None): @@ -51,7 +51,11 @@ def test_slice_consolidate_invalidate_item_cache(self): # Assignment to wrong series df["bb"].iloc[0] = 0.17 df._clear_item_cache() - tm.assert_almost_equal(df["bb"][0], 0.17) + if not using_array_manager: + tm.assert_almost_equal(df["bb"][0], 0.17) + else: + # with ArrayManager, parent is not mutated with chained assignment + tm.assert_almost_equal(df["bb"][0], 2.2) @pytest.mark.parametrize("do_ref", [True, False]) def test_setitem_cache_updating(self, do_ref): @@ -70,7 +74,7 @@ def test_setitem_cache_updating(self, do_ref): assert df.loc[0, "c"] == 0.0 assert df.loc[7, "c"] == 1.0 - def test_setitem_cache_updating_slices(self): + def test_setitem_cache_updating_slices(self, using_array_manager): # GH 7084 # not updating cache on series setting with slices expected = DataFrame( @@ -91,12 +95,17 @@ def test_setitem_cache_updating_slices(self): # try via a chain indexing # this actually works out = DataFrame({"A": [0, 0, 0]}, index=date_range("5/7/2014", "5/9/2014")) + out_original = out.copy() for ix, row in df.iterrows(): v = out[row["C"]][six:eix] + row["D"] out[row["C"]][six:eix] = v - tm.assert_frame_equal(out, expected) - tm.assert_series_equal(out["A"], expected["A"]) + if not using_array_manager: + tm.assert_frame_equal(out, expected) + tm.assert_series_equal(out["A"], expected["A"]) + else: + tm.assert_frame_equal(out, out_original) + tm.assert_series_equal(out["A"], out_original["A"]) out = DataFrame({"A": [0, 0, 0]}, index=date_range("5/7/2014", "5/9/2014")) for ix, row in df.iterrows(): @@ -122,6 +131,8 @@ def test_altering_series_clears_parent_cache(self): class TestChaining: + # TODO(CoW) fix Series setitem with mask + @td.skip_array_manager_not_yet_implemented def test_setitem_chained_setfault(self): # GH6026 @@ -157,18 +168,22 @@ def test_setitem_chained_setfault(self): tm.assert_frame_equal(result, expected) @pytest.mark.arm_slow - def test_detect_chained_assignment(self): + def test_detect_chained_assignment(self, using_array_manager): pd.set_option("chained_assignment", "raise") # work with the chain expected = DataFrame([[-5, 1], [-6, 3]], columns=list("AB")) df = DataFrame(np.arange(4).reshape(2, 2), columns=list("AB"), dtype="int64") + df_original = df.copy() assert df._is_copy is None df["A"][0] = -5 df["A"][1] = -6 - tm.assert_frame_equal(df, expected) + if using_array_manager: + tm.assert_frame_equal(df, df_original) + else: + tm.assert_frame_equal(df, expected) @pytest.mark.arm_slow def test_detect_chained_assignment_raises(self, using_array_manager): @@ -180,6 +195,7 @@ def test_detect_chained_assignment_raises(self, using_array_manager): "B": np.array(np.arange(2, 4), dtype=np.float64), } ) + df_original = df.copy() assert df._is_copy is None if not using_array_manager: @@ -196,12 +212,10 @@ def test_detect_chained_assignment_raises(self, using_array_manager): # a mixed dataframe df["A"][0] = -5 df["A"][1] = -6 - expected = DataFrame([[-5, 2], [-6, 3]], columns=list("AB")) - expected["B"] = expected["B"].astype("float64") - tm.assert_frame_equal(df, expected) + tm.assert_frame_equal(df, df_original) @pytest.mark.arm_slow - def test_detect_chained_assignment_fails(self): + def test_detect_chained_assignment_fails(self, using_array_manager): # Using a copy (the chain), fails df = DataFrame( @@ -211,11 +225,15 @@ def test_detect_chained_assignment_fails(self): } ) - with pytest.raises(com.SettingWithCopyError, match=msg): + if using_array_manager: + # TODO(CoW) can we still warn here? df.loc[0]["A"] = -5 + else: + with pytest.raises(com.SettingWithCopyError, match=msg): + df.loc[0]["A"] = -5 @pytest.mark.arm_slow - def test_detect_chained_assignment_doc_example(self): + def test_detect_chained_assignment_doc_example(self, using_array_manager): # Doc example df = DataFrame( @@ -226,30 +244,36 @@ def test_detect_chained_assignment_doc_example(self): ) assert df._is_copy is None - with pytest.raises(com.SettingWithCopyError, match=msg): + if using_array_manager: + # TODO(CoW) can we still warn here? indexer = df.a.str.startswith("o") df[indexer]["c"] = 42 + else: + with pytest.raises(com.SettingWithCopyError, match=msg): + indexer = df.a.str.startswith("o") + df[indexer]["c"] = 42 @pytest.mark.arm_slow def test_detect_chained_assignment_object_dtype(self, using_array_manager): expected = DataFrame({"A": [111, "bbb", "ccc"], "B": [1, 2, 3]}) df = DataFrame({"A": ["aaa", "bbb", "ccc"], "B": [1, 2, 3]}) - - with pytest.raises(com.SettingWithCopyError, match=msg): - df.loc[0]["A"] = 111 + df_original = df.copy() if not using_array_manager: + with pytest.raises(com.SettingWithCopyError, match=msg): + df.loc[0]["A"] = 111 + with pytest.raises(com.SettingWithCopyError, match=msg): df["A"][0] = 111 df.loc[0, "A"] = 111 + tm.assert_frame_equal(df, expected) else: - # INFO(ArrayManager) for ArrayManager it doesn't matter that it's - # a mixed dataframe + # TODO(CoW) can we still warn here? df["A"][0] = 111 - - tm.assert_frame_equal(df, expected) + df.loc[0]["A"] = 111 + tm.assert_frame_equal(df, df_original) @pytest.mark.arm_slow def test_detect_chained_assignment_is_copy_pickle(self): @@ -297,6 +321,7 @@ def test_detect_chained_assignment_implicit_take(self): df["letters"] = df["letters"].apply(str.lower) @pytest.mark.arm_slow + @td.skip_array_manager_invalid_test # _is_copy is not always set for AM def test_detect_chained_assignment_implicit_take2(self): # Implicitly take 2 @@ -354,15 +379,21 @@ def test_detect_chained_assignment_false_positives(self): str(df) @pytest.mark.arm_slow - def test_detect_chained_assignment_undefined_column(self): + def test_detect_chained_assignment_undefined_column(self, using_array_manager): # from SO: # https://stackoverflow.com/questions/24054495/potential-bug-setting-value-for-undefined-column-using-iloc df = DataFrame(np.arange(0, 9), columns=["count"]) df["group"] = "b" + df_original = df.copy() - with pytest.raises(com.SettingWithCopyError, match=msg): + if using_array_manager: + # TODO(CoW) can we still warn here? df.iloc[0:5]["group"] = "a" + tm.assert_frame_equal(df, df_original) + else: + with pytest.raises(com.SettingWithCopyError, match=msg): + df.iloc[0:5]["group"] = "a" @pytest.mark.arm_slow def test_detect_chained_assignment_changing_dtype(self, using_array_manager): @@ -376,32 +407,40 @@ def test_detect_chained_assignment_changing_dtype(self, using_array_manager): "D": ["a", "b", "c", "d", "e"], } ) + df_original = df.copy() - with pytest.raises(com.SettingWithCopyError, match=msg): - df.loc[2]["D"] = "foo" + if not using_array_manager: + with pytest.raises(com.SettingWithCopyError, match=msg): + df.loc[2]["D"] = "foo" - with pytest.raises(com.SettingWithCopyError, match=msg): - df.loc[2]["C"] = "foo" + with pytest.raises(com.SettingWithCopyError, match=msg): + df.loc[2]["C"] = "foo" - if not using_array_manager: with pytest.raises(com.SettingWithCopyError, match=msg): df["C"][2] = "foo" else: - # INFO(ArrayManager) for ArrayManager it doesn't matter if it's - # changing the dtype or not + # breakpoint() + # TODO(CoW) can we still warn here? + # df.loc[2]["D"] = "foo" + # df.loc[2]["C"] = "foo" df["C"][2] = "foo" - assert df.loc[2, "C"] == "foo" + tm.assert_frame_equal(df, df_original) - def test_setting_with_copy_bug(self): + def test_setting_with_copy_bug(self, using_array_manager): # operating on a copy df = DataFrame( {"a": list(range(4)), "b": list("ab.."), "c": ["a", "b", np.nan, "d"]} ) + df_original = df.copy() mask = pd.isna(df.c) - with pytest.raises(com.SettingWithCopyError, match=msg): + if not using_array_manager: + with pytest.raises(com.SettingWithCopyError, match=msg): + df[["c"]][mask] = df[["b"]][mask] + else: df[["c"]][mask] = df[["b"]][mask] + tm.assert_frame_equal(df, df_original) def test_setting_with_copy_bug_no_warning(self): # invalid warning as we are returning a new object @@ -412,6 +451,7 @@ def test_setting_with_copy_bug_no_warning(self): # this should not raise df2["y"] = ["g", "h", "i"] + @td.skip_array_manager_not_yet_implemented # TODO(CoW) can we still warn/raise? def test_detect_chained_assignment_warnings_errors(self): df = DataFrame({"A": ["aaa", "bbb", "ccc"], "B": [1, 2, 3]}) with option_context("chained_assignment", "warn"): @@ -422,18 +462,25 @@ def test_detect_chained_assignment_warnings_errors(self): with pytest.raises(com.SettingWithCopyError, match=msg): df.loc[0]["A"] = 111 - def test_detect_chained_assignment_warnings_filter_and_dupe_cols(self): + def test_detect_chained_assignment_warnings_filter_and_dupe_cols( + self, using_array_manager + ): # xref gh-13017. with option_context("chained_assignment", "warn"): df = DataFrame([[1, 2, 3], [4, 5, 6], [7, 8, -9]], columns=["a", "a", "c"]) + df_original = df.copy() - with tm.assert_produces_warning(com.SettingWithCopyWarning): + warn = None if using_array_manager else com.SettingWithCopyWarning + with tm.assert_produces_warning(warn): df.c.loc[df.c > 0] = None expected = DataFrame( [[1, 2, 3], [4, 5, 6], [7, 8, -9]], columns=["a", "a", "c"] ) - tm.assert_frame_equal(df, expected) + if using_array_manager: + tm.assert_frame_equal(df, df_original) + else: + tm.assert_frame_equal(df, expected) # TODO(ArrayManager) fast_xs with array-like scalars is not yet working @td.skip_array_manager_not_yet_implemented @@ -484,7 +531,7 @@ def test_cache_updating2(self): expected = Series([0, 0, 0, 2, 0], name="f") tm.assert_series_equal(df.f, expected) - def test_iloc_setitem_chained_assignment(self): + def test_iloc_setitem_chained_assignment(self, using_array_manager): # GH#3970 with option_context("chained_assignment", None): df = DataFrame({"aa": range(5), "bb": [2.2] * 5}) @@ -498,7 +545,10 @@ def test_iloc_setitem_chained_assignment(self): df_tmp = df.iloc[ck] # noqa df["bb"].iloc[0] = 0.15 - assert df["bb"].iloc[0] == 0.15 + if not using_array_manager: + assert df["bb"].iloc[0] == 0.15 + else: + assert df["bb"].iloc[0] == 2.2 def test_getitem_loc_assignment_slice_state(self): # GH 13569 diff --git a/pandas/tests/indexing/test_iloc.py b/pandas/tests/indexing/test_iloc.py index b04a2c86a79d7..987824a735607 100644 --- a/pandas/tests/indexing/test_iloc.py +++ b/pandas/tests/indexing/test_iloc.py @@ -80,10 +80,13 @@ def test_iloc_setitem_fullcol_categorical(self, indexer, key, using_array_manage indexer(df)[key, 0] = cat overwrite = isinstance(key, slice) and key == slice(None) + if using_array_manager and indexer == tm.loc: + # TODO(ArrayManager) with loc, slice(3) gets converted into slice(0, 3) + # which is then considered as "full" slice and does overwrite. For iloc + # this conversion is not done, and so it doesn't overwrite + overwrite = overwrite or (isinstance(key, slice) and key == slice(3)) - if overwrite or using_array_manager: - # TODO(ArrayManager) we always overwrite because ArrayManager takes - # the "split" path, which still overwrites + if overwrite: # TODO: GH#39986 this probably shouldn't behave differently expected = DataFrame({0: cat}) assert not np.shares_memory(df.values, orig_vals) @@ -107,6 +110,8 @@ def test_iloc_setitem_fullcol_categorical(self, indexer, key, using_array_manage orig_vals = df.values indexer(df)[key, 0] = cat expected = DataFrame({0: cat, 1: range(3)}) + if using_array_manager and not overwrite: + expected[0] = expected[0].astype(object) tm.assert_frame_equal(df, expected) # TODO(ArrayManager) does not yet update parent @@ -847,7 +852,11 @@ def test_identity_slice_returns_new_object(self, using_array_manager): # should also be a shallow copy original_series[:3] = [7, 8, 9] - assert all(sliced_series[:3] == [7, 8, 9]) + if using_array_manager: + # shallow copy not updated (CoW) + assert all(sliced_series[:3] == [1, 2, 3]) + else: + assert all(sliced_series[:3] == [7, 8, 9]) def test_indexing_zerodim_np_array(self): # GH24919 @@ -1308,8 +1317,9 @@ def test_frame_iloc_setitem_callable(self): class TestILocSeries: - def test_iloc(self): + def test_iloc(self, using_array_manager): ser = Series(np.random.randn(10), index=list(range(0, 20, 2))) + ser_original = ser.copy() for i in range(len(ser)): result = ser.iloc[i] @@ -1323,7 +1333,10 @@ def test_iloc(self): # test slice is a view result[:] = 0 - assert (ser[1:3] == 0).all() + if using_array_manager: + tm.assert_series_equal(ser, ser_original) + else: + assert (ser[1:3] == 0).all() # list of integers result = ser.iloc[[0, 2, 3, 4, 5]] diff --git a/pandas/tests/indexing/test_loc.py b/pandas/tests/indexing/test_loc.py index e96b25418d408..cf818be9bce26 100644 --- a/pandas/tests/indexing/test_loc.py +++ b/pandas/tests/indexing/test_loc.py @@ -600,29 +600,33 @@ def test_loc_setitem_frame_with_reindex(self, using_array_manager): # setting integer values into a float dataframe with loc is inplace, # so we retain float dtype ser = Series([2, 3, 1], index=[3, 5, 4], dtype=float) - if using_array_manager: - # TODO(ArrayManager) with "split" path, we still overwrite the column - # and therefore don't take the dtype of the underlying object into account - ser = Series([2, 3, 1], index=[3, 5, 4], dtype="int64") expected = DataFrame({"A": ser}) tm.assert_frame_equal(df, expected) - def test_loc_setitem_frame_with_reindex_mixed(self): + def test_loc_setitem_frame_with_reindex_mixed(self, using_array_manager): # GH#40480 df = DataFrame(index=[3, 5, 4], columns=["A", "B"], dtype=float) df["B"] = "string" df.loc[[4, 3, 5], "A"] = np.array([1, 2, 3], dtype="int64") - ser = Series([2, 3, 1], index=[3, 5, 4], dtype="int64") + ser = Series([2, 3, 1], index=[3, 5, 4], dtype=float) + if not using_array_manager: + # when using BlockManager, this takes the "split" path, which + # still overwrites the column + ser = Series([2, 3, 1], index=[3, 5, 4], dtype="int64") expected = DataFrame({"A": ser}) expected["B"] = "string" tm.assert_frame_equal(df, expected) - def test_loc_setitem_frame_with_inverted_slice(self): + def test_loc_setitem_frame_with_inverted_slice(self, using_array_manager): # GH#40480 df = DataFrame(index=[1, 2, 3], columns=["A", "B"], dtype=float) df["B"] = "string" df.loc[slice(3, 0, -1), "A"] = np.array([1, 2, 3], dtype="int64") - expected = DataFrame({"A": [3, 2, 1], "B": "string"}, index=[1, 2, 3]) + expected = DataFrame({"A": [3.0, 2.0, 1.0], "B": "string"}, index=[1, 2, 3]) + if not using_array_manager: + # when using BlockManager, this takes the "split" path, which + # still overwrites the column + expected["A"] = expected["A"].astype("int64") tm.assert_frame_equal(df, expected) # TODO(ArrayManager) "split" path overwrites column and therefore don't take @@ -988,7 +992,10 @@ def test_identity_slice_returns_new_object(self, using_array_manager): assert original_series[:] is not original_series original_series[:3] = [7, 8, 9] - assert all(sliced_series[:3] == [7, 8, 9]) + if using_array_manager: + assert all(sliced_series[:3] == [1, 2, 3]) + else: + assert all(sliced_series[:3] == [7, 8, 9]) @pytest.mark.xfail(reason="accidental fix reverted - GH37497") def test_loc_copy_vs_view(self): diff --git a/pandas/tests/indexing/test_scalar.py b/pandas/tests/indexing/test_scalar.py index 39611bce2b4fa..1f99947d77d36 100644 --- a/pandas/tests/indexing/test_scalar.py +++ b/pandas/tests/indexing/test_scalar.py @@ -7,6 +7,8 @@ import numpy as np import pytest +import pandas.util._test_decorators as td + from pandas import ( DataFrame, Series, @@ -46,7 +48,11 @@ def _check(f, func, values=False): for f in [d["ints"], d["uints"], d["labels"], d["ts"], d["floats"]]: _check(f, "at") - @pytest.mark.parametrize("kind", ["series", "frame"]) + # TODO(CoW) fix at/iat setting on a DataFrame + @pytest.mark.parametrize( + "kind", + ["series", pytest.param("frame", marks=td.skip_array_manager_invalid_test)], + ) def test_at_and_iat_set(self, kind): def _check(f, func, values=False): @@ -207,6 +213,8 @@ def test_mixed_index_at_iat_loc_iloc_dataframe(self): with pytest.raises(KeyError, match="^3$"): df.loc[0, 3] + # TODO(CoW) fix at/iat setting on a DataFrame + @td.skip_array_manager_invalid_test def test_iat_setter_incompatible_assignment(self): # GH 23236 result = DataFrame({"a": [0, 1], "b": [4, 5]}) diff --git a/pandas/tests/series/accessors/test_dt_accessor.py b/pandas/tests/series/accessors/test_dt_accessor.py index 61a22dad5d09b..2ee51c9e33566 100644 --- a/pandas/tests/series/accessors/test_dt_accessor.py +++ b/pandas/tests/series/accessors/test_dt_accessor.py @@ -41,7 +41,7 @@ class TestSeriesDatetimeValues: - def test_dt_namespace_accessor(self): + def test_dt_namespace_accessor(self, using_array_manager): # GH 7207, 11128 # test .dt namespace accessor @@ -258,10 +258,15 @@ def get_dir(s): s.dt.hour = 5 # trying to set a copy - msg = "modifications to a property of a datetimelike.+not supported" - with pd.option_context("chained_assignment", "raise"): - with pytest.raises(com.SettingWithCopyError, match=msg): + if using_array_manager: + # TODO(CoW) it would be nice to keep a warning/error for this case + with pd.option_context("chained_assignment", "raise"): s.dt.hour[0] = 5 + else: + msg = "modifications to a property of a datetimelike.+not supported" + with pd.option_context("chained_assignment", "raise"): + with pytest.raises(com.SettingWithCopyError, match=msg): + s.dt.hour[0] = 5 @pytest.mark.parametrize( "method, dates", diff --git a/pandas/tests/series/indexing/test_indexing.py b/pandas/tests/series/indexing/test_indexing.py index 6c3587c7eeada..c0a52e642dce4 100644 --- a/pandas/tests/series/indexing/test_indexing.py +++ b/pandas/tests/series/indexing/test_indexing.py @@ -204,7 +204,8 @@ def test_basic_getitem_setitem_corner(datetime_series): datetime_series[[5, slice(None, None)]] = 2 -def test_slice(string_series, object_series): +def test_slice(string_series, object_series, using_array_manager): + original = string_series.copy() numSlice = string_series[10:20] numSliceEnd = string_series[-10:] objSlice = object_series[10:20] @@ -222,7 +223,11 @@ def test_slice(string_series, object_series): sl = string_series[10:20] sl[:] = 0 - assert (string_series[10:20] == 0).all() + if using_array_manager: + # Doesn't modify parent (CoW) + tm.assert_series_equal(string_series, original) + else: + assert (string_series[10:20] == 0).all() def test_timedelta_assignment(): diff --git a/pandas/tests/series/methods/test_copy.py b/pandas/tests/series/methods/test_copy.py index 8aa5c14812dc0..26966592d563b 100644 --- a/pandas/tests/series/methods/test_copy.py +++ b/pandas/tests/series/methods/test_copy.py @@ -9,20 +9,28 @@ class TestCopy: - @pytest.mark.parametrize("deep", [None, False, True]) - def test_copy(self, deep): + @pytest.mark.parametrize("deep", ["default", None, False, True]) + def test_copy(self, deep, using_array_manager): ser = Series(np.arange(10), dtype="float64") # default deep is True - if deep is None: + if deep == "default": ser2 = ser.copy() else: ser2 = ser.copy(deep=deep) + if using_array_manager: + # INFO(ArrayManager) a shallow copy doesn't yet copy the data + # but parent will not be modified (CoW) + if deep is None or deep is False: + assert np.may_share_memory(ser.values, ser2.values) + else: + assert not np.may_share_memory(ser.values, ser2.values) + ser2[::2] = np.NaN - if deep is None or deep is True: + if deep == "default" or deep is None or deep is True or using_array_manager: # Did not modify original Series assert np.isnan(ser2[0]) assert not np.isnan(ser[0]) @@ -31,8 +39,8 @@ def test_copy(self, deep): assert np.isnan(ser2[0]) assert np.isnan(ser[0]) - @pytest.mark.parametrize("deep", [None, False, True]) - def test_copy_tzaware(self, deep): + @pytest.mark.parametrize("deep", ["default", None, False, True]) + def test_copy_tzaware(self, deep, using_array_manager): # GH#11794 # copy of tz-aware expected = Series([Timestamp("2012/01/01", tz="UTC")]) @@ -40,15 +48,23 @@ def test_copy_tzaware(self, deep): ser = Series([Timestamp("2012/01/01", tz="UTC")]) - if deep is None: + if deep == "default": ser2 = ser.copy() else: ser2 = ser.copy(deep=deep) + if using_array_manager: + # INFO(ArrayManager) a shallow copy doesn't yet copy the data + # but parent will not be modified (CoW) + if deep is None or deep is False: + assert np.may_share_memory(ser.values, ser2.values) + else: + assert not np.may_share_memory(ser.values, ser2.values) + ser2[0] = Timestamp("1999/01/01", tz="UTC") # default deep is True - if deep is None or deep is True: + if deep == "default" or deep is None or deep is True or using_array_manager: # Did not modify original Series tm.assert_series_equal(ser2, expected2) tm.assert_series_equal(ser, expected) From 67413409a5dd06cacaa242b8926f5f5ad8f5ef10 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Wed, 4 Aug 2021 14:33:54 +0200 Subject: [PATCH 10/21] fix new test + skip Series.update tests for now --- pandas/core/frame.py | 8 ++++++-- pandas/tests/frame/methods/test_update.py | 4 ++++ .../tests/indexing/test_chaining_and_caching.py | 17 +++++++++++++---- pandas/tests/indexing/test_partial.py | 8 +++++++- pandas/tests/series/indexing/test_indexing.py | 4 ++++ pandas/tests/series/methods/test_update.py | 3 +++ 6 files changed, 37 insertions(+), 7 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 6038994b55378..0729039fc2e17 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -3801,8 +3801,12 @@ def _set_value( """ try: if takeable: - series = self._ixs(col, axis=1) - series._set_value(index, value, takeable=True) + if isinstance(self._mgr, ArrayManager): + # with CoW, we can't use intermediate series + self._mgr.column_setitem(col, index, value) + else: + series = self._ixs(col, axis=1) + series._set_value(index, value, takeable=True) return series = self._get_item_cache(col) diff --git a/pandas/tests/frame/methods/test_update.py b/pandas/tests/frame/methods/test_update.py index 408113e9bc417..add190de73995 100644 --- a/pandas/tests/frame/methods/test_update.py +++ b/pandas/tests/frame/methods/test_update.py @@ -1,6 +1,8 @@ import numpy as np import pytest +import pandas.util._test_decorators as td + import pandas as pd from pandas import ( DataFrame, @@ -138,6 +140,8 @@ def test_update_datetime_tz(self): expected = DataFrame([pd.Timestamp("2019", tz="UTC")]) tm.assert_frame_equal(result, expected) + # TODO(CoW) what should the update method do? -> deprecate this? + @td.skip_array_manager_not_yet_implemented def test_update_with_different_dtype(self): # GH#3217 df = DataFrame({"a": [1, 3], "b": [np.nan, 2]}) diff --git a/pandas/tests/indexing/test_chaining_and_caching.py b/pandas/tests/indexing/test_chaining_and_caching.py index fc82be4325b3d..8eba27a907280 100644 --- a/pandas/tests/indexing/test_chaining_and_caching.py +++ b/pandas/tests/indexing/test_chaining_and_caching.py @@ -483,14 +483,23 @@ def test_detect_chained_assignment_warnings_filter_and_dupe_cols( tm.assert_frame_equal(df, expected) @pytest.mark.parametrize("rhs", [3, DataFrame({0: [1, 2, 3, 4]})]) - def test_detect_chained_assignment_warning_stacklevel(self, rhs): + def test_detect_chained_assignment_warning_stacklevel( + self, rhs, using_array_manager + ): # GH#42570 df = DataFrame(np.arange(25).reshape(5, 5)) + df_original = df.copy() chained = df.loc[:3] with option_context("chained_assignment", "warn"): - with tm.assert_produces_warning(com.SettingWithCopyWarning) as t: - chained[2] = rhs - assert t[0].filename == __file__ + if not using_array_manager: + with tm.assert_produces_warning(com.SettingWithCopyWarning) as t: + chained[2] = rhs + assert t[0].filename == __file__ + else: + # INFO(CoW) no warning, and original dataframe not changed + with tm.assert_produces_warning(None): + chained[2] = rhs + tm.assert_frame_equal(df, df_original) # TODO(ArrayManager) fast_xs with array-like scalars is not yet working @td.skip_array_manager_not_yet_implemented diff --git a/pandas/tests/indexing/test_partial.py b/pandas/tests/indexing/test_partial.py index 693e67652c912..dd28baa9a45f9 100644 --- a/pandas/tests/indexing/test_partial.py +++ b/pandas/tests/indexing/test_partial.py @@ -74,7 +74,13 @@ def test_partial_setting(self): with pytest.raises(IndexError, match=msg): df.iloc[4, 2] = 5.0 - msg = "index 2 is out of bounds for axis 0 with size 2" + msg = "|".join( + [ + "index 2 is out of bounds for axis 0 with size 2", + # TODO(ArrayManager) improve error message + "list index out of range", + ] + ) with pytest.raises(IndexError, match=msg): df.iat[4, 2] = 5.0 diff --git a/pandas/tests/series/indexing/test_indexing.py b/pandas/tests/series/indexing/test_indexing.py index c0a52e642dce4..5c6030d52af5c 100644 --- a/pandas/tests/series/indexing/test_indexing.py +++ b/pandas/tests/series/indexing/test_indexing.py @@ -5,6 +5,8 @@ import numpy as np import pytest +import pandas.util._test_decorators as td + from pandas import ( DataFrame, IndexSlice, @@ -244,6 +246,8 @@ def test_timedelta_assignment(): tm.assert_series_equal(s, expected) +# TODO(CoW) what should the update method do? -> deprecate this? +@td.skip_array_manager_not_yet_implemented def test_underlying_data_conversion(): # GH 4080 df = DataFrame({c: [1, 2, 3] for c in ["a", "b", "c"]}) diff --git a/pandas/tests/series/methods/test_update.py b/pandas/tests/series/methods/test_update.py index d9d6641d54237..17d05711ffefe 100644 --- a/pandas/tests/series/methods/test_update.py +++ b/pandas/tests/series/methods/test_update.py @@ -14,6 +14,9 @@ class TestUpdate: + + # TODO(CoW) what should the update method do? -> deprecate this? + @td.skip_array_manager_not_yet_implemented def test_update(self): s = Series([1.5, np.nan, 3.0, 4.0, np.nan]) s2 = Series([np.nan, 3.5, np.nan, 5.0]) From c4527e9d32673525306d906bc97ddbb8289093a2 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Wed, 4 Aug 2021 14:49:24 +0200 Subject: [PATCH 11/21] remove chained assignment outside indexing tests --- pandas/tests/frame/methods/test_dropna.py | 2 +- .../tests/frame/methods/test_interpolate.py | 28 ++++++++++--------- pandas/tests/frame/methods/test_isin.py | 6 ++-- 3 files changed, 19 insertions(+), 17 deletions(-) diff --git a/pandas/tests/frame/methods/test_dropna.py b/pandas/tests/frame/methods/test_dropna.py index 76a6f3aa25362..bc2b48d3312d7 100644 --- a/pandas/tests/frame/methods/test_dropna.py +++ b/pandas/tests/frame/methods/test_dropna.py @@ -66,7 +66,7 @@ def test_dropIncompleteRows(self, float_frame): def test_dropna(self): df = DataFrame(np.random.randn(6, 4)) - df[2][:2] = np.nan + df.iloc[:2, 2] = np.nan dropped = df.dropna(axis=1) expected = df.loc[:, [0, 1, 3]] diff --git a/pandas/tests/frame/methods/test_interpolate.py b/pandas/tests/frame/methods/test_interpolate.py index d0551ffd5cffe..45129f7c0b366 100644 --- a/pandas/tests/frame/methods/test_interpolate.py +++ b/pandas/tests/frame/methods/test_interpolate.py @@ -102,34 +102,34 @@ def test_interp_various(self): expected = df.copy() result = df.interpolate(method="polynomial", order=1) - expected.A.loc[3] = 2.66666667 - expected.A.loc[13] = 5.76923076 + expected.loc[3, "A"] = 2.66666667 + expected.loc[13, "A"] = 5.76923076 tm.assert_frame_equal(result, expected) result = df.interpolate(method="cubic") # GH #15662. - expected.A.loc[3] = 2.81547781 - expected.A.loc[13] = 5.52964175 + expected.loc[3, "A"] = 2.81547781 + expected.loc[13, "A"] = 5.52964175 tm.assert_frame_equal(result, expected) result = df.interpolate(method="nearest") - expected.A.loc[3] = 2 - expected.A.loc[13] = 5 + expected.loc[3, "A"] = 2 + expected.loc[13, "A"] = 5 tm.assert_frame_equal(result, expected, check_dtype=False) result = df.interpolate(method="quadratic") - expected.A.loc[3] = 2.82150771 - expected.A.loc[13] = 6.12648668 + expected.loc[3, "A"] = 2.82150771 + expected.loc[13, "A"] = 6.12648668 tm.assert_frame_equal(result, expected) result = df.interpolate(method="slinear") - expected.A.loc[3] = 2.66666667 - expected.A.loc[13] = 5.76923077 + expected.loc[3, "A"] = 2.66666667 + expected.loc[13, "A"] = 5.76923077 tm.assert_frame_equal(result, expected) result = df.interpolate(method="zero") - expected.A.loc[3] = 2.0 - expected.A.loc[13] = 5 + expected.loc[3, "A"] = 2.0 + expected.loc[13, "A"] = 5 tm.assert_frame_equal(result, expected, check_dtype=False) @td.skip_if_no_scipy @@ -218,7 +218,7 @@ def test_interp_leading_nans(self, check_scipy): ) result = df.interpolate() expected = df.copy() - expected["B"].loc[3] = -3.75 + expected.loc[3, "B"] = -3.75 tm.assert_frame_equal(result, expected) if check_scipy: @@ -254,6 +254,8 @@ def test_interp_raise_on_all_object_dtype(self): with pytest.raises(TypeError, match=msg): df.interpolate() + # TODO(CoW) inplace method on Series -> OK with not updating parent? + @td.skip_array_manager_not_yet_implemented def test_interp_inplace(self): df = DataFrame({"a": [1.0, 2.0, np.nan, 4.0]}) expected = DataFrame({"a": [1.0, 2.0, 3.0, 4.0]}) diff --git a/pandas/tests/frame/methods/test_isin.py b/pandas/tests/frame/methods/test_isin.py index d2ebd09c4cc48..e924963f588f3 100644 --- a/pandas/tests/frame/methods/test_isin.py +++ b/pandas/tests/frame/methods/test_isin.py @@ -79,8 +79,8 @@ def test_isin_df(self): df2 = DataFrame({"A": [0, 2, 12, 4], "B": [2, np.nan, 4, 5]}) expected = DataFrame(False, df1.index, df1.columns) result = df1.isin(df2) - expected["A"].loc[[1, 3]] = True - expected["B"].loc[[0, 2]] = True + expected.loc[[1, 3], "A"] = True + expected.loc[[0, 2], "B"] = True tm.assert_frame_equal(result, expected) # partial overlapping columns @@ -133,7 +133,7 @@ def test_isin_against_series(self): ) s = Series([1, 3, 11, 4], index=["a", "b", "c", "d"]) expected = DataFrame(False, index=df.index, columns=df.columns) - expected["A"].loc["a"] = True + expected.loc["a", "A"] = True expected.loc["d"] = True result = df.isin(s) tm.assert_frame_equal(result, expected) From b33aaf2f4ae389760d38a930a8357aaae0b6771f Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Wed, 4 Aug 2021 15:06:01 +0200 Subject: [PATCH 12/21] fix DataFrame.fillna + more chained setitem --- pandas/core/generic.py | 6 ++++++ pandas/tests/frame/methods/test_combine_first.py | 2 +- pandas/tests/frame/methods/test_cov_corr.py | 6 ++++-- pandas/tests/frame/test_arithmetic.py | 3 ++- 4 files changed, 13 insertions(+), 4 deletions(-) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index ab1796b797612..cc2a4a4802d84 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -6310,6 +6310,12 @@ def fillna( obj = result[k] downcast_k = downcast if not is_dict else downcast.get(k) obj.fillna(v, limit=limit, inplace=True, downcast=downcast_k) + if isinstance(self._mgr, ArrayManager): + # CoW -> we need to assign filled Series back to obj + # TODO(CoW) we should avoid this here, with inplace=False + # we already copied the original obj, in theory no need + # for copying on write again + result[k] = obj return result if not inplace else None elif not is_list_like(value): diff --git a/pandas/tests/frame/methods/test_combine_first.py b/pandas/tests/frame/methods/test_combine_first.py index dd91b32c8eb8c..5e5256e1d18c2 100644 --- a/pandas/tests/frame/methods/test_combine_first.py +++ b/pandas/tests/frame/methods/test_combine_first.py @@ -66,7 +66,7 @@ def test_combine_first(self, float_frame): assert (combined["A"][:10] == 1).all() # reverse overlap - tail["A"][:10] = 0 + tail.iloc[:10, tail.columns.get_loc("A")] = 0 combined = tail.combine_first(head) assert (combined["A"][:10] == 0).all() diff --git a/pandas/tests/frame/methods/test_cov_corr.py b/pandas/tests/frame/methods/test_cov_corr.py index 352d95156bf98..d01288114caf7 100644 --- a/pandas/tests/frame/methods/test_cov_corr.py +++ b/pandas/tests/frame/methods/test_cov_corr.py @@ -27,8 +27,10 @@ def test_cov(self, float_frame, float_string_frame): # with NAs frame = float_frame.copy() - frame["A"][:5] = np.nan - frame["B"][5:10] = np.nan + # frame["A"][:5] = np.nan + frame.iloc[:5, frame.columns.get_loc("A")] = np.nan + # frame["B"][5:10] = np.nan + frame.iloc[5:10, frame.columns.get_loc("B")] = np.nan result = frame.cov(min_periods=len(frame) - 8) expected = frame.cov() expected.loc["A", "B"] = np.nan diff --git a/pandas/tests/frame/test_arithmetic.py b/pandas/tests/frame/test_arithmetic.py index afa9593807acc..c5d6a1b0a20a2 100644 --- a/pandas/tests/frame/test_arithmetic.py +++ b/pandas/tests/frame/test_arithmetic.py @@ -1183,7 +1183,8 @@ def test_combineFrame(self, float_frame, mixed_float_frame, mixed_int_frame): frame_copy = float_frame.reindex(float_frame.index[::2]) del frame_copy["D"] - frame_copy["C"][:5] = np.nan + # adding NAs to column "C" + frame_copy.iloc[:5, frame_copy.columns.get_loc("C")] = np.nan added = float_frame + frame_copy From f5da03f0a918d73c9bc10a746fef141b12d838f7 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Sun, 8 Aug 2021 10:25:43 +0200 Subject: [PATCH 13/21] reindex/take on columns to use copy-on-write --- pandas/core/frame.py | 2 +- pandas/core/generic.py | 6 ++---- pandas/core/internals/array_manager.py | 10 ++++++++-- pandas/core/internals/managers.py | 4 ++++ pandas/tests/indexing/test_copy_on_write.py | 20 ++++++++++++++++++++ 5 files changed, 35 insertions(+), 7 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 0729039fc2e17..4fc7f907480c5 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -4726,7 +4726,7 @@ def set_axis(self, labels, axis: Axis = 0, inplace: bool = False): "labels", [ ("method", None), - ("copy", True), + ("copy", None), ("level", None), ("fill_value", np.nan), ("limit", None), diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 536b395fa7a50..ab739d96bfa4a 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -4781,7 +4781,7 @@ def reindex(self: FrameOrSeries, *args, **kwargs) -> FrameOrSeries: axes, kwargs = self._construct_axes_from_arguments(args, kwargs) method = missing.clean_reindex_fill_method(kwargs.pop("method", None)) level = kwargs.pop("level", None) - copy = kwargs.pop("copy", True) + copy = kwargs.pop("copy", None) limit = kwargs.pop("limit", None) tolerance = kwargs.pop("tolerance", None) fill_value = kwargs.pop("fill_value", None) @@ -4806,9 +4806,7 @@ def reindex(self: FrameOrSeries, *args, **kwargs) -> FrameOrSeries: for axis, ax in axes.items() if ax is not None ): - if copy: - return self.copy() - return self + return self.copy(deep=copy) # check if we are a multi reindex if self._needs_reindex_multi(axes, method, level): diff --git a/pandas/core/internals/array_manager.py b/pandas/core/internals/array_manager.py index b532835cd0594..209dc6cacb5cc 100644 --- a/pandas/core/internals/array_manager.py +++ b/pandas/core/internals/array_manager.py @@ -612,6 +612,10 @@ def _reindex_indexer( pandas-indexer with -1's only. """ + if copy is None: + # use shallow copy + copy = False + if indexer is None: if new_axis is self._axes[axis] and not copy: return self @@ -640,9 +644,11 @@ def _reindex_indexer( else: # reusing full column array -> track with reference arr = self.arrays[i] - ref = weakref.ref(arr) if copy: arr = arr.copy() + ref = None + else: + ref = weakref.ref(arr) new_arrays.append(arr) refs.append(ref) @@ -692,7 +698,7 @@ def take(self: T, indexer, axis: int = 1, verify: bool = True) -> T: new_labels = self._axes[axis].take(indexer) return self._reindex_indexer( - new_axis=new_labels, indexer=indexer, axis=axis, allow_dups=True + new_axis=new_labels, indexer=indexer, axis=axis, allow_dups=True, copy=None ) def _make_na_array(self, fill_value=None, use_na_proxy=False): diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py index cbf58c4b6c1d7..d33f5df684494 100644 --- a/pandas/core/internals/managers.py +++ b/pandas/core/internals/managers.py @@ -645,6 +645,10 @@ def reindex_indexer( pandas-indexer with -1's only. """ + if copy is None: + # preserve deep copy for BlockManager with copy=None + copy = True + if indexer is None: if new_axis is self.axes[axis] and not copy: return self diff --git a/pandas/tests/indexing/test_copy_on_write.py b/pandas/tests/indexing/test_copy_on_write.py index 780e9d11b107c..9f149b88b0707 100644 --- a/pandas/tests/indexing/test_copy_on_write.py +++ b/pandas/tests/indexing/test_copy_on_write.py @@ -121,6 +121,26 @@ def test_rename_columns_modify_parent(using_array_manager): tm.assert_frame_equal(df2, df2_orig) +def test_reindex_columns(using_array_manager): + # Case: reindexing the column returns a new dataframe + # + afterwards modifying the result + df = pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6], "c": [0.1, 0.2, 0.3]}) + df_orig = df.copy() + df2 = df.reindex(columns=["a", "c"]) + + if using_array_manager: + # still shares memory (df2 is a shallow copy) + assert np.may_share_memory(df2["a"].values, df["a"].values) + else: + assert not np.may_share_memory(df2["a"].values, df["a"].values) + # mutating df2 triggers a copy-on-write for that column + df2.iloc[0, 0] = 0 + assert not np.may_share_memory(df2["a"].values, df["a"].values) + if using_array_manager: + assert np.may_share_memory(df2["c"].values, df["c"].values) + tm.assert_frame_equal(df, df_orig) + + # ----------------------------------------------------------------------------- # Indexing operations taking subset + modifying the subset/parent From c63275750b14ad9aab3d41fdc0c5e3157a4d1bca Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Sun, 8 Aug 2021 11:08:23 +0200 Subject: [PATCH 14/21] fix some typing issues --- pandas/core/frame.py | 3 +++ pandas/core/internals/array_manager.py | 21 ++++++++++++++++----- 2 files changed, 19 insertions(+), 5 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 4fc7f907480c5..65df053890620 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -3803,6 +3803,9 @@ def _set_value( if takeable: if isinstance(self._mgr, ArrayManager): # with CoW, we can't use intermediate series + # with takeable=True, we know that index is positional and + # not a generic hashable label + index = cast(int, index) self._mgr.column_setitem(col, index, value) else: series = self._ixs(col, axis=1) diff --git a/pandas/core/internals/array_manager.py b/pandas/core/internals/array_manager.py index 209dc6cacb5cc..d0e4f9a45e0f9 100644 --- a/pandas/core/internals/array_manager.py +++ b/pandas/core/internals/array_manager.py @@ -129,7 +129,7 @@ def __init__( self, arrays: list[np.ndarray | ExtensionArray], axes: list[Index], - refs: list[weakref.ref | None] | None, + refs: list[weakref.ref | None] | None = None, verify_integrity: bool = True, ): raise NotImplementedError @@ -562,7 +562,12 @@ def copy_func(ax): new_arrays = list(self.arrays) refs = [weakref.ref(arr) for arr in self.arrays] - return type(self)(new_arrays, new_axes, refs, verify_integrity=False) + # error: Argument 3 to "BaseArrayManager" has incompatible type + # "Optional[List[ReferenceType[Union[ndarray[Any, Any], ExtensionArray]]]]"; + # expected "Optional[List[Optional[ReferenceType[Any]]]]" [arg-type] + return type(self)( + new_arrays, new_axes, refs, verify_integrity=False # type: ignore[arg-type] + ) def reindex_indexer( self: T, @@ -571,7 +576,7 @@ def reindex_indexer( axis: int, fill_value=None, allow_dups: bool = False, - copy: bool = True, + copy: bool | None = True, # ignored keywords consolidate: bool = True, only_slice: bool = False, @@ -596,7 +601,7 @@ def _reindex_indexer( axis: int, fill_value=None, allow_dups: bool = False, - copy: bool = True, + copy: bool | None = True, use_na_proxy: bool = False, ) -> T: """ @@ -632,6 +637,7 @@ def _reindex_indexer( if axis >= self.ndim: raise IndexError("Requested axis not found in manager") + refs: list[weakref.ref | None] | None = None if axis == 1: new_arrays = [] refs = [] @@ -830,7 +836,12 @@ def get_slice(self, slobj: slice, axis: int = 0) -> ArrayManager: new_axes = list(self._axes) new_axes[axis] = new_axes[axis]._getitem_slice(slobj) - return type(self)(arrays, new_axes, refs, verify_integrity=False) + # error: Argument 3 to "ArrayManager" has incompatible type + # "List[ReferenceType[Union[ndarray[Any, Any], ExtensionArray]]]"; + # expected "Optional[List[Optional[ReferenceType[Any]]]]" [arg-type] + return type(self)( + arrays, new_axes, refs, verify_integrity=False # type: ignore[arg-type] + ) def iget(self, i: int) -> SingleArrayManager: """ From e4a5f332ef505cfde59ef6908ba105b464c8eb6f Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Sun, 8 Aug 2021 11:10:18 +0200 Subject: [PATCH 15/21] fix dtype in test for windows --- pandas/tests/indexing/test_copy_on_write.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/indexing/test_copy_on_write.py b/pandas/tests/indexing/test_copy_on_write.py index 9f149b88b0707..9524353818143 100644 --- a/pandas/tests/indexing/test_copy_on_write.py +++ b/pandas/tests/indexing/test_copy_on_write.py @@ -292,7 +292,7 @@ def test_subset_set_column(using_array_manager): subset = df[1:3] if using_array_manager: - subset["a"] = np.array([10, 11]) + subset["a"] = np.array([10, 11], dtype="int64") else: with pd.option_context("chained_assignment", "warn"): with tm.assert_produces_warning(com.SettingWithCopyWarning): From 5efad508093c64252d9278fff075829b059ffed0 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Sun, 8 Aug 2021 14:21:52 +0200 Subject: [PATCH 16/21] fix dtype in test for windows --- pandas/tests/indexing/test_copy_on_write.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/indexing/test_copy_on_write.py b/pandas/tests/indexing/test_copy_on_write.py index 9524353818143..97ae0db23f519 100644 --- a/pandas/tests/indexing/test_copy_on_write.py +++ b/pandas/tests/indexing/test_copy_on_write.py @@ -296,7 +296,7 @@ def test_subset_set_column(using_array_manager): else: with pd.option_context("chained_assignment", "warn"): with tm.assert_produces_warning(com.SettingWithCopyWarning): - subset["a"] = np.array([10, 11]) + subset["a"] = np.array([10, 11], dtype="int64") expected = pd.DataFrame( {"a": [10, 11], "b": [5, 6], "c": [0.2, 0.3]}, index=range(1, 3) From 297662a378755431e5d0b0733ffc9f29dd5084e8 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Thu, 11 Nov 2021 20:56:37 +0100 Subject: [PATCH 17/21] [ArrayManager] Array version of putmask logic --- pandas/core/array_algos/putmask.py | 97 +++++++++++++++++++++++++- pandas/core/internals/array_manager.py | 67 ++++++++++++++++-- 2 files changed, 156 insertions(+), 8 deletions(-) diff --git a/pandas/core/array_algos/putmask.py b/pandas/core/array_algos/putmask.py index 77e38e6c6e3fc..6dce202e5ca7f 100644 --- a/pandas/core/array_algos/putmask.py +++ b/pandas/core/array_algos/putmask.py @@ -15,6 +15,7 @@ ) from pandas.core.dtypes.cast import ( + can_hold_element, convert_scalar_for_putitemlike, find_common_type, infer_dtype_from, @@ -22,11 +23,22 @@ from pandas.core.dtypes.common import ( is_float_dtype, is_integer_dtype, + is_interval_dtype, is_list_like, ) -from pandas.core.dtypes.missing import isna_compat +from pandas.core.dtypes.generic import ( + ABCDataFrame, + ABCIndex, + ABCSeries, +) +from pandas.core.dtypes.missing import ( + is_valid_na_for_dtype, + isna_compat, + na_value_for_dtype, +) from pandas.core.arrays import ExtensionArray +from pandas.core.arrays._mixins import NDArrayBackedExtensionArray def putmask_inplace(values: ArrayLike, mask: npt.NDArray[np.bool_], value: Any) -> None: @@ -122,7 +134,7 @@ def putmask_smart(values: np.ndarray, mask: npt.NDArray[np.bool_], new) -> np.nd nv[mask] = nn_at return nv - new = np.asarray(new) + # new = np.asarray(new) if values.dtype.kind == new.dtype.kind: # preserves dtype if possible @@ -225,3 +237,84 @@ def setitem_datetimelike_compat(values: np.ndarray, num_set: int, other): other = list(other) return other + + +def putmask_flexible_ndarray(array: np.ndarray, mask, new): + """ + Putmask implementation for ArrayManager putmask for ndarray. + + Flexible version that will upcast if needed. + """ + mask, noop = validate_putmask(array, mask) + assert not isinstance(new, (ABCIndex, ABCSeries, ABCDataFrame)) + + # if we are passed a scalar None, convert it here + if not array.dtype == "object" and is_valid_na_for_dtype(new, array.dtype): + new = na_value_for_dtype(array.dtype, compat=False) + + if can_hold_element(array, new): + # error: Argument 1 to "putmask_without_repeat" has incompatible type + # "Union[ndarray, ExtensionArray]"; expected "ndarray" + putmask_without_repeat(array, mask, new) # type: ignore[arg-type] + return array + + elif noop: + return array + + dtype, _ = infer_dtype_from(new) + if dtype.kind in ["m", "M"]: + array = array.astype(object) + # convert to list to avoid numpy coercing datetimelikes to integers + new = setitem_datetimelike_compat( + array, mask.sum(), new # type: ignore[arg-type] + ) + # putmask_smart below converts it back to array + np.putmask(array, mask, new) + return array + + new_values = putmask_smart(array, mask, new) + return new_values + + +def _coerce_to_target_dtype(array, new): + dtype, _ = infer_dtype_from(new, pandas_dtype=True) + new_dtype = find_common_type([array.dtype, dtype]) + return array.astype(new_dtype, copy=False) + + +def putmask_flexible_ea(array: ExtensionArray, mask, new): + """ + Putmask implementation for ArrayManager putmask for EA. + + Flexible version that will upcast if needed. + """ + mask = extract_bool_array(mask) + + if isinstance(array, NDArrayBackedExtensionArray): + + if not can_hold_element(array, new): + array = _coerce_to_target_dtype(array, new) + return putmask_flexible_ndarray(array, mask, new) + + array.putmask(mask, new) + return array + + if isinstance(new, (np.ndarray, ExtensionArray)) and len(new) == len(mask): + new = new[mask] + + try: + array[mask] = new + except TypeError: + if not is_interval_dtype(array.dtype): + # Discussion about what we want to support in the general + # case GH#39584 + raise + + array = _coerce_to_target_dtype(array, new) + if array.dtype == np.dtype("object"): + # For now at least, only support casting e.g. + # Interval[int64]->Interval[float64], + raise + return putmask_flexible_ea(array, mask, new) + + return array diff --git a/pandas/core/internals/array_manager.py b/pandas/core/internals/array_manager.py index 6362ffa2851ff..f09fa76fddfd9 100644 --- a/pandas/core/internals/array_manager.py +++ b/pandas/core/internals/array_manager.py @@ -55,6 +55,10 @@ ) import pandas.core.algorithms as algos +from pandas.core.array_algos.putmask import ( + putmask_flexible_ea, + putmask_flexible_ndarray, +) from pandas.core.array_algos.quantile import quantile_compat from pandas.core.array_algos.take import take_1d from pandas.core.arrays import ( @@ -378,13 +382,64 @@ def putmask(self, mask, new, align: bool = True): else: align_keys = ["mask"] new = extract_array(new, extract_numpy=True) + # if self._has_no_reference(loc): + # breakpoint() + + mask = np.asarray(mask) + for i in range(len(self.arrays)): + if not self._has_no_reference(i): + # if being referenced -> perform Copy-on-Write and clear the reference + self.arrays[i] = self.arrays[i].copy() + self._clear_reference(i) + self.arrays[i][mask[:, i]] = new - return self.apply_with_block( - "putmask", - align_keys=align_keys, - mask=mask, - new=new, - ) + return self + + # if align: + # align_keys = ["new", "mask"] + # else: + # align_keys = ["mask"] + # new = extract_array(new, extract_numpy=True) + + + def putmask(self, mask, new, align: bool = True): + if align: + align_keys = ["new", "mask"] + else: + align_keys = ["mask"] + new = extract_array(new, extract_numpy=True) + + kwargs = {"mask": mask, "new": new} + aligned_kwargs = {k: kwargs[k] for k in align_keys} + + for i in range(len(self.arrays)): + + if not self._has_no_reference(i): + # if being referenced -> perform Copy-on-Write and clear the reference + self.arrays[i] = self.arrays[i].copy() + self._clear_reference(i) + + for k, obj in aligned_kwargs.items(): + if isinstance(obj, (ABCSeries, ABCDataFrame)): + # The caller is responsible for ensuring that + # obj.axes[-1].equals(self.items) + if obj.ndim == 1: + kwargs[k] = obj._values + else: + kwargs[k] = obj.iloc[:, i]._values + else: + # otherwise we have an ndarray + if self.ndim == 2: + kwargs[k] = obj[i] + + arr = self.arrays[i] + if isinstance(arr, np.ndarray): + new = putmask_flexible_ndarray(arr, **kwargs) + else: + new = putmask_flexible_ea(arr, **kwargs) + self.arrays[i] = new + + return self def diff(self: T, n: int, axis: int) -> T: if axis == 1: From a011a0685f4d42611f38eb7e5eafaf320bc6c79c Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Thu, 11 Nov 2021 21:01:52 +0100 Subject: [PATCH 18/21] fixup copy-on-write in putmask --- pandas/core/indexing.py | 3 +- pandas/core/internals/array_manager.py | 29 +--------- pandas/tests/indexing/test_copy_on_write.py | 61 +++++++++++++++++++++ 3 files changed, 64 insertions(+), 29 deletions(-) diff --git a/pandas/core/indexing.py b/pandas/core/indexing.py index 35be629f1f22c..4a6c21a9aaced 100644 --- a/pandas/core/indexing.py +++ b/pandas/core/indexing.py @@ -1840,7 +1840,8 @@ def _setitem_single_column(self, loc: int, value, plane_indexer): pi = plane_indexer if not hasattr(self.obj._mgr, "blocks"): - # ArrayManager + # ArrayManager: in this case we cannot rely on getting the column + # as a Series to mutate, but need to operated on the mgr directly if com.is_null_slice(pi) or com.is_full_slice(pi, len(self.obj)): arr = self.obj._sanitize_column(value) self.obj._mgr.iset(loc, arr) diff --git a/pandas/core/internals/array_manager.py b/pandas/core/internals/array_manager.py index f09fa76fddfd9..1e4f025b112a4 100644 --- a/pandas/core/internals/array_manager.py +++ b/pandas/core/internals/array_manager.py @@ -375,33 +375,6 @@ def where(self: T, other, cond, align: bool) -> T: # def setitem(self, indexer, value) -> ArrayManager: # return self.apply_with_block("setitem", indexer=indexer, value=value) - def putmask(self, mask, new, align: bool = True): - # TODO(CoW) check references to copy if needed - if align: - align_keys = ["new", "mask"] - else: - align_keys = ["mask"] - new = extract_array(new, extract_numpy=True) - # if self._has_no_reference(loc): - # breakpoint() - - mask = np.asarray(mask) - for i in range(len(self.arrays)): - if not self._has_no_reference(i): - # if being referenced -> perform Copy-on-Write and clear the reference - self.arrays[i] = self.arrays[i].copy() - self._clear_reference(i) - self.arrays[i][mask[:, i]] = new - - return self - - # if align: - # align_keys = ["new", "mask"] - # else: - # align_keys = ["mask"] - # new = extract_array(new, extract_numpy=True) - - def putmask(self, mask, new, align: bool = True): if align: align_keys = ["new", "mask"] @@ -413,7 +386,7 @@ def putmask(self, mask, new, align: bool = True): aligned_kwargs = {k: kwargs[k] for k in align_keys} for i in range(len(self.arrays)): - + if not self._has_no_reference(i): # if being referenced -> perform Copy-on-Write and clear the reference self.arrays[i] = self.arrays[i].copy() diff --git a/pandas/tests/indexing/test_copy_on_write.py b/pandas/tests/indexing/test_copy_on_write.py index 97ae0db23f519..53b05f4da0e7a 100644 --- a/pandas/tests/indexing/test_copy_on_write.py +++ b/pandas/tests/indexing/test_copy_on_write.py @@ -285,6 +285,35 @@ def test_subset_set_with_row_indexer(indexer_si, indexer, using_array_manager): tm.assert_frame_equal(df, df_orig) +def test_subset_set_with_mask(using_array_manager): + # Case: setting values with a mask on a viewing subset: subset[mask] = value + df = pd.DataFrame({"a": [1, 2, 3, 4], "b": [4, 5, 6, 7], "c": [0.1, 0.2, 0.3, 0.4]}) + df_orig = df.copy() + subset = df[1:4] + + mask = subset > 3 + + if using_array_manager: + subset[mask] = 0 + else: + with pd.option_context("chained_assignment", "warn"): + with tm.assert_produces_warning(com.SettingWithCopyWarning): + subset[mask] = 0 + + expected = pd.DataFrame( + {"a": [2, 3, 0], "b": [0, 0, 0], "c": [0.20, 0.3, 0.4]}, index=range(1, 4) + ) + tm.assert_frame_equal(subset, expected) + if using_array_manager: + # original parent dataframe is not modified (CoW) + tm.assert_frame_equal(df, df_orig) + else: + # original parent dataframe is actually updated + df_orig.loc[3, "a"] = 0 + df_orig.loc[1:3, "b"] = 0 + tm.assert_frame_equal(df, df_orig) + + def test_subset_set_column(using_array_manager): # Case: setting a single column on a viewing subset -> subset[col] = value df = pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6], "c": [0.1, 0.2, 0.3]}) @@ -421,6 +450,38 @@ def test_series_getitem_slice(using_array_manager): assert s.iloc[0] == 0 +@pytest.mark.parametrize( + "indexer", + [slice(0, 2), np.array([True, True, False]), np.array([0, 1])], + ids=["slice", "mask", "array"], +) +def test_series_subset_set_with_indexer(indexer_si, indexer, using_array_manager): + # Case: setting values in a viewing Series with an indexer + s = pd.Series([1, 2, 3], index=["a", "b", "c"]) + s_orig = s.copy() + subset = s[:] + + # if ( + # indexer_si is tm.setitem + # and isinstance(indexer, np.ndarray) + # and indexer.dtype == "int" + # ): + # pytest.skip("setitem with labels selects on columns") + + expected = pd.Series([0, 0, 3], index=["a", "b", "c"]) + indexer_si(subset)[indexer] = 0 + tm.assert_series_equal(subset, expected) + + if using_array_manager: + tm.assert_series_equal(s, s_orig) + else: + tm.assert_series_equal(s, expected) + + expected = pd.DataFrame( + {"a": [0, 0, 4], "b": [0, 0, 7], "c": [0.0, 0.0, 0.4]}, index=range(1, 4) + ) + + # ----------------------------------------------------------------------------- # del operator From cc090019ebf5b65c25d548a2ef28a764e187c971 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Thu, 11 Nov 2021 21:08:17 +0100 Subject: [PATCH 19/21] Update BlockManager expected results after recent behaviour changes --- pandas/tests/indexing/test_copy_on_write.py | 20 +++----------------- 1 file changed, 3 insertions(+), 17 deletions(-) diff --git a/pandas/tests/indexing/test_copy_on_write.py b/pandas/tests/indexing/test_copy_on_write.py index 53b05f4da0e7a..ce5c6e007a3d5 100644 --- a/pandas/tests/indexing/test_copy_on_write.py +++ b/pandas/tests/indexing/test_copy_on_write.py @@ -331,11 +331,7 @@ def test_subset_set_column(using_array_manager): {"a": [10, 11], "b": [5, 6], "c": [0.2, 0.3]}, index=range(1, 3) ) tm.assert_frame_equal(subset, expected) - if using_array_manager: - tm.assert_frame_equal(df, df_orig) - else: - df_orig.loc[1:2, "a"] = np.array([10, 11]) - tm.assert_frame_equal(df, df_orig) + tm.assert_frame_equal(df, df_orig) def test_subset_set_columns_single_block(using_array_manager): @@ -354,11 +350,7 @@ def test_subset_set_columns_single_block(using_array_manager): expected = pd.DataFrame({"a": [0, 0], "b": [5, 6], "c": [0, 0]}, index=range(1, 3)) tm.assert_frame_equal(subset, expected) - if using_array_manager: - tm.assert_frame_equal(df, df_orig) - else: - df_orig.loc[1:2, ["a", "c"]] = 0 - tm.assert_frame_equal(df, df_orig) + tm.assert_frame_equal(df, df_orig) def test_subset_set_columns_mixed_block(using_array_manager): @@ -377,13 +369,7 @@ def test_subset_set_columns_mixed_block(using_array_manager): expected = pd.DataFrame({"a": [0, 0], "b": [5, 6], "c": [0, 0]}, index=range(1, 3)) tm.assert_frame_equal(subset, expected) - if using_array_manager: - tm.assert_frame_equal(df, df_orig) - else: - # In the mixed case with BlockManager, only one of the two columns is - # mutated in the parent frame .. - df_orig.loc[1:2, ["a"]] = 0 - tm.assert_frame_equal(df, df_orig) + tm.assert_frame_equal(df, df_orig) @pytest.mark.parametrize( From 37e7ce0b108a56047b937605750bb68861fd73f4 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Mon, 22 Nov 2021 11:29:47 +0100 Subject: [PATCH 20/21] limit changes to putmask for this PR --- pandas/core/array_algos/putmask.py | 134 ++----------------------- pandas/core/internals/array_manager.py | 35 ++----- 2 files changed, 16 insertions(+), 153 deletions(-) diff --git a/pandas/core/array_algos/putmask.py b/pandas/core/array_algos/putmask.py index 6dce202e5ca7f..1f37e0e5d249a 100644 --- a/pandas/core/array_algos/putmask.py +++ b/pandas/core/array_algos/putmask.py @@ -4,7 +4,6 @@ from __future__ import annotations from typing import Any -import warnings import numpy as np @@ -20,25 +19,9 @@ find_common_type, infer_dtype_from, ) -from pandas.core.dtypes.common import ( - is_float_dtype, - is_integer_dtype, - is_interval_dtype, - is_list_like, -) -from pandas.core.dtypes.generic import ( - ABCDataFrame, - ABCIndex, - ABCSeries, -) -from pandas.core.dtypes.missing import ( - is_valid_na_for_dtype, - isna_compat, - na_value_for_dtype, -) +from pandas.core.dtypes.common import is_list_like from pandas.core.arrays import ExtensionArray -from pandas.core.arrays._mixins import NDArrayBackedExtensionArray def putmask_inplace(values: ArrayLike, mask: npt.NDArray[np.bool_], value: Any) -> None: @@ -87,7 +70,7 @@ def putmask_smart(values: np.ndarray, mask: npt.NDArray[np.bool_], new) -> np.nd `values`, updated in-place. mask : np.ndarray[bool] Applies to both sides (array like). - new : `new values` either scalar or an array like aligned with `values` + new : listlike `new values` aligned with `values` Returns ------- @@ -101,9 +84,6 @@ def putmask_smart(values: np.ndarray, mask: npt.NDArray[np.bool_], new) -> np.nd # we cannot use np.asarray() here as we cannot have conversions # that numpy does when numeric are mixed with strings - if not is_list_like(new): - new = np.broadcast_to(new, mask.shape) - # see if we are only masking values that if putted # will work in the current dtype try: @@ -112,29 +92,14 @@ def putmask_smart(values: np.ndarray, mask: npt.NDArray[np.bool_], new) -> np.nd # TypeError: only integer scalar arrays can be converted to a scalar index pass else: - # make sure that we have a nullable type if we have nulls - if not isna_compat(values, nn[0]): - pass - elif not (is_float_dtype(nn.dtype) or is_integer_dtype(nn.dtype)): - # only compare integers/floats - pass - elif not (is_float_dtype(values.dtype) or is_integer_dtype(values.dtype)): - # only compare integers/floats - pass - else: - - # we ignore ComplexWarning here - with warnings.catch_warnings(record=True): - warnings.simplefilter("ignore", np.ComplexWarning) - nn_at = nn.astype(values.dtype) + # We only get to putmask_smart when we cannot hold 'new' in values. + # The "smart" part of putmask_smart is checking if we can hold new[mask] + # in values, in which case we can still avoid the need to cast. + if can_hold_element(values, nn): + values[mask] = nn + return values - comp = nn == nn_at - if is_list_like(comp) and comp.all(): - nv = values.copy() - nv[mask] = nn_at - return nv - - # new = np.asarray(new) + new = np.asarray(new) if values.dtype.kind == new.dtype.kind: # preserves dtype if possible @@ -237,84 +202,3 @@ def setitem_datetimelike_compat(values: np.ndarray, num_set: int, other): other = list(other) return other - - -def putmask_flexible_ndarray(array: np.ndarray, mask, new): - """ - Putmask implementation for ArrayManager putmask for ndarray. - - Flexible version that will upcast if needed. - """ - mask, noop = validate_putmask(array, mask) - assert not isinstance(new, (ABCIndex, ABCSeries, ABCDataFrame)) - - # if we are passed a scalar None, convert it here - if not array.dtype == "object" and is_valid_na_for_dtype(new, array.dtype): - new = na_value_for_dtype(array.dtype, compat=False) - - if can_hold_element(array, new): - # error: Argument 1 to "putmask_without_repeat" has incompatible type - # "Union[ndarray, ExtensionArray]"; expected "ndarray" - putmask_without_repeat(array, mask, new) # type: ignore[arg-type] - return array - - elif noop: - return array - - dtype, _ = infer_dtype_from(new) - if dtype.kind in ["m", "M"]: - array = array.astype(object) - # convert to list to avoid numpy coercing datetimelikes to integers - new = setitem_datetimelike_compat( - array, mask.sum(), new # type: ignore[arg-type] - ) - # putmask_smart below converts it back to array - np.putmask(array, mask, new) - return array - - new_values = putmask_smart(array, mask, new) - return new_values - - -def _coerce_to_target_dtype(array, new): - dtype, _ = infer_dtype_from(new, pandas_dtype=True) - new_dtype = find_common_type([array.dtype, dtype]) - return array.astype(new_dtype, copy=False) - - -def putmask_flexible_ea(array: ExtensionArray, mask, new): - """ - Putmask implementation for ArrayManager putmask for EA. - - Flexible version that will upcast if needed. - """ - mask = extract_bool_array(mask) - - if isinstance(array, NDArrayBackedExtensionArray): - - if not can_hold_element(array, new): - array = _coerce_to_target_dtype(array, new) - return putmask_flexible_ndarray(array, mask, new) - - array.putmask(mask, new) - return array - - if isinstance(new, (np.ndarray, ExtensionArray)) and len(new) == len(mask): - new = new[mask] - - try: - array[mask] = new - except TypeError: - if not is_interval_dtype(array.dtype): - # Discussion about what we want to support in the general - # case GH#39584 - raise - - array = _coerce_to_target_dtype(array, new) - if array.dtype == np.dtype("object"): - # For now at least, only support casting e.g. - # Interval[int64]->Interval[float64], - raise - return putmask_flexible_ea(array, mask, new) - - return array diff --git a/pandas/core/internals/array_manager.py b/pandas/core/internals/array_manager.py index 1e4f025b112a4..de03c04050de0 100644 --- a/pandas/core/internals/array_manager.py +++ b/pandas/core/internals/array_manager.py @@ -55,10 +55,6 @@ ) import pandas.core.algorithms as algos -from pandas.core.array_algos.putmask import ( - putmask_flexible_ea, - putmask_flexible_ndarray, -) from pandas.core.array_algos.quantile import quantile_compat from pandas.core.array_algos.take import take_1d from pandas.core.arrays import ( @@ -382,36 +378,19 @@ def putmask(self, mask, new, align: bool = True): align_keys = ["mask"] new = extract_array(new, extract_numpy=True) - kwargs = {"mask": mask, "new": new} - aligned_kwargs = {k: kwargs[k] for k in align_keys} - for i in range(len(self.arrays)): - if not self._has_no_reference(i): # if being referenced -> perform Copy-on-Write and clear the reference self.arrays[i] = self.arrays[i].copy() self._clear_reference(i) - for k, obj in aligned_kwargs.items(): - if isinstance(obj, (ABCSeries, ABCDataFrame)): - # The caller is responsible for ensuring that - # obj.axes[-1].equals(self.items) - if obj.ndim == 1: - kwargs[k] = obj._values - else: - kwargs[k] = obj.iloc[:, i]._values - else: - # otherwise we have an ndarray - if self.ndim == 2: - kwargs[k] = obj[i] - - arr = self.arrays[i] - if isinstance(arr, np.ndarray): - new = putmask_flexible_ndarray(arr, **kwargs) - else: - new = putmask_flexible_ea(arr, **kwargs) - self.arrays[i] = new - + new_mgr = self.apply_with_block( + "putmask", + align_keys=align_keys, + mask=mask, + new=new, + ) + self.arrays = new_mgr.arrays return self def diff(self: T, n: int, axis: int) -> T: From 0f28095bb6011fb3d9f7ce8232428b87fe08b832 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Fri, 26 Nov 2021 14:09:12 +0100 Subject: [PATCH 21/21] address feedback --- pandas/core/internals/array_manager.py | 33 +++++++------------ pandas/tests/frame/indexing/test_xs.py | 3 +- pandas/tests/frame/methods/test_cov_corr.py | 2 -- .../indexing/test_chaining_and_caching.py | 1 - pandas/tests/indexing/test_scalar.py | 10 +----- 5 files changed, 14 insertions(+), 35 deletions(-) diff --git a/pandas/core/internals/array_manager.py b/pandas/core/internals/array_manager.py index 52717e1fab39d..47abcf68243a2 100644 --- a/pandas/core/internals/array_manager.py +++ b/pandas/core/internals/array_manager.py @@ -171,7 +171,7 @@ def set_axis(self, axis: int, new_labels: Index) -> None: axis = self._normalize_axis(axis) self._axes[axis] = new_labels - def _has_no_reference(self, i: int): + def _has_no_reference(self, i: int) -> bool: """ Check for column `i` if has references. (whether it references another array or is itself being referenced) @@ -182,7 +182,7 @@ def _has_no_reference(self, i: int): self.arrays[i] ) == 0 - def _clear_reference(self, i: int): + def _clear_reference(self, i: int) -> None: """ Clear any reference for column `i`. """ @@ -539,8 +539,8 @@ def copy(self: T, deep=True) -> T: Parameters ---------- - deep : bool or string, default True - If False, return shallow copy (do not copy data) + deep : bool, string or None, default True + If False or None, return a shallow copy (do not copy data). If 'all', copy data and a deep copy of the index Returns @@ -567,14 +567,9 @@ def copy_func(ax): refs = None else: new_arrays = list(self.arrays) - refs = [weakref.ref(arr) for arr in self.arrays] + refs: list[weakref.ref | None] = [weakref.ref(arr) for arr in self.arrays] - # error: Argument 3 to "BaseArrayManager" has incompatible type - # "Optional[List[ReferenceType[Union[ndarray[Any, Any], ExtensionArray]]]]"; - # expected "Optional[List[Optional[ReferenceType[Any]]]]" [arg-type] - return type(self)( - new_arrays, new_axes, refs, verify_integrity=False # type: ignore[arg-type] - ) + return type(self)(new_arrays, new_axes, refs, verify_integrity=False) def reindex_indexer( self: T, @@ -619,8 +614,8 @@ def _reindex_indexer( axis : int fill_value : object, default None allow_dups : bool, default False - copy : bool, default True - + copy : bool or None, default True + If None, regard as False to get shallow copy. pandas-indexer with -1's only. """ @@ -792,7 +787,7 @@ def _verify_integrity(self) -> None: f"{arr.ndim} dimensions instead." ) if self.refs is not None: - if not len(self.refs) == n_columns: + if len(self.refs) != n_columns: raise ValueError( "Number of passed refs must equal the size of the column Index: " f"{len(self.refs)} refs vs {n_columns} columns." @@ -830,10 +825,11 @@ def fast_xs(self, loc: int) -> ArrayLike: def get_slice(self, slobj: slice, axis: int = 0) -> ArrayManager: axis = self._normalize_axis(axis) + refs: list[weakref.ref | None] if axis == 0: arrays = [arr[slobj] for arr in self.arrays] # slicing results in views -> track references to original arrays - # TODO possible to optimizate this with single ref to the full ArrayManager? + # TODO possible to optimize this with single ref to the full ArrayManager? refs = [weakref.ref(arr) for arr in self.arrays] elif axis == 1: arrays = self.arrays[slobj] @@ -843,12 +839,7 @@ def get_slice(self, slobj: slice, axis: int = 0) -> ArrayManager: new_axes = list(self._axes) new_axes[axis] = new_axes[axis]._getitem_slice(slobj) - # error: Argument 3 to "ArrayManager" has incompatible type - # "List[ReferenceType[Union[ndarray[Any, Any], ExtensionArray]]]"; - # expected "Optional[List[Optional[ReferenceType[Any]]]]" [arg-type] - return type(self)( - arrays, new_axes, refs, verify_integrity=False # type: ignore[arg-type] - ) + return type(self)(arrays, new_axes, refs, verify_integrity=False) def iget(self, i: int) -> SingleArrayManager: """ diff --git a/pandas/tests/frame/indexing/test_xs.py b/pandas/tests/frame/indexing/test_xs.py index 93b6a8c8b04ba..3a1f9d8a527d8 100644 --- a/pandas/tests/frame/indexing/test_xs.py +++ b/pandas/tests/frame/indexing/test_xs.py @@ -116,13 +116,12 @@ def test_xs_view(self, using_array_manager): dm = DataFrame(np.arange(20.0).reshape(4, 5), index=range(4), columns=range(5)) + dm.xs(2)[:] = 20 if using_array_manager: # INFO(ArrayManager) with ArrayManager getting a row as a view is # not possible - dm.xs(2)[:] = 20 assert not (dm.xs(2) == 20).any() else: - dm.xs(2)[:] = 20 assert (dm.xs(2) == 20).all() diff --git a/pandas/tests/frame/methods/test_cov_corr.py b/pandas/tests/frame/methods/test_cov_corr.py index 64d5307eeb6f2..b551cfc500e7a 100644 --- a/pandas/tests/frame/methods/test_cov_corr.py +++ b/pandas/tests/frame/methods/test_cov_corr.py @@ -27,9 +27,7 @@ def test_cov(self, float_frame, float_string_frame): # with NAs frame = float_frame.copy() - # frame["A"][:5] = np.nan frame.iloc[:5, frame.columns.get_loc("A")] = np.nan - # frame["B"][5:10] = np.nan frame.iloc[5:10, frame.columns.get_loc("B")] = np.nan result = frame.cov(min_periods=len(frame) - 8) expected = frame.cov() diff --git a/pandas/tests/indexing/test_chaining_and_caching.py b/pandas/tests/indexing/test_chaining_and_caching.py index 8eba27a907280..f3c06e04c81f6 100644 --- a/pandas/tests/indexing/test_chaining_and_caching.py +++ b/pandas/tests/indexing/test_chaining_and_caching.py @@ -419,7 +419,6 @@ def test_detect_chained_assignment_changing_dtype(self, using_array_manager): with pytest.raises(com.SettingWithCopyError, match=msg): df["C"][2] = "foo" else: - # breakpoint() # TODO(CoW) can we still warn here? # df.loc[2]["D"] = "foo" # df.loc[2]["C"] = "foo" diff --git a/pandas/tests/indexing/test_scalar.py b/pandas/tests/indexing/test_scalar.py index c4fc604bd8f3c..bcb76fb078e74 100644 --- a/pandas/tests/indexing/test_scalar.py +++ b/pandas/tests/indexing/test_scalar.py @@ -7,8 +7,6 @@ import numpy as np import pytest -import pandas.util._test_decorators as td - from pandas import ( DataFrame, Series, @@ -48,11 +46,7 @@ def _check(f, func, values=False): for f in [d["ints"], d["uints"], d["labels"], d["ts"], d["floats"]]: _check(f, "at") - # TODO(CoW) fix at/iat setting on a DataFrame - @pytest.mark.parametrize( - "kind", - ["series", pytest.param("frame", marks=td.skip_array_manager_invalid_test)], - ) + @pytest.mark.parametrize("kind", ["series", "frame"]) def test_at_and_iat_set(self, kind): def _check(f, func, values=False): @@ -220,8 +214,6 @@ def test_mixed_index_at_iat_loc_iloc_dataframe(self): with pytest.raises(KeyError, match="^3$"): df.loc[0, 3] - # TODO(CoW) fix at/iat setting on a DataFrame - @td.skip_array_manager_invalid_test def test_iat_setter_incompatible_assignment(self): # GH 23236 result = DataFrame({"a": [0, 1], "b": [4, 5]})