From 3dc662f06bb7f2711d6b57a22550840a6df2acc4 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Mon, 9 May 2022 23:41:02 +0200 Subject: [PATCH 01/13] API/TST: add tests for new copy/view behaviour --- pandas/tests/indexing/test_copy_on_write.py | 628 ++++++++++++++++++++ 1 file changed, 628 insertions(+) create mode 100644 pandas/tests/indexing/test_copy_on_write.py diff --git a/pandas/tests/indexing/test_copy_on_write.py b/pandas/tests/indexing/test_copy_on_write.py new file mode 100644 index 0000000000000..73fa48fcdcab3 --- /dev/null +++ b/pandas/tests/indexing/test_copy_on_write.py @@ -0,0 +1,628 @@ +import numpy as np +import pytest + +import pandas as pd +import pandas._testing as tm +import pandas.core.common as com + + +def test_copy(using_copy_on_write): + df = pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6], "c": [0.1, 0.2, 0.3]}) + df_copy = df.copy() + + # the deep copy doesn't share memory + assert not np.may_share_memory(df_copy["a"].values, df["a"].values) + if using_copy_on_write: + assert df_copy._mgr.refs is None + + # mutating copy doesn't mutate original + df_copy.iloc[0, 0] = 0 + assert df.iloc[0, 0] == 1 + + +def test_copy_shallow(using_copy_on_write): + df = pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6], "c": [0.1, 0.2, 0.3]}) + df_copy = df.copy(deep=False) + + # the shallow copy still shares memory + assert np.may_share_memory(df_copy["a"].values, df["a"].values) + if using_copy_on_write: + assert df_copy._mgr.refs is not None + + if using_copy_on_write: + # mutating shallow copy doesn't mutate original + df_copy.iloc[0, 0] = 0 + assert df.iloc[0, 0] == 1 + # mutating triggered a copy-on-write -> no longer shares memory + assert not np.may_share_memory(df_copy["a"].values, df["a"].values) + # but still shares memory for the other columns/blocks + assert np.may_share_memory(df_copy["c"].values, df["c"].values) + else: + # mutating shallow copy does mutate original + df_copy.iloc[0, 0] = 0 + assert df.iloc[0, 0] == 0 + # and still shares 
memory + assert np.may_share_memory(df_copy["a"].values, df["a"].values) + + +# ----------------------------------------------------------------------------- +# DataFrame methods returning new DataFrame using shallow copy + + +def test_reset_index(using_copy_on_write): + # Case: resetting the index (i.e. adding a new column) + mutating the + # resulting dataframe + df = pd.DataFrame( + {"a": [1, 2, 3], "b": [4, 5, 6], "c": [0.1, 0.2, 0.3]}, index=[10, 11, 12] + ) + df_orig = df.copy() + df2 = df.reset_index() + df2._mgr._verify_integrity() + + if using_copy_on_write: + # still shares memory (df2 is a shallow copy) + assert np.may_share_memory(df2["b"].values, df["b"].values) + assert np.may_share_memory(df2["c"].values, df["c"].values) + # mutating df2 triggers a copy-on-write for that column / block + df2.iloc[0, 2] = 0 + assert not np.may_share_memory(df2["b"].values, df["b"].values) + if using_copy_on_write: + assert np.may_share_memory(df2["c"].values, df["c"].values) + tm.assert_frame_equal(df, df_orig) + + +def test_rename_columns(using_copy_on_write): + # Case: renaming columns returns a new dataframe + # + afterwards modifying the result + df = pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6], "c": [0.1, 0.2, 0.3]}) + df_orig = df.copy() + df2 = df.rename(columns=str.upper) + + if using_copy_on_write: + assert np.may_share_memory(df2["A"].values, df["a"].values) + df2.iloc[0, 0] = 0 + assert not np.may_share_memory(df2["A"].values, df["a"].values) + if using_copy_on_write: + assert np.may_share_memory(df2["C"].values, df["c"].values) + expected = pd.DataFrame({"A": [0, 2, 3], "B": [4, 5, 6], "C": [0.1, 0.2, 0.3]}) + tm.assert_frame_equal(df2, expected) + tm.assert_frame_equal(df, df_orig) + + +def test_rename_columns_modify_parent(using_array_manager, using_copy_on_write): + # Case: renaming columns returns a new dataframe + # + afterwards modifying the original (parent) dataframe + df = pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6], "c": [0.1, 0.2, 0.3]}) + df2 
= df.rename(columns=str.upper) + df2_orig = df2.copy() + + if using_copy_on_write: + assert np.may_share_memory(df2["A"].values, df["a"].values) + else: + assert not np.may_share_memory(df2["A"].values, df["a"].values) + df.iloc[0, 0] = 0 + assert not np.may_share_memory(df2["A"].values, df["a"].values) + if using_array_manager: + assert np.may_share_memory(df2["C"].values, df["c"].values) + expected = pd.DataFrame({"a": [0, 2, 3], "b": [4, 5, 6], "c": [0.1, 0.2, 0.3]}) + tm.assert_frame_equal(df, expected) + tm.assert_frame_equal(df2, df2_orig) + + +def test_reindex_columns(using_copy_on_write): + # Case: reindexing the column returns a new dataframe + # + afterwards modifying the result + df = pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6], "c": [0.1, 0.2, 0.3]}) + df_orig = df.copy() + df2 = df.reindex(columns=["a", "c"]) + + if using_copy_on_write: + # still shares memory (df2 is a shallow copy) + assert np.may_share_memory(df2["a"].values, df["a"].values) + else: + assert not np.may_share_memory(df2["a"].values, df["a"].values) + # mutating df2 triggers a copy-on-write for that column + df2.iloc[0, 0] = 0 + assert not np.may_share_memory(df2["a"].values, df["a"].values) + if using_copy_on_write: + assert np.may_share_memory(df2["c"].values, df["c"].values) + tm.assert_frame_equal(df, df_orig) + + +# # ----------------------------------------------------------------------------- +# # Indexing operations taking subset + modifying the subset/parent + + +def test_subset_column_selection(using_copy_on_write): + # Case: taking a subset of the columns of a DataFrame + # + afterwards modifying the subset + df = pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6], "c": [0.1, 0.2, 0.3]}) + df_orig = df.copy() + + subset = df[["a", "c"]] + + if using_copy_on_write: + # the subset shares memory ... + assert np.may_share_memory(subset["a"].values, df["a"].values) + # ... 
but uses CoW when being modified + subset.iloc[0, 0] = 0 + else: + assert not np.may_share_memory(subset["a"].values, df["a"].values) + with pd.option_context("chained_assignment", "warn"): + with tm.assert_produces_warning(com.SettingWithCopyWarning): + subset.iloc[0, 0] = 0 + + assert not np.may_share_memory(subset["a"].values, df["a"].values) + + expected = pd.DataFrame({"a": [0, 2, 3], "c": [0.1, 0.2, 0.3]}) + tm.assert_frame_equal(subset, expected) + tm.assert_frame_equal(df, df_orig) + + +def test_subset_column_selection_modify_parent(using_copy_on_write): + # Case: taking a subset of the columns of a DataFrame + # + afterwards modifying the parent + df = pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6], "c": [0.1, 0.2, 0.3]}) + + subset = df[["a", "c"]] + if using_copy_on_write: + # the subset shares memory ... + assert np.may_share_memory(subset["a"].values, df["a"].values) + # ... but the parent uses CoW when it is modified + df.iloc[0, 0] = 0 + + assert not np.may_share_memory(subset["a"].values, df["a"].values) + if using_copy_on_write: + # different column/block still shares memory + assert np.may_share_memory(subset["c"].values, df["c"].values) + + expected = pd.DataFrame({"a": [1, 2, 3], "c": [0.1, 0.2, 0.3]}) + tm.assert_frame_equal(subset, expected) + + +def test_subset_row_slice(using_copy_on_write): + # Case: taking a subset of the rows of a DataFrame using a slice + # + afterwards modifying the subset + df = pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6], "c": [0.1, 0.2, 0.3]}) + df_orig = df.copy() + + subset = df[1:3] + subset._mgr._verify_integrity() + + assert np.may_share_memory(subset["a"].values, df["a"].values) + + if using_copy_on_write: + subset.iloc[0, 0] = 0 + assert not np.may_share_memory(subset["a"].values, df["a"].values) + + else: + # INFO this no longer raise warning since pandas 1.4 + # with pd.option_context("chained_assignment", "warn"): + # with tm.assert_produces_warning(com.SettingWithCopyWarning): + subset.iloc[0, 0] = 0 + + 
subset._mgr._verify_integrity() + + expected = pd.DataFrame( + {"a": [0, 3], "b": [5, 6], "c": [0.2, 0.3]}, index=range(1, 3) + ) + tm.assert_frame_equal(subset, expected) + if using_copy_on_write: + # original parent dataframe is not modified (CoW) + tm.assert_frame_equal(df, df_orig) + else: + # original parent dataframe is actually updated + df_orig.iloc[1, 0] = 0 + tm.assert_frame_equal(df, df_orig) + + +def test_subset_column_slice(using_copy_on_write): + # Case: taking a subset of the columns of a DataFrame using a slice + # + afterwards modifying the subset + df = pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6], "c": [0.1, 0.2, 0.3]}) + df_orig = df.copy() + + subset = df.iloc[:, 1:] + subset._mgr._verify_integrity() + + if using_copy_on_write: + assert np.may_share_memory(subset["b"].values, df["b"].values) + + subset.iloc[0, 0] = 0 + assert not np.may_share_memory(subset["b"].values, df["b"].values) + + else: + subset.iloc[0, 0] = 0 + + expected = pd.DataFrame({"b": [0, 5, 6], "c": [0.1, 0.2, 0.3]}) + tm.assert_frame_equal(subset, expected) + # original parent dataframe is not modified (also not for BlockManager case) + tm.assert_frame_equal(df, df_orig) + + +@pytest.mark.parametrize( + "indexer", + [slice(0, 2), np.array([True, True, False]), np.array([0, 1])], + ids=["slice", "mask", "array"], +) +def test_subset_set_with_row_indexer(indexer_si, indexer, using_copy_on_write): + # Case: setting values with a row indexer on a viewing subset + # subset[indexer] = value and subset.iloc[indexer] = value + df = pd.DataFrame({"a": [1, 2, 3, 4], "b": [4, 5, 6, 7], "c": [0.1, 0.2, 0.3, 0.4]}) + df_orig = df.copy() + subset = df[1:4] + + if ( + indexer_si is tm.setitem + and isinstance(indexer, np.ndarray) + and indexer.dtype == "int" + ): + pytest.skip("setitem with labels selects on columns") + + if using_copy_on_write: + indexer_si(subset)[indexer] = 0 + else: + with pd.option_context("chained_assignment", "warn"): + with 
tm.assert_produces_warning(com.SettingWithCopyWarning): + indexer_si(subset)[indexer] = 0 + + expected = pd.DataFrame( + {"a": [0, 0, 4], "b": [0, 0, 7], "c": [0.0, 0.0, 0.4]}, index=range(1, 4) + ) + tm.assert_frame_equal(subset, expected) + if using_copy_on_write: + # original parent dataframe is not modified (CoW) + tm.assert_frame_equal(df, df_orig) + else: + # original parent dataframe is actually updated + df_orig[1:3] = 0 + tm.assert_frame_equal(df, df_orig) + + +def test_subset_set_with_mask(using_copy_on_write): + # Case: setting values with a mask on a viewing subset: subset[mask] = value + df = pd.DataFrame({"a": [1, 2, 3, 4], "b": [4, 5, 6, 7], "c": [0.1, 0.2, 0.3, 0.4]}) + df_orig = df.copy() + subset = df[1:4] + + mask = subset > 3 + + if using_copy_on_write: + subset[mask] = 0 + else: + with pd.option_context("chained_assignment", "warn"): + with tm.assert_produces_warning(com.SettingWithCopyWarning): + subset[mask] = 0 + + expected = pd.DataFrame( + {"a": [2, 3, 0], "b": [0, 0, 0], "c": [0.20, 0.3, 0.4]}, index=range(1, 4) + ) + tm.assert_frame_equal(subset, expected) + if using_copy_on_write: + # original parent dataframe is not modified (CoW) + tm.assert_frame_equal(df, df_orig) + else: + # original parent dataframe is actually updated + df_orig.loc[3, "a"] = 0 + df_orig.loc[1:3, "b"] = 0 + tm.assert_frame_equal(df, df_orig) + + +def test_subset_set_column(using_copy_on_write): + # Case: setting a single column on a viewing subset -> subset[col] = value + df = pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6], "c": [0.1, 0.2, 0.3]}) + df_orig = df.copy() + subset = df[1:3] + + if using_copy_on_write: + subset["a"] = np.array([10, 11], dtype="int64") + else: + with pd.option_context("chained_assignment", "warn"): + with tm.assert_produces_warning(com.SettingWithCopyWarning): + subset["a"] = np.array([10, 11], dtype="int64") + + subset._mgr._verify_integrity() + expected = pd.DataFrame( + {"a": [10, 11], "b": [5, 6], "c": [0.2, 0.3]}, index=range(1, 3) + 
) + tm.assert_frame_equal(subset, expected) + tm.assert_frame_equal(df, df_orig) + + +@pytest.mark.parametrize( + "dtype", ["int64", "float64"], ids=["single-block", "mixed-block"] +) +def test_subset_set_column_with_loc(using_copy_on_write, dtype): + # Case: setting a single column with loc on a viewing subset + # -> subset.loc[:, col] = value + df = pd.DataFrame( + {"a": [1, 2, 3], "b": [4, 5, 6], "c": np.array([7, 8, 9], dtype=dtype)} + ) + df_orig = df.copy() + subset = df[1:3] + + if using_copy_on_write: + subset.loc[:, "a"] = np.array([10, 11], dtype="int64") + else: + with pd.option_context("chained_assignment", "warn"): + with tm.assert_produces_warning(com.SettingWithCopyWarning): + subset.loc[:, "a"] = np.array([10, 11], dtype="int64") + + subset._mgr._verify_integrity() + expected = pd.DataFrame( + {"a": [10, 11], "b": [5, 6], "c": np.array([8, 9], dtype=dtype)}, + index=range(1, 3), + ) + tm.assert_frame_equal(subset, expected) + tm.assert_frame_equal(df, df_orig) + + +def test_subset_set_column_with_loc2(using_copy_on_write): + # Case: setting a single column with loc on a viewing subset + # -> subset.loc[:, col] = value + # separate test for case of DataFrame of a single column -> takes a separate + # code path + df = pd.DataFrame({"a": [1, 2, 3]}) + df_orig = df.copy() + subset = df[1:3] + + if using_copy_on_write: + subset.loc[:, "a"] = 0 + else: + with pd.option_context("chained_assignment", "warn"): + with tm.assert_produces_warning(com.SettingWithCopyWarning): + subset.loc[:, "a"] = 0 + + subset._mgr._verify_integrity() + expected = pd.DataFrame({"a": [0, 0]}, index=range(1, 3)) + tm.assert_frame_equal(subset, expected) + tm.assert_frame_equal(df, df_orig) + + +@pytest.mark.parametrize( + "dtype", ["int64", "float64"], ids=["single-block", "mixed-block"] +) +def test_subset_set_columns(using_copy_on_write, dtype): + # Case: setting multiple columns on a viewing subset + # -> subset[[col1, col2]] = value + df = pd.DataFrame( + {"a": [1, 2, 3], 
"b": [4, 5, 6], "c": np.array([7, 8, 9], dtype=dtype)} + ) + df_orig = df.copy() + subset = df[1:3] + + if using_copy_on_write: + subset[["a", "c"]] = 0 + else: + with pd.option_context("chained_assignment", "warn"): + with tm.assert_produces_warning(com.SettingWithCopyWarning): + subset[["a", "c"]] = 0 + + subset._mgr._verify_integrity() + if using_copy_on_write: + # first and third column should certainly have no references anymore + assert all(subset._mgr._has_no_reference(i) for i in [0, 2]) + expected = pd.DataFrame({"a": [0, 0], "b": [5, 6], "c": [0, 0]}, index=range(1, 3)) + tm.assert_frame_equal(subset, expected) + tm.assert_frame_equal(df, df_orig) + + +@pytest.mark.parametrize( + "indexer", + [slice("a", "b"), np.array([True, True, False]), ["a", "b"]], + ids=["slice", "mask", "array"], +) +def test_subset_set_with_column_indexer(indexer, using_copy_on_write): + # Case: setting multiple columns with a column indexer on a viewing subset + # -> subset.loc[:, [col1, col2]] = value + df = pd.DataFrame({"a": [1, 2, 3], "b": [0.1, 0.2, 0.3], "c": [4, 5, 6]}) + df_orig = df.copy() + subset = df[1:3] + + if using_copy_on_write: + subset.loc[:, indexer] = 0 + else: + with pd.option_context("chained_assignment", "warn"): + with tm.assert_produces_warning(com.SettingWithCopyWarning): + subset.loc[:, indexer] = 0 + + subset._mgr._verify_integrity() + expected = pd.DataFrame( + {"a": [0, 0], "b": [0.0, 0.0], "c": [5, 6]}, index=range(1, 3) + ) + # TODO full row slice .loc[:, idx] update inplace instead of overwrite? + expected["b"] = expected["b"].astype("int64") + tm.assert_frame_equal(subset, expected) + if using_copy_on_write: + tm.assert_frame_equal(df, df_orig) + else: + # In the mixed case with BlockManager, only one of the two columns is + # mutated in the parent frame .. 
+ df_orig.loc[1:2, ["a"]] = 0 + tm.assert_frame_equal(df, df_orig) + + +# TODO add more tests modifying the parent + + +# ----------------------------------------------------------------------------- +# Series -- Indexing operations taking subset + modifying the subset/parent + + +def test_series_getitem_slice(using_copy_on_write): + # Case: taking a slice of a Series + afterwards modifying the subset + s = pd.Series([1, 2, 3], index=["a", "b", "c"]) + s_orig = s.copy() + + subset = s[:] + assert np.may_share_memory(subset.values, s.values) + + subset.iloc[0] = 0 + + if using_copy_on_write: + assert not np.may_share_memory(subset.values, s.values) + + expected = pd.Series([0, 2, 3], index=["a", "b", "c"]) + tm.assert_series_equal(subset, expected) + + if using_copy_on_write: + # original parent series is not modified (CoW) + tm.assert_series_equal(s, s_orig) + else: + # original parent series is actually updated + assert s.iloc[0] == 0 + + +@pytest.mark.parametrize( + "indexer", + [slice(0, 2), np.array([True, True, False]), np.array([0, 1])], + ids=["slice", "mask", "array"], +) +def test_series_subset_set_with_indexer(indexer_si, indexer, using_copy_on_write): + # Case: setting values in a viewing Series with an indexer + s = pd.Series([1, 2, 3], index=["a", "b", "c"]) + s_orig = s.copy() + subset = s[:] + + indexer_si(subset)[indexer] = 0 + expected = pd.Series([0, 0, 3], index=["a", "b", "c"]) + tm.assert_series_equal(subset, expected) + + if using_copy_on_write: + tm.assert_series_equal(s, s_orig) + else: + tm.assert_series_equal(s, expected) + + +# ----------------------------------------------------------------------------- +# del operator + + +def test_del_frame(using_copy_on_write): + # Case: deleting a column with `del` on a viewing child dataframe should + # not modify parent + update the references + df = pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6], "c": [0.1, 0.2, 0.3]}) + df_orig = df.copy() + df2 = df[:] + + assert 
np.may_share_memory(df["a"].values, df2["a"].values) + + del df2["b"] + + assert np.may_share_memory(df["a"].values, df2["a"].values) + tm.assert_frame_equal(df, df_orig) + tm.assert_frame_equal(df2, df_orig[["a", "c"]]) + df2._mgr._verify_integrity() + + # TODO in theory modifying column "b" of the parent wouldn't need a CoW + # but the weakref is still alive and so we still perform CoW + + if using_copy_on_write: + # modifying child after deleting a column still doesn't update parent + df2.loc[0, "a"] = 100 + tm.assert_frame_equal(df, df_orig) + + +def test_del_series(): + s = pd.Series([1, 2, 3], index=["a", "b", "c"]) + s_orig = s.copy() + s2 = s[:] + + assert np.may_share_memory(s.values, s2.values) + + del s2["a"] + + assert not np.may_share_memory(s.values, s2.values) + tm.assert_series_equal(s, s_orig) + tm.assert_series_equal(s2, s_orig[["b", "c"]]) + + # modifying s2 doesn't need copy on write (due to `del`, s2 is backed by new array) + values = s2.values + s2.loc["b"] = 100 + assert values[0] == 100 + + +# ----------------------------------------------------------------------------- +# Accessing column as Series + + +def test_column_as_series(using_copy_on_write): + # Case: selecting a single column now also uses Copy-on-Write + df = pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6], "c": [0.1, 0.2, 0.3]}) + df_orig = df.copy() + + s = df["a"] + + assert np.may_share_memory(s.values, df["a"].values) + + if using_copy_on_write: + s[0] = 0 + else: + with pd.option_context("chained_assignment", "warn"): + with tm.assert_produces_warning(com.SettingWithCopyWarning): + s[0] = 0 + + expected = pd.Series([0, 2, 3], name="a") + tm.assert_series_equal(s, expected) + if using_copy_on_write: + # assert not np.may_share_memory(s.values, df["a"].values) + tm.assert_frame_equal(df, df_orig) + # ensure cached series on getitem is not the changed series + tm.assert_series_equal(df["a"], df_orig["a"]) + else: + df_orig.iloc[0, 0] = 0 + tm.assert_frame_equal(df, df_orig) + + 
+def test_column_as_series_set_with_upcast(using_copy_on_write): + # Case: selecting a single column now also uses Copy-on-Write -> when + # setting a value causes an upcast, we don't need to update the parent + # DataFrame through the cache mechanism + df = pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6], "c": [0.1, 0.2, 0.3]}) + df_orig = df.copy() + + s = df["a"] + if using_copy_on_write: + s[0] = "foo" + else: + with pd.option_context("chained_assignment", "warn"): + with tm.assert_produces_warning(com.SettingWithCopyWarning): + s[0] = "foo" + + expected = pd.Series(["foo", 2, 3], dtype=object, name="a") + tm.assert_series_equal(s, expected) + if using_copy_on_write: + tm.assert_frame_equal(df, df_orig) + # ensure cached series on getitem is not the changed series + tm.assert_series_equal(df["a"], df_orig["a"]) + else: + df_orig["a"] = expected + tm.assert_frame_equal(df, df_orig) + + +# TODO add tests for other indexing methods on the Series + + +def test_dataframe_add_column_from_series(): + # Case: adding a new column to a DataFrame from an existing column/series + # -> always already takes a copy on assignment + # (no change in behaviour here) + # TODO can we achieve the same behaviour with Copy-on-Write? 
+ df = pd.DataFrame({"a": [1, 2, 3], "b": [0.1, 0.2, 0.3]}) + + s = pd.Series([10, 11, 12]) + df["new"] = s + assert not np.may_share_memory(df["new"].values, s.values) + + # editing series -> doesn't modify column in frame + s[0] = 0 + expected = pd.DataFrame({"a": [1, 2, 3], "b": [0.1, 0.2, 0.3], "new": [10, 11, 12]}) + tm.assert_frame_equal(df, expected) + + # editing column in frame -> doesn't modify series + df.loc[2, "new"] = 100 + expected = pd.Series([0, 11, 12]) + tm.assert_series_equal(s, expected) + + +# TODO add tests for constructors From 70143c4f421f69ed1963661096638597a5f8042f Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Mon, 9 May 2022 23:54:44 +0200 Subject: [PATCH 02/13] fixes for non-CoW cases --- pandas/conftest.py | 8 ++++++ pandas/tests/indexing/test_copy_on_write.py | 31 ++++++++++++++++----- 2 files changed, 32 insertions(+), 7 deletions(-) diff --git a/pandas/conftest.py b/pandas/conftest.py index b84d6fc9c2b99..9865aa1704fd6 100644 --- a/pandas/conftest.py +++ b/pandas/conftest.py @@ -1798,3 +1798,11 @@ def using_array_manager(): Fixture to check if the array manager is being used. """ return pd.options.mode.data_manager == "array" + + +@pytest.fixture +def using_copy_on_write(): + """ + Fixture to check if Copy-on-Write is enabled. 
+ """ + return False diff --git a/pandas/tests/indexing/test_copy_on_write.py b/pandas/tests/indexing/test_copy_on_write.py index 73fa48fcdcab3..70fdc12f53c0d 100644 --- a/pandas/tests/indexing/test_copy_on_write.py +++ b/pandas/tests/indexing/test_copy_on_write.py @@ -148,9 +148,10 @@ def test_subset_column_selection(using_copy_on_write): subset.iloc[0, 0] = 0 else: assert not np.may_share_memory(subset["a"].values, df["a"].values) - with pd.option_context("chained_assignment", "warn"): - with tm.assert_produces_warning(com.SettingWithCopyWarning): - subset.iloc[0, 0] = 0 + # INFO this no longer raise warning since pandas 1.4 + # with pd.option_context("chained_assignment", "warn"): + # with tm.assert_produces_warning(com.SettingWithCopyWarning): + subset.iloc[0, 0] = 0 assert not np.may_share_memory(subset["a"].values, df["a"].values) @@ -262,8 +263,10 @@ def test_subset_set_with_row_indexer(indexer_si, indexer, using_copy_on_write): if using_copy_on_write: indexer_si(subset)[indexer] = 0 else: + # INFO iloc no longer raises warning since pandas 1.4 + warn = com.SettingWithCopyWarning if indexer_si is tm.setitem else None with pd.option_context("chained_assignment", "warn"): - with tm.assert_produces_warning(com.SettingWithCopyWarning): + with tm.assert_produces_warning(warn): indexer_si(subset)[indexer] = 0 expected = pd.DataFrame( @@ -354,7 +357,13 @@ def test_subset_set_column_with_loc(using_copy_on_write, dtype): index=range(1, 3), ) tm.assert_frame_equal(subset, expected) - tm.assert_frame_equal(df, df_orig) + if using_copy_on_write: + # original parent dataframe is not modified (CoW) + tm.assert_frame_equal(df, df_orig) + else: + # original parent dataframe is actually updated + df_orig.loc[1:3, "a"] = np.array([10, 11], dtype="int64") + tm.assert_frame_equal(df, df_orig) def test_subset_set_column_with_loc2(using_copy_on_write): @@ -376,7 +385,13 @@ def test_subset_set_column_with_loc2(using_copy_on_write): subset._mgr._verify_integrity() expected = 
pd.DataFrame({"a": [0, 0]}, index=range(1, 3)) tm.assert_frame_equal(subset, expected) - tm.assert_frame_equal(df, df_orig) + if using_copy_on_write: + # original parent dataframe is not modified (CoW) + tm.assert_frame_equal(df, df_orig) + else: + # original parent dataframe is actually updated + df_orig.loc[1:3, "a"] = 0 + tm.assert_frame_equal(df, df_orig) @pytest.mark.parametrize( @@ -517,10 +532,12 @@ def test_del_frame(using_copy_on_write): # TODO in theory modifying column "b" of the parent wouldn't need a CoW # but the weakref is still alive and so we still perform CoW + df2.loc[0, "a"] = 100 if using_copy_on_write: # modifying child after deleting a column still doesn't update parent - df2.loc[0, "a"] = 100 tm.assert_frame_equal(df, df_orig) + else: + assert df.loc[0, "a"] == 100 def test_del_series(): From e25e2db8856f50f431a2a15cc082930208a639a8 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Wed, 11 May 2022 12:26:40 +0200 Subject: [PATCH 03/13] move test file to tests/copy_view --- pandas/tests/{indexing => copy_view}/test_copy_on_write.py | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename pandas/tests/{indexing => copy_view}/test_copy_on_write.py (100%) diff --git a/pandas/tests/indexing/test_copy_on_write.py b/pandas/tests/copy_view/test_copy_on_write.py similarity index 100% rename from pandas/tests/indexing/test_copy_on_write.py rename to pandas/tests/copy_view/test_copy_on_write.py From 0ef1f9a939665dbbb387d0c4ddaf8db4fb6c12ee Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Wed, 11 May 2022 12:29:45 +0200 Subject: [PATCH 04/13] split into multiple files --- ...test_copy_on_write.py => test_indexing.py} | 126 +---------------- pandas/tests/copy_view/test_methods.py | 127 ++++++++++++++++++ 2 files changed, 128 insertions(+), 125 deletions(-) rename pandas/tests/copy_view/{test_copy_on_write.py => test_indexing.py} (78%) create mode 100644 pandas/tests/copy_view/test_methods.py diff --git 
a/pandas/tests/copy_view/test_copy_on_write.py b/pandas/tests/copy_view/test_indexing.py similarity index 78% rename from pandas/tests/copy_view/test_copy_on_write.py rename to pandas/tests/copy_view/test_indexing.py index 70fdc12f53c0d..fcc3235dd16b7 100644 --- a/pandas/tests/copy_view/test_copy_on_write.py +++ b/pandas/tests/copy_view/test_indexing.py @@ -5,132 +5,8 @@ import pandas._testing as tm import pandas.core.common as com - -def test_copy(using_copy_on_write): - df = pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6], "c": [0.1, 0.2, 0.3]}) - df_copy = df.copy() - - # the deep copy doesn't share memory - assert not np.may_share_memory(df_copy["a"].values, df["a"].values) - if using_copy_on_write: - assert df_copy._mgr.refs is None - - # mutating copy doesn't mutate original - df_copy.iloc[0, 0] = 0 - assert df.iloc[0, 0] == 1 - - -def test_copy_shallow(using_copy_on_write): - df = pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6], "c": [0.1, 0.2, 0.3]}) - df_copy = df.copy(deep=False) - - # the shallow copy still shares memory - assert np.may_share_memory(df_copy["a"].values, df["a"].values) - if using_copy_on_write: - assert df_copy._mgr.refs is not None - - if using_copy_on_write: - # mutating shallow copy doesn't mutate original - df_copy.iloc[0, 0] = 0 - assert df.iloc[0, 0] == 1 - # mutating triggered a copy-on-write -> no longer shares memory - assert not np.may_share_memory(df_copy["a"].values, df["a"].values) - # but still shares memory for the other columns/blocks - assert np.may_share_memory(df_copy["c"].values, df["c"].values) - else: - # mutating shallow copy does mutate original - df_copy.iloc[0, 0] = 0 - assert df.iloc[0, 0] == 0 - # and still shares memory - assert np.may_share_memory(df_copy["a"].values, df["a"].values) - - # ----------------------------------------------------------------------------- -# DataFrame methods returning new DataFrame using shallow copy - - -def test_reset_index(using_copy_on_write): - # Case: resetting the index (i.e. 
adding a new column) + mutating the - # resulting dataframe - df = pd.DataFrame( - {"a": [1, 2, 3], "b": [4, 5, 6], "c": [0.1, 0.2, 0.3]}, index=[10, 11, 12] - ) - df_orig = df.copy() - df2 = df.reset_index() - df2._mgr._verify_integrity() - - if using_copy_on_write: - # still shares memory (df2 is a shallow copy) - assert np.may_share_memory(df2["b"].values, df["b"].values) - assert np.may_share_memory(df2["c"].values, df["c"].values) - # mutating df2 triggers a copy-on-write for that column / block - df2.iloc[0, 2] = 0 - assert not np.may_share_memory(df2["b"].values, df["b"].values) - if using_copy_on_write: - assert np.may_share_memory(df2["c"].values, df["c"].values) - tm.assert_frame_equal(df, df_orig) - - -def test_rename_columns(using_copy_on_write): - # Case: renaming columns returns a new dataframe - # + afterwards modifying the result - df = pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6], "c": [0.1, 0.2, 0.3]}) - df_orig = df.copy() - df2 = df.rename(columns=str.upper) - - if using_copy_on_write: - assert np.may_share_memory(df2["A"].values, df["a"].values) - df2.iloc[0, 0] = 0 - assert not np.may_share_memory(df2["A"].values, df["a"].values) - if using_copy_on_write: - assert np.may_share_memory(df2["C"].values, df["c"].values) - expected = pd.DataFrame({"A": [0, 2, 3], "B": [4, 5, 6], "C": [0.1, 0.2, 0.3]}) - tm.assert_frame_equal(df2, expected) - tm.assert_frame_equal(df, df_orig) - - -def test_rename_columns_modify_parent(using_array_manager, using_copy_on_write): - # Case: renaming columns returns a new dataframe - # + afterwards modifying the original (parent) dataframe - df = pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6], "c": [0.1, 0.2, 0.3]}) - df2 = df.rename(columns=str.upper) - df2_orig = df2.copy() - - if using_copy_on_write: - assert np.may_share_memory(df2["A"].values, df["a"].values) - else: - assert not np.may_share_memory(df2["A"].values, df["a"].values) - df.iloc[0, 0] = 0 - assert not np.may_share_memory(df2["A"].values, df["a"].values) 
- if using_array_manager: - assert np.may_share_memory(df2["C"].values, df["c"].values) - expected = pd.DataFrame({"a": [0, 2, 3], "b": [4, 5, 6], "c": [0.1, 0.2, 0.3]}) - tm.assert_frame_equal(df, expected) - tm.assert_frame_equal(df2, df2_orig) - - -def test_reindex_columns(using_copy_on_write): - # Case: reindexing the column returns a new dataframe - # + afterwards modifying the result - df = pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6], "c": [0.1, 0.2, 0.3]}) - df_orig = df.copy() - df2 = df.reindex(columns=["a", "c"]) - - if using_copy_on_write: - # still shares memory (df2 is a shallow copy) - assert np.may_share_memory(df2["a"].values, df["a"].values) - else: - assert not np.may_share_memory(df2["a"].values, df["a"].values) - # mutating df2 triggers a copy-on-write for that column - df2.iloc[0, 0] = 0 - assert not np.may_share_memory(df2["a"].values, df["a"].values) - if using_copy_on_write: - assert np.may_share_memory(df2["c"].values, df["c"].values) - tm.assert_frame_equal(df, df_orig) - - -# # ----------------------------------------------------------------------------- -# # Indexing operations taking subset + modifying the subset/parent +# Indexing operations taking subset + modifying the subset/parent def test_subset_column_selection(using_copy_on_write): diff --git a/pandas/tests/copy_view/test_methods.py b/pandas/tests/copy_view/test_methods.py new file mode 100644 index 0000000000000..11a7ae6b56b56 --- /dev/null +++ b/pandas/tests/copy_view/test_methods.py @@ -0,0 +1,127 @@ +import numpy as np + +import pandas as pd +import pandas._testing as tm + + +def test_copy(using_copy_on_write): + df = pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6], "c": [0.1, 0.2, 0.3]}) + df_copy = df.copy() + + # the deep copy doesn't share memory + assert not np.may_share_memory(df_copy["a"].values, df["a"].values) + if using_copy_on_write: + assert df_copy._mgr.refs is None + + # mutating copy doesn't mutate original + df_copy.iloc[0, 0] = 0 + assert df.iloc[0, 0] == 1 + 
+ +def test_copy_shallow(using_copy_on_write): + df = pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6], "c": [0.1, 0.2, 0.3]}) + df_copy = df.copy(deep=False) + + # the shallow copy still shares memory + assert np.may_share_memory(df_copy["a"].values, df["a"].values) + if using_copy_on_write: + assert df_copy._mgr.refs is not None + + if using_copy_on_write: + # mutating shallow copy doesn't mutate original + df_copy.iloc[0, 0] = 0 + assert df.iloc[0, 0] == 1 + # mutating triggered a copy-on-write -> no longer shares memory + assert not np.may_share_memory(df_copy["a"].values, df["a"].values) + # but still shares memory for the other columns/blocks + assert np.may_share_memory(df_copy["c"].values, df["c"].values) + else: + # mutating shallow copy does mutate original + df_copy.iloc[0, 0] = 0 + assert df.iloc[0, 0] == 0 + # and still shares memory + assert np.may_share_memory(df_copy["a"].values, df["a"].values) + + +# ----------------------------------------------------------------------------- +# DataFrame methods returning new DataFrame using shallow copy + + +def test_reset_index(using_copy_on_write): + # Case: resetting the index (i.e. 
adding a new column) + mutating the + # resulting dataframe + df = pd.DataFrame( + {"a": [1, 2, 3], "b": [4, 5, 6], "c": [0.1, 0.2, 0.3]}, index=[10, 11, 12] + ) + df_orig = df.copy() + df2 = df.reset_index() + df2._mgr._verify_integrity() + + if using_copy_on_write: + # still shares memory (df2 is a shallow copy) + assert np.may_share_memory(df2["b"].values, df["b"].values) + assert np.may_share_memory(df2["c"].values, df["c"].values) + # mutating df2 triggers a copy-on-write for that column / block + df2.iloc[0, 2] = 0 + assert not np.may_share_memory(df2["b"].values, df["b"].values) + if using_copy_on_write: + assert np.may_share_memory(df2["c"].values, df["c"].values) + tm.assert_frame_equal(df, df_orig) + + +def test_rename_columns(using_copy_on_write): + # Case: renaming columns returns a new dataframe + # + afterwards modifying the result + df = pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6], "c": [0.1, 0.2, 0.3]}) + df_orig = df.copy() + df2 = df.rename(columns=str.upper) + + if using_copy_on_write: + assert np.may_share_memory(df2["A"].values, df["a"].values) + df2.iloc[0, 0] = 0 + assert not np.may_share_memory(df2["A"].values, df["a"].values) + if using_copy_on_write: + assert np.may_share_memory(df2["C"].values, df["c"].values) + expected = pd.DataFrame({"A": [0, 2, 3], "B": [4, 5, 6], "C": [0.1, 0.2, 0.3]}) + tm.assert_frame_equal(df2, expected) + tm.assert_frame_equal(df, df_orig) + + +def test_rename_columns_modify_parent(using_array_manager, using_copy_on_write): + # Case: renaming columns returns a new dataframe + # + afterwards modifying the original (parent) dataframe + df = pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6], "c": [0.1, 0.2, 0.3]}) + df2 = df.rename(columns=str.upper) + df2_orig = df2.copy() + + if using_copy_on_write: + assert np.may_share_memory(df2["A"].values, df["a"].values) + else: + assert not np.may_share_memory(df2["A"].values, df["a"].values) + df.iloc[0, 0] = 0 + assert not np.may_share_memory(df2["A"].values, df["a"].values) 
+ if using_array_manager: + assert np.may_share_memory(df2["C"].values, df["c"].values) + expected = pd.DataFrame({"a": [0, 2, 3], "b": [4, 5, 6], "c": [0.1, 0.2, 0.3]}) + tm.assert_frame_equal(df, expected) + tm.assert_frame_equal(df2, df2_orig) + + +def test_reindex_columns(using_copy_on_write): + # Case: reindexing the column returns a new dataframe + # + afterwards modifying the result + df = pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6], "c": [0.1, 0.2, 0.3]}) + df_orig = df.copy() + df2 = df.reindex(columns=["a", "c"]) + + if using_copy_on_write: + # still shares memory (df2 is a shallow copy) + assert np.may_share_memory(df2["a"].values, df["a"].values) + else: + assert not np.may_share_memory(df2["a"].values, df["a"].values) + # mutating df2 triggers a copy-on-write for that column + df2.iloc[0, 0] = 0 + assert not np.may_share_memory(df2["a"].values, df["a"].values) + if using_copy_on_write: + assert np.may_share_memory(df2["c"].values, df["c"].values) + tm.assert_frame_equal(df, df_orig) From 4a528f6db4cc3ef127bac45a7bc673f8e94e2336 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Wed, 11 May 2022 12:48:11 +0200 Subject: [PATCH 05/13] Fix tests for ArrayManager case --- pandas/tests/copy_view/test_indexing.py | 49 +++++++++++++++++-------- pandas/tests/copy_view/test_methods.py | 4 +- 2 files changed, 35 insertions(+), 18 deletions(-) diff --git a/pandas/tests/copy_view/test_indexing.py b/pandas/tests/copy_view/test_indexing.py index fcc3235dd16b7..ecfedeaac15f8 100644 --- a/pandas/tests/copy_view/test_indexing.py +++ b/pandas/tests/copy_view/test_indexing.py @@ -93,10 +93,16 @@ def test_subset_row_slice(using_copy_on_write): tm.assert_frame_equal(df, df_orig) -def test_subset_column_slice(using_copy_on_write): +@pytest.mark.parametrize( + "dtype", ["int64", "float64"], ids=["single-block", "mixed-block"] +) +def test_subset_column_slice(using_copy_on_write, using_array_manager, dtype): # Case: taking a subset of the columns of a DataFrame using 
a slice # + afterwards modifying the subset - df = pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6], "c": [0.1, 0.2, 0.3]}) + single_block = (dtype == "int64") and not using_array_manager + df = pd.DataFrame( + {"a": [1, 2, 3], "b": [4, 5, 6], "c": np.array([7, 8, 9], dtype=dtype)} + ) df_orig = df.copy() subset = df.iloc[:, 1:] @@ -109,12 +115,21 @@ def test_subset_column_slice(using_copy_on_write): assert not np.may_share_memory(subset["b"].values, df["b"].values) else: - subset.iloc[0, 0] = 0 + # we only get a warning in case of a single block + warn = com.SettingWithCopyWarning if single_block else None + with pd.option_context("chained_assignment", "warn"): + with tm.assert_produces_warning(warn): + subset.iloc[0, 0] = 0 - expected = pd.DataFrame({"b": [0, 5, 6], "c": [0.1, 0.2, 0.3]}) + expected = pd.DataFrame({"b": [0, 5, 6], "c": np.array([7, 8, 9], dtype=dtype)}) tm.assert_frame_equal(subset, expected) - # original parent dataframe is not modified (also not for BlockManager case) - tm.assert_frame_equal(df, df_orig) + # original parent dataframe is not modified (also not for BlockManager case, + # except for single block) + if not using_copy_on_write and (using_array_manager or single_block): + df_orig.iloc[0, 1] = 0 + tm.assert_frame_equal(df, df_orig) + else: + tm.assert_frame_equal(df, df_orig) @pytest.mark.parametrize( @@ -211,7 +226,7 @@ def test_subset_set_column(using_copy_on_write): @pytest.mark.parametrize( "dtype", ["int64", "float64"], ids=["single-block", "mixed-block"] ) -def test_subset_set_column_with_loc(using_copy_on_write, dtype): +def test_subset_set_column_with_loc(using_copy_on_write, using_array_manager, dtype): # Case: setting a single column with loc on a viewing subset # -> subset.loc[:, col] = value df = pd.DataFrame( @@ -233,7 +248,7 @@ def test_subset_set_column_with_loc(using_copy_on_write, dtype): index=range(1, 3), ) tm.assert_frame_equal(subset, expected) - if using_copy_on_write: + if using_copy_on_write or using_array_manager: 
# original parent dataframe is not modified (CoW) tm.assert_frame_equal(df, df_orig) else: @@ -242,7 +257,7 @@ def test_subset_set_column_with_loc(using_copy_on_write, dtype): tm.assert_frame_equal(df, df_orig) -def test_subset_set_column_with_loc2(using_copy_on_write): +def test_subset_set_column_with_loc2(using_copy_on_write, using_array_manager): # Case: setting a single column with loc on a viewing subset # -> subset.loc[:, col] = value # separate test for case of DataFrame of a single column -> takes a separate @@ -261,7 +276,7 @@ def test_subset_set_column_with_loc2(using_copy_on_write): subset._mgr._verify_integrity() expected = pd.DataFrame({"a": [0, 0]}, index=range(1, 3)) tm.assert_frame_equal(subset, expected) - if using_copy_on_write: + if using_copy_on_write or using_array_manager: # original parent dataframe is not modified (CoW) tm.assert_frame_equal(df, df_orig) else: @@ -303,7 +318,9 @@ def test_subset_set_columns(using_copy_on_write, dtype): [slice("a", "b"), np.array([True, True, False]), ["a", "b"]], ids=["slice", "mask", "array"], ) -def test_subset_set_with_column_indexer(indexer, using_copy_on_write): +def test_subset_set_with_column_indexer( + indexer, using_copy_on_write, using_array_manager +): # Case: setting multiple columns with a column indexer on a viewing subset # -> subset.loc[:, [col1, col2]] = value df = pd.DataFrame({"a": [1, 2, 3], "b": [0.1, 0.2, 0.3], "c": [4, 5, 6]}) @@ -324,7 +341,7 @@ def test_subset_set_with_column_indexer(indexer, using_copy_on_write): # TODO full row slice .loc[:, idx] update inplace instead of overwrite? 
expected["b"] = expected["b"].astype("int64") tm.assert_frame_equal(subset, expected) - if using_copy_on_write: + if using_copy_on_write or using_array_manager: tm.assert_frame_equal(df, df_orig) else: # In the mixed case with BlockManager, only one of the two columns is @@ -439,7 +456,7 @@ def test_del_series(): # Accessing column as Series -def test_column_as_series(using_copy_on_write): +def test_column_as_series(using_copy_on_write, using_array_manager): # Case: selecting a single column now also uses Copy-on-Write df = pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6], "c": [0.1, 0.2, 0.3]}) df_orig = df.copy() @@ -448,7 +465,7 @@ def test_column_as_series(using_copy_on_write): assert np.may_share_memory(s.values, df["a"].values) - if using_copy_on_write: + if using_copy_on_write or using_array_manager: s[0] = 0 else: with pd.option_context("chained_assignment", "warn"): @@ -467,7 +484,7 @@ def test_column_as_series(using_copy_on_write): tm.assert_frame_equal(df, df_orig) -def test_column_as_series_set_with_upcast(using_copy_on_write): +def test_column_as_series_set_with_upcast(using_copy_on_write, using_array_manager): # Case: selecting a single column now also uses Copy-on-Write -> when # setting a value causes an upcast, we don't need to update the parent # DataFrame through the cache mechanism @@ -475,7 +492,7 @@ def test_column_as_series_set_with_upcast(using_copy_on_write): df_orig = df.copy() s = df["a"] - if using_copy_on_write: + if using_copy_on_write or using_array_manager: s[0] = "foo" else: with pd.option_context("chained_assignment", "warn"): diff --git a/pandas/tests/copy_view/test_methods.py b/pandas/tests/copy_view/test_methods.py index 11a7ae6b56b56..a7840b485f3d0 100644 --- a/pandas/tests/copy_view/test_methods.py +++ b/pandas/tests/copy_view/test_methods.py @@ -87,7 +87,7 @@ def test_rename_columns(using_copy_on_write): tm.assert_frame_equal(df, df_orig) -def test_rename_columns_modify_parent(using_array_manager, using_copy_on_write): +def 
test_rename_columns_modify_parent(using_copy_on_write): # Case: renaming columns returns a new dataframe # + afterwards modifying the original (parent) dataframe df = pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6], "c": [0.1, 0.2, 0.3]}) @@ -100,7 +100,7 @@ def test_rename_columns_modify_parent(using_array_manager, using_copy_on_write): assert not np.may_share_memory(df2["A"].values, df["a"].values) df.iloc[0, 0] = 0 assert not np.may_share_memory(df2["A"].values, df["a"].values) - if using_array_manager: + if using_copy_on_write: assert np.may_share_memory(df2["C"].values, df["c"].values) expected = pd.DataFrame({"a": [0, 2, 3], "b": [4, 5, 6], "c": [0.1, 0.2, 0.3]}) tm.assert_frame_equal(df, expected) From 2e7080e063d5957a3c9992e00ec28b0de1ff402f Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Wed, 11 May 2022 17:20:11 +0200 Subject: [PATCH 06/13] fix type checking --- pandas/tests/copy_view/test_indexing.py | 4 ++-- pandas/tests/copy_view/test_methods.py | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/pandas/tests/copy_view/test_indexing.py b/pandas/tests/copy_view/test_indexing.py index ecfedeaac15f8..6dd2316ea17ca 100644 --- a/pandas/tests/copy_view/test_indexing.py +++ b/pandas/tests/copy_view/test_indexing.py @@ -531,8 +531,8 @@ def test_dataframe_add_column_from_series(): # editing column in frame -> doesn't modify series df.loc[2, "new"] = 100 - expected = pd.Series([0, 11, 12]) - tm.assert_series_equal(s, expected) + expected_s = pd.Series([0, 11, 12]) + tm.assert_series_equal(s, expected_s) # TODO add tests for constructors diff --git a/pandas/tests/copy_view/test_methods.py b/pandas/tests/copy_view/test_methods.py index a7840b485f3d0..35efcc465b6b9 100644 --- a/pandas/tests/copy_view/test_methods.py +++ b/pandas/tests/copy_view/test_methods.py @@ -11,7 +11,7 @@ def test_copy(using_copy_on_write): # the deep copy doesn't share memory assert not np.may_share_memory(df_copy["a"].values, df["a"].values) if 
using_copy_on_write: - assert df_copy._mgr.refs is None + assert df_copy._mgr.refs is None # type: ignore[union-attr] # mutating copy doesn't mutate original df_copy.iloc[0, 0] = 0 @@ -25,7 +25,7 @@ def test_copy_shallow(using_copy_on_write): # the shallow copy still shares memory assert np.may_share_memory(df_copy["a"].values, df["a"].values) if using_copy_on_write: - assert df_copy._mgr.refs is not None + assert df_copy._mgr.refs is not None # type: ignore[union-attr] if using_copy_on_write: # mutating shallow copy doesn't mutate original From 95cfdd54d1f928281b8493b5d5c1ebb84366ed1c Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Fri, 20 May 2022 08:34:57 +0200 Subject: [PATCH 07/13] may_share_memory -> shares_memory --- pandas/tests/copy_view/test_indexing.py | 38 +++++++++++------------ pandas/tests/copy_view/test_methods.py | 40 ++++++++++++------------- 2 files changed, 39 insertions(+), 39 deletions(-) diff --git a/pandas/tests/copy_view/test_indexing.py b/pandas/tests/copy_view/test_indexing.py index 6dd2316ea17ca..7261e88c7e48d 100644 --- a/pandas/tests/copy_view/test_indexing.py +++ b/pandas/tests/copy_view/test_indexing.py @@ -19,17 +19,17 @@ def test_subset_column_selection(using_copy_on_write): if using_copy_on_write: # the subset shares memory ... - assert np.may_share_memory(subset["a"].values, df["a"].values) + assert np.shares_memory(subset["a"].values, df["a"].values) # ... 
but uses CoW when being modified subset.iloc[0, 0] = 0 else: - assert not np.may_share_memory(subset["a"].values, df["a"].values) + assert not np.shares_memory(subset["a"].values, df["a"].values) # INFO this no longer raise warning since pandas 1.4 # with pd.option_context("chained_assignment", "warn"): # with tm.assert_produces_warning(com.SettingWithCopyWarning): subset.iloc[0, 0] = 0 - assert not np.may_share_memory(subset["a"].values, df["a"].values) + assert not np.shares_memory(subset["a"].values, df["a"].values) expected = pd.DataFrame({"a": [0, 2, 3], "c": [0.1, 0.2, 0.3]}) tm.assert_frame_equal(subset, expected) @@ -44,14 +44,14 @@ def test_subset_column_selection_modify_parent(using_copy_on_write): subset = df[["a", "c"]] if using_copy_on_write: # the subset shares memory ... - assert np.may_share_memory(subset["a"].values, df["a"].values) + assert np.shares_memory(subset["a"].values, df["a"].values) # ... but parent uses CoW parent when it is modified df.iloc[0, 0] = 0 - assert not np.may_share_memory(subset["a"].values, df["a"].values) + assert not np.shares_memory(subset["a"].values, df["a"].values) if using_copy_on_write: # different column/block still shares memory - assert np.may_share_memory(subset["c"].values, df["c"].values) + assert np.shares_memory(subset["c"].values, df["c"].values) expected = pd.DataFrame({"a": [1, 2, 3], "c": [0.1, 0.2, 0.3]}) tm.assert_frame_equal(subset, expected) @@ -66,11 +66,11 @@ def test_subset_row_slice(using_copy_on_write): subset = df[1:3] subset._mgr._verify_integrity() - assert np.may_share_memory(subset["a"].values, df["a"].values) + assert np.shares_memory(subset["a"].values, df["a"].values) if using_copy_on_write: subset.iloc[0, 0] = 0 - assert not np.may_share_memory(subset["a"].values, df["a"].values) + assert not np.shares_memory(subset["a"].values, df["a"].values) else: # INFO this no longer raise warning since pandas 1.4 @@ -109,10 +109,10 @@ def test_subset_column_slice(using_copy_on_write, 
using_array_manager, dtype): subset._mgr._verify_integrity() if using_copy_on_write: - assert np.may_share_memory(subset["b"].values, df["b"].values) + assert np.shares_memory(subset["b"].values, df["b"].values) subset.iloc[0, 0] = 0 - assert not np.may_share_memory(subset["b"].values, df["b"].values) + assert not np.shares_memory(subset["b"].values, df["b"].values) else: # we only get a warning in case of a single block @@ -363,12 +363,12 @@ def test_series_getitem_slice(using_copy_on_write): s_orig = s.copy() subset = s[:] - assert np.may_share_memory(subset.values, s.values) + assert np.shares_memory(subset.values, s.values) subset.iloc[0] = 0 if using_copy_on_write: - assert not np.may_share_memory(subset.values, s.values) + assert not np.shares_memory(subset.values, s.values) expected = pd.Series([0, 2, 3], index=["a", "b", "c"]) tm.assert_series_equal(subset, expected) @@ -413,11 +413,11 @@ def test_del_frame(using_copy_on_write): df_orig = df.copy() df2 = df[:] - assert np.may_share_memory(df["a"].values, df2["a"].values) + assert np.shares_memory(df["a"].values, df2["a"].values) del df2["b"] - assert np.may_share_memory(df["a"].values, df2["a"].values) + assert np.shares_memory(df["a"].values, df2["a"].values) tm.assert_frame_equal(df, df_orig) tm.assert_frame_equal(df2, df_orig[["a", "c"]]) df2._mgr._verify_integrity() @@ -438,11 +438,11 @@ def test_del_series(): s_orig = s.copy() s2 = s[:] - assert np.may_share_memory(s.values, s2.values) + assert np.shares_memory(s.values, s2.values) del s2["a"] - assert not np.may_share_memory(s.values, s2.values) + assert not np.shares_memory(s.values, s2.values) tm.assert_series_equal(s, s_orig) tm.assert_series_equal(s2, s_orig[["b", "c"]]) @@ -463,7 +463,7 @@ def test_column_as_series(using_copy_on_write, using_array_manager): s = df["a"] - assert np.may_share_memory(s.values, df["a"].values) + assert np.shares_memory(s.values, df["a"].values) if using_copy_on_write or using_array_manager: s[0] = 0 @@ -475,7 +475,7 
@@ def test_column_as_series(using_copy_on_write, using_array_manager): expected = pd.Series([0, 2, 3], name="a") tm.assert_series_equal(s, expected) if using_copy_on_write: - # assert not np.may_share_memory(s.values, df["a"].values) + # assert not np.shares_memory(s.values, df["a"].values) tm.assert_frame_equal(df, df_orig) # ensure cached series on getitem is not the changed series tm.assert_series_equal(df["a"], df_orig["a"]) @@ -522,7 +522,7 @@ def test_dataframe_add_column_from_series(): s = pd.Series([10, 11, 12]) df["new"] = s - assert not np.may_share_memory(df["new"].values, s.values) + assert not np.shares_memory(df["new"].values, s.values) # editing series -> doesn't modify column in frame s[0] = 0 diff --git a/pandas/tests/copy_view/test_methods.py b/pandas/tests/copy_view/test_methods.py index 35efcc465b6b9..964c5dc1b7b28 100644 --- a/pandas/tests/copy_view/test_methods.py +++ b/pandas/tests/copy_view/test_methods.py @@ -9,7 +9,7 @@ def test_copy(using_copy_on_write): df_copy = df.copy() # the deep copy doesn't share memory - assert not np.may_share_memory(df_copy["a"].values, df["a"].values) + assert not np.shares_memory(df_copy["a"].values, df["a"].values) if using_copy_on_write: assert df_copy._mgr.refs is None # type: ignore[union-attr] @@ -23,7 +23,7 @@ def test_copy_shallow(using_copy_on_write): df_copy = df.copy(deep=False) # the shallow copy still shares memory - assert np.may_share_memory(df_copy["a"].values, df["a"].values) + assert np.shares_memory(df_copy["a"].values, df["a"].values) if using_copy_on_write: assert df_copy._mgr.refs is not None # type: ignore[union-attr] @@ -32,15 +32,15 @@ def test_copy_shallow(using_copy_on_write): df_copy.iloc[0, 0] = 0 assert df.iloc[0, 0] == 1 # mutating triggered a copy-on-write -> no longer shares memory - assert not np.may_share_memory(df_copy["a"].values, df["a"].values) + assert not np.shares_memory(df_copy["a"].values, df["a"].values) # but still shares memory for the other columns/blocks - 
assert np.may_share_memory(df_copy["c"].values, df["c"].values) + assert np.shares_memory(df_copy["c"].values, df["c"].values) else: # mutating shallow copy does mutate original df_copy.iloc[0, 0] = 0 assert df.iloc[0, 0] == 0 # and still shares memory - assert np.may_share_memory(df_copy["a"].values, df["a"].values) + assert np.shares_memory(df_copy["a"].values, df["a"].values) # ----------------------------------------------------------------------------- @@ -59,13 +59,13 @@ def test_reset_index(using_copy_on_write): if using_copy_on_write: # still shares memory (df2 is a shallow copy) - assert np.may_share_memory(df2["b"].values, df["b"].values) - assert np.may_share_memory(df2["c"].values, df["c"].values) + assert np.shares_memory(df2["b"].values, df["b"].values) + assert np.shares_memory(df2["c"].values, df["c"].values) # mutating df2 triggers a copy-on-write for that column / block df2.iloc[0, 2] = 0 - assert not np.may_share_memory(df2["b"].values, df["b"].values) + assert not np.shares_memory(df2["b"].values, df["b"].values) if using_copy_on_write: - assert np.may_share_memory(df2["c"].values, df["c"].values) + assert np.shares_memory(df2["c"].values, df["c"].values) tm.assert_frame_equal(df, df_orig) @@ -77,11 +77,11 @@ def test_rename_columns(using_copy_on_write): df2 = df.rename(columns=str.upper) if using_copy_on_write: - assert np.may_share_memory(df2["A"].values, df["a"].values) + assert np.shares_memory(df2["A"].values, df["a"].values) df2.iloc[0, 0] = 0 - assert not np.may_share_memory(df2["A"].values, df["a"].values) + assert not np.shares_memory(df2["A"].values, df["a"].values) if using_copy_on_write: - assert np.may_share_memory(df2["C"].values, df["c"].values) + assert np.shares_memory(df2["C"].values, df["c"].values) expected = pd.DataFrame({"A": [0, 2, 3], "B": [4, 5, 6], "C": [0.1, 0.2, 0.3]}) tm.assert_frame_equal(df2, expected) tm.assert_frame_equal(df, df_orig) @@ -95,13 +95,13 @@ def test_rename_columns_modify_parent(using_copy_on_write): 
df2_orig = df2.copy() if using_copy_on_write: - assert np.may_share_memory(df2["A"].values, df["a"].values) + assert np.shares_memory(df2["A"].values, df["a"].values) else: - assert not np.may_share_memory(df2["A"].values, df["a"].values) + assert not np.shares_memory(df2["A"].values, df["a"].values) df.iloc[0, 0] = 0 - assert not np.may_share_memory(df2["A"].values, df["a"].values) + assert not np.shares_memory(df2["A"].values, df["a"].values) if using_copy_on_write: - assert np.may_share_memory(df2["C"].values, df["c"].values) + assert np.shares_memory(df2["C"].values, df["c"].values) expected = pd.DataFrame({"a": [0, 2, 3], "b": [4, 5, 6], "c": [0.1, 0.2, 0.3]}) tm.assert_frame_equal(df, expected) tm.assert_frame_equal(df2, df2_orig) @@ -116,12 +116,12 @@ def test_reindex_columns(using_copy_on_write): if using_copy_on_write: # still shares memory (df2 is a shallow copy) - assert np.may_share_memory(df2["a"].values, df["a"].values) + assert np.shares_memory(df2["a"].values, df["a"].values) else: - assert not np.may_share_memory(df2["a"].values, df["a"].values) + assert not np.shares_memory(df2["a"].values, df["a"].values) # mutating df2 triggers a copy-on-write for that column df2.iloc[0, 0] = 0 - assert not np.may_share_memory(df2["a"].values, df["a"].values) + assert not np.shares_memory(df2["a"].values, df["a"].values) if using_copy_on_write: - assert np.may_share_memory(df2["c"].values, df["c"].values) + assert np.shares_memory(df2["c"].values, df["c"].values) tm.assert_frame_equal(df, df_orig) From 6e49fd0512de7d3c7adc761374a98c6017b24602 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Fri, 20 May 2022 11:09:55 +0200 Subject: [PATCH 08/13] update pandas imports --- pandas/tests/copy_view/test_indexing.py | 80 ++++++++++++------------- pandas/tests/copy_view/test_methods.py | 18 +++--- 2 files changed, 49 insertions(+), 49 deletions(-) diff --git a/pandas/tests/copy_view/test_indexing.py b/pandas/tests/copy_view/test_indexing.py index 
7261e88c7e48d..7e32c80348267 100644 --- a/pandas/tests/copy_view/test_indexing.py +++ b/pandas/tests/copy_view/test_indexing.py @@ -2,6 +2,10 @@ import pytest import pandas as pd +from pandas import ( + DataFrame, + Series, +) import pandas._testing as tm import pandas.core.common as com @@ -12,7 +16,7 @@ def test_subset_column_selection(using_copy_on_write): # Case: taking a subset of the columns of a DataFrame # + afterwards modifying the subset - df = pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6], "c": [0.1, 0.2, 0.3]}) + df = DataFrame({"a": [1, 2, 3], "b": [4, 5, 6], "c": [0.1, 0.2, 0.3]}) df_orig = df.copy() subset = df[["a", "c"]] @@ -31,7 +35,7 @@ def test_subset_column_selection(using_copy_on_write): assert not np.shares_memory(subset["a"].values, df["a"].values) - expected = pd.DataFrame({"a": [0, 2, 3], "c": [0.1, 0.2, 0.3]}) + expected = DataFrame({"a": [0, 2, 3], "c": [0.1, 0.2, 0.3]}) tm.assert_frame_equal(subset, expected) tm.assert_frame_equal(df, df_orig) @@ -39,7 +43,7 @@ def test_subset_column_selection(using_copy_on_write): def test_subset_column_selection_modify_parent(using_copy_on_write): # Case: taking a subset of the columns of a DataFrame # + afterwards modifying the parent - df = pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6], "c": [0.1, 0.2, 0.3]}) + df = DataFrame({"a": [1, 2, 3], "b": [4, 5, 6], "c": [0.1, 0.2, 0.3]}) subset = df[["a", "c"]] if using_copy_on_write: @@ -53,14 +57,14 @@ def test_subset_column_selection_modify_parent(using_copy_on_write): # different column/block still shares memory assert np.shares_memory(subset["c"].values, df["c"].values) - expected = pd.DataFrame({"a": [1, 2, 3], "c": [0.1, 0.2, 0.3]}) + expected = DataFrame({"a": [1, 2, 3], "c": [0.1, 0.2, 0.3]}) tm.assert_frame_equal(subset, expected) def test_subset_row_slice(using_copy_on_write): # Case: taking a subset of the rows of a DataFrame using a slice # + afterwards modifying the subset - df = pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6], "c": [0.1, 0.2, 
0.3]}) + df = DataFrame({"a": [1, 2, 3], "b": [4, 5, 6], "c": [0.1, 0.2, 0.3]}) df_orig = df.copy() subset = df[1:3] @@ -80,9 +84,7 @@ def test_subset_row_slice(using_copy_on_write): subset._mgr._verify_integrity() - expected = pd.DataFrame( - {"a": [0, 3], "b": [5, 6], "c": [0.2, 0.3]}, index=range(1, 3) - ) + expected = DataFrame({"a": [0, 3], "b": [5, 6], "c": [0.2, 0.3]}, index=range(1, 3)) tm.assert_frame_equal(subset, expected) if using_copy_on_write: # original parent dataframe is not modified (CoW) @@ -100,7 +102,7 @@ def test_subset_column_slice(using_copy_on_write, using_array_manager, dtype): # Case: taking a subset of the columns of a DataFrame using a slice # + afterwards modifying the subset single_block = (dtype == "int64") and not using_array_manager - df = pd.DataFrame( + df = DataFrame( {"a": [1, 2, 3], "b": [4, 5, 6], "c": np.array([7, 8, 9], dtype=dtype)} ) df_orig = df.copy() @@ -121,7 +123,7 @@ def test_subset_column_slice(using_copy_on_write, using_array_manager, dtype): with tm.assert_produces_warning(warn): subset.iloc[0, 0] = 0 - expected = pd.DataFrame({"b": [0, 5, 6], "c": np.array([7, 8, 9], dtype=dtype)}) + expected = DataFrame({"b": [0, 5, 6], "c": np.array([7, 8, 9], dtype=dtype)}) tm.assert_frame_equal(subset, expected) # original parent dataframe is not modified (also not for BlockManager case, # except for single block) @@ -140,7 +142,7 @@ def test_subset_column_slice(using_copy_on_write, using_array_manager, dtype): def test_subset_set_with_row_indexer(indexer_si, indexer, using_copy_on_write): # Case: setting values with a row indexer on a viewing subset # subset[indexer] = value and subset.iloc[indexer] = value - df = pd.DataFrame({"a": [1, 2, 3, 4], "b": [4, 5, 6, 7], "c": [0.1, 0.2, 0.3, 0.4]}) + df = DataFrame({"a": [1, 2, 3, 4], "b": [4, 5, 6, 7], "c": [0.1, 0.2, 0.3, 0.4]}) df_orig = df.copy() subset = df[1:4] @@ -160,7 +162,7 @@ def test_subset_set_with_row_indexer(indexer_si, indexer, using_copy_on_write): with 
tm.assert_produces_warning(warn): indexer_si(subset)[indexer] = 0 - expected = pd.DataFrame( + expected = DataFrame( {"a": [0, 0, 4], "b": [0, 0, 7], "c": [0.0, 0.0, 0.4]}, index=range(1, 4) ) tm.assert_frame_equal(subset, expected) @@ -175,7 +177,7 @@ def test_subset_set_with_row_indexer(indexer_si, indexer, using_copy_on_write): def test_subset_set_with_mask(using_copy_on_write): # Case: setting values with a mask on a viewing subset: subset[mask] = value - df = pd.DataFrame({"a": [1, 2, 3, 4], "b": [4, 5, 6, 7], "c": [0.1, 0.2, 0.3, 0.4]}) + df = DataFrame({"a": [1, 2, 3, 4], "b": [4, 5, 6, 7], "c": [0.1, 0.2, 0.3, 0.4]}) df_orig = df.copy() subset = df[1:4] @@ -188,7 +190,7 @@ def test_subset_set_with_mask(using_copy_on_write): with tm.assert_produces_warning(com.SettingWithCopyWarning): subset[mask] = 0 - expected = pd.DataFrame( + expected = DataFrame( {"a": [2, 3, 0], "b": [0, 0, 0], "c": [0.20, 0.3, 0.4]}, index=range(1, 4) ) tm.assert_frame_equal(subset, expected) @@ -204,7 +206,7 @@ def test_subset_set_with_mask(using_copy_on_write): def test_subset_set_column(using_copy_on_write): # Case: setting a single column on a viewing subset -> subset[col] = value - df = pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6], "c": [0.1, 0.2, 0.3]}) + df = DataFrame({"a": [1, 2, 3], "b": [4, 5, 6], "c": [0.1, 0.2, 0.3]}) df_orig = df.copy() subset = df[1:3] @@ -216,7 +218,7 @@ def test_subset_set_column(using_copy_on_write): subset["a"] = np.array([10, 11], dtype="int64") subset._mgr._verify_integrity() - expected = pd.DataFrame( + expected = DataFrame( {"a": [10, 11], "b": [5, 6], "c": [0.2, 0.3]}, index=range(1, 3) ) tm.assert_frame_equal(subset, expected) @@ -229,7 +231,7 @@ def test_subset_set_column(using_copy_on_write): def test_subset_set_column_with_loc(using_copy_on_write, using_array_manager, dtype): # Case: setting a single column with loc on a viewing subset # -> subset.loc[:, col] = value - df = pd.DataFrame( + df = DataFrame( {"a": [1, 2, 3], "b": [4, 5, 6], 
"c": np.array([7, 8, 9], dtype=dtype)} ) df_orig = df.copy() @@ -243,7 +245,7 @@ def test_subset_set_column_with_loc(using_copy_on_write, using_array_manager, dt subset.loc[:, "a"] = np.array([10, 11], dtype="int64") subset._mgr._verify_integrity() - expected = pd.DataFrame( + expected = DataFrame( {"a": [10, 11], "b": [5, 6], "c": np.array([8, 9], dtype=dtype)}, index=range(1, 3), ) @@ -262,7 +264,7 @@ def test_subset_set_column_with_loc2(using_copy_on_write, using_array_manager): # -> subset.loc[:, col] = value # separate test for case of DataFrame of a single column -> takes a separate # code path - df = pd.DataFrame({"a": [1, 2, 3]}) + df = DataFrame({"a": [1, 2, 3]}) df_orig = df.copy() subset = df[1:3] @@ -274,7 +276,7 @@ def test_subset_set_column_with_loc2(using_copy_on_write, using_array_manager): subset.loc[:, "a"] = 0 subset._mgr._verify_integrity() - expected = pd.DataFrame({"a": [0, 0]}, index=range(1, 3)) + expected = DataFrame({"a": [0, 0]}, index=range(1, 3)) tm.assert_frame_equal(subset, expected) if using_copy_on_write or using_array_manager: # original parent dataframe is not modified (CoW) @@ -291,7 +293,7 @@ def test_subset_set_column_with_loc2(using_copy_on_write, using_array_manager): def test_subset_set_columns(using_copy_on_write, dtype): # Case: setting multiple columns on a viewing subset # -> subset[[col1, col2]] = value - df = pd.DataFrame( + df = DataFrame( {"a": [1, 2, 3], "b": [4, 5, 6], "c": np.array([7, 8, 9], dtype=dtype)} ) df_orig = df.copy() @@ -308,7 +310,7 @@ def test_subset_set_columns(using_copy_on_write, dtype): if using_copy_on_write: # first and third column should certainly have no references anymore assert all(subset._mgr._has_no_reference(i) for i in [0, 2]) - expected = pd.DataFrame({"a": [0, 0], "b": [5, 6], "c": [0, 0]}, index=range(1, 3)) + expected = DataFrame({"a": [0, 0], "b": [5, 6], "c": [0, 0]}, index=range(1, 3)) tm.assert_frame_equal(subset, expected) tm.assert_frame_equal(df, df_orig) @@ -323,7 +325,7 @@ 
def test_subset_set_with_column_indexer( ): # Case: setting multiple columns with a column indexer on a viewing subset # -> subset.loc[:, [col1, col2]] = value - df = pd.DataFrame({"a": [1, 2, 3], "b": [0.1, 0.2, 0.3], "c": [4, 5, 6]}) + df = DataFrame({"a": [1, 2, 3], "b": [0.1, 0.2, 0.3], "c": [4, 5, 6]}) df_orig = df.copy() subset = df[1:3] @@ -335,9 +337,7 @@ def test_subset_set_with_column_indexer( subset.loc[:, indexer] = 0 subset._mgr._verify_integrity() - expected = pd.DataFrame( - {"a": [0, 0], "b": [0.0, 0.0], "c": [5, 6]}, index=range(1, 3) - ) + expected = DataFrame({"a": [0, 0], "b": [0.0, 0.0], "c": [5, 6]}, index=range(1, 3)) # TODO full row slice .loc[:, idx] update inplace instead of overwrite? expected["b"] = expected["b"].astype("int64") tm.assert_frame_equal(subset, expected) @@ -359,7 +359,7 @@ def test_subset_set_with_column_indexer( def test_series_getitem_slice(using_copy_on_write): # Case: taking a slice of a Series + afterwards modifying the subset - s = pd.Series([1, 2, 3], index=["a", "b", "c"]) + s = Series([1, 2, 3], index=["a", "b", "c"]) s_orig = s.copy() subset = s[:] @@ -370,7 +370,7 @@ def test_series_getitem_slice(using_copy_on_write): if using_copy_on_write: assert not np.shares_memory(subset.values, s.values) - expected = pd.Series([0, 2, 3], index=["a", "b", "c"]) + expected = Series([0, 2, 3], index=["a", "b", "c"]) tm.assert_series_equal(subset, expected) if using_copy_on_write: @@ -388,12 +388,12 @@ def test_series_getitem_slice(using_copy_on_write): ) def test_series_subset_set_with_indexer(indexer_si, indexer, using_copy_on_write): # Case: setting values in a viewing Series with an indexer - s = pd.Series([1, 2, 3], index=["a", "b", "c"]) + s = Series([1, 2, 3], index=["a", "b", "c"]) s_orig = s.copy() subset = s[:] indexer_si(subset)[indexer] = 0 - expected = pd.Series([0, 0, 3], index=["a", "b", "c"]) + expected = Series([0, 0, 3], index=["a", "b", "c"]) tm.assert_series_equal(subset, expected) if using_copy_on_write: 
@@ -409,7 +409,7 @@ def test_series_subset_set_with_indexer(indexer_si, indexer, using_copy_on_write def test_del_frame(using_copy_on_write): # Case: deleting a column with `del` on a viewing child dataframe should # not modify parent + update the references - df = pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6], "c": [0.1, 0.2, 0.3]}) + df = DataFrame({"a": [1, 2, 3], "b": [4, 5, 6], "c": [0.1, 0.2, 0.3]}) df_orig = df.copy() df2 = df[:] @@ -434,7 +434,7 @@ def test_del_frame(using_copy_on_write): def test_del_series(): - s = pd.Series([1, 2, 3], index=["a", "b", "c"]) + s = Series([1, 2, 3], index=["a", "b", "c"]) s_orig = s.copy() s2 = s[:] @@ -458,7 +458,7 @@ def test_del_series(): def test_column_as_series(using_copy_on_write, using_array_manager): # Case: selecting a single column now also uses Copy-on-Write - df = pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6], "c": [0.1, 0.2, 0.3]}) + df = DataFrame({"a": [1, 2, 3], "b": [4, 5, 6], "c": [0.1, 0.2, 0.3]}) df_orig = df.copy() s = df["a"] @@ -472,7 +472,7 @@ def test_column_as_series(using_copy_on_write, using_array_manager): with tm.assert_produces_warning(com.SettingWithCopyWarning): s[0] = 0 - expected = pd.Series([0, 2, 3], name="a") + expected = Series([0, 2, 3], name="a") tm.assert_series_equal(s, expected) if using_copy_on_write: # assert not np.shares_memory(s.values, df["a"].values) @@ -488,7 +488,7 @@ def test_column_as_series_set_with_upcast(using_copy_on_write, using_array_manag # Case: selecting a single column now also uses Copy-on-Write -> when # setting a value causes an upcast, we don't need to update the parent # DataFrame through the cache mechanism - df = pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6], "c": [0.1, 0.2, 0.3]}) + df = DataFrame({"a": [1, 2, 3], "b": [4, 5, 6], "c": [0.1, 0.2, 0.3]}) df_orig = df.copy() s = df["a"] @@ -499,7 +499,7 @@ def test_column_as_series_set_with_upcast(using_copy_on_write, using_array_manag with tm.assert_produces_warning(com.SettingWithCopyWarning): s[0] = 
"foo" - expected = pd.Series(["foo", 2, 3], dtype=object, name="a") + expected = Series(["foo", 2, 3], dtype=object, name="a") tm.assert_series_equal(s, expected) if using_copy_on_write: tm.assert_frame_equal(df, df_orig) @@ -518,20 +518,20 @@ def test_dataframe_add_column_from_series(): # -> always already takes a copy on assignment # (no change in behaviour here) # TODO can we achieve the same behaviour with Copy-on-Write? - df = pd.DataFrame({"a": [1, 2, 3], "b": [0.1, 0.2, 0.3]}) + df = DataFrame({"a": [1, 2, 3], "b": [0.1, 0.2, 0.3]}) - s = pd.Series([10, 11, 12]) + s = Series([10, 11, 12]) df["new"] = s assert not np.shares_memory(df["new"].values, s.values) # editing series -> doesn't modify column in frame s[0] = 0 - expected = pd.DataFrame({"a": [1, 2, 3], "b": [0.1, 0.2, 0.3], "new": [10, 11, 12]}) + expected = DataFrame({"a": [1, 2, 3], "b": [0.1, 0.2, 0.3], "new": [10, 11, 12]}) tm.assert_frame_equal(df, expected) # editing column in frame -> doesn't modify series df.loc[2, "new"] = 100 - expected_s = pd.Series([0, 11, 12]) + expected_s = Series([0, 11, 12]) tm.assert_series_equal(s, expected_s) diff --git a/pandas/tests/copy_view/test_methods.py b/pandas/tests/copy_view/test_methods.py index 964c5dc1b7b28..a149b8db915d0 100644 --- a/pandas/tests/copy_view/test_methods.py +++ b/pandas/tests/copy_view/test_methods.py @@ -1,11 +1,11 @@ import numpy as np -import pandas as pd +from pandas import DataFrame import pandas._testing as tm def test_copy(using_copy_on_write): - df = pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6], "c": [0.1, 0.2, 0.3]}) + df = DataFrame({"a": [1, 2, 3], "b": [4, 5, 6], "c": [0.1, 0.2, 0.3]}) df_copy = df.copy() # the deep copy doesn't share memory @@ -19,7 +19,7 @@ def test_copy(using_copy_on_write): def test_copy_shallow(using_copy_on_write): - df = pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6], "c": [0.1, 0.2, 0.3]}) + df = DataFrame({"a": [1, 2, 3], "b": [4, 5, 6], "c": [0.1, 0.2, 0.3]}) df_copy = df.copy(deep=False) # the shallow 
copy still shares memory @@ -50,7 +50,7 @@ def test_copy_shallow(using_copy_on_write): def test_reset_index(using_copy_on_write): # Case: resetting the index (i.e. adding a new column) + mutating the # resulting dataframe - df = pd.DataFrame( + df = DataFrame( {"a": [1, 2, 3], "b": [4, 5, 6], "c": [0.1, 0.2, 0.3]}, index=[10, 11, 12] ) df_orig = df.copy() @@ -72,7 +72,7 @@ def test_reset_index(using_copy_on_write): def test_rename_columns(using_copy_on_write): # Case: renaming columns returns a new dataframe # + afterwards modifying the result - df = pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6], "c": [0.1, 0.2, 0.3]}) + df = DataFrame({"a": [1, 2, 3], "b": [4, 5, 6], "c": [0.1, 0.2, 0.3]}) df_orig = df.copy() df2 = df.rename(columns=str.upper) @@ -82,7 +82,7 @@ def test_rename_columns(using_copy_on_write): assert not np.shares_memory(df2["A"].values, df["a"].values) if using_copy_on_write: assert np.shares_memory(df2["C"].values, df["c"].values) - expected = pd.DataFrame({"A": [0, 2, 3], "B": [4, 5, 6], "C": [0.1, 0.2, 0.3]}) + expected = DataFrame({"A": [0, 2, 3], "B": [4, 5, 6], "C": [0.1, 0.2, 0.3]}) tm.assert_frame_equal(df2, expected) tm.assert_frame_equal(df, df_orig) @@ -90,7 +90,7 @@ def test_rename_columns(using_copy_on_write): def test_rename_columns_modify_parent(using_copy_on_write): # Case: renaming columns returns a new dataframe # + afterwards modifying the original (parent) dataframe - df = pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6], "c": [0.1, 0.2, 0.3]}) + df = DataFrame({"a": [1, 2, 3], "b": [4, 5, 6], "c": [0.1, 0.2, 0.3]}) df2 = df.rename(columns=str.upper) df2_orig = df2.copy() @@ -102,7 +102,7 @@ def test_rename_columns_modify_parent(using_copy_on_write): assert not np.shares_memory(df2["A"].values, df["a"].values) if using_copy_on_write: assert np.shares_memory(df2["C"].values, df["c"].values) - expected = pd.DataFrame({"a": [0, 2, 3], "b": [4, 5, 6], "c": [0.1, 0.2, 0.3]}) + expected = DataFrame({"a": [0, 2, 3], "b": [4, 5, 6], "c": [0.1, 
0.2, 0.3]}) tm.assert_frame_equal(df, expected) tm.assert_frame_equal(df2, df2_orig) @@ -110,7 +110,7 @@ def test_rename_columns_modify_parent(using_copy_on_write): def test_reindex_columns(using_copy_on_write): # Case: reindexing the column returns a new dataframe # + afterwards modifying the result - df = pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6], "c": [0.1, 0.2, 0.3]}) + df = DataFrame({"a": [1, 2, 3], "b": [4, 5, 6], "c": [0.1, 0.2, 0.3]}) df_orig = df.copy() df2 = df.reindex(columns=["a", "c"]) From 768ec6fa528f5b8511c520ce345aa7ac4d07a468 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Mon, 23 May 2022 14:09:32 +0200 Subject: [PATCH 09/13] add test for iloc/low with row+column indexer --- pandas/tests/copy_view/test_indexing.py | 94 +++++++++++++++++++++++++ 1 file changed, 94 insertions(+) diff --git a/pandas/tests/copy_view/test_indexing.py b/pandas/tests/copy_view/test_indexing.py index 7e32c80348267..9157d9825c69a 100644 --- a/pandas/tests/copy_view/test_indexing.py +++ b/pandas/tests/copy_view/test_indexing.py @@ -134,6 +134,100 @@ def test_subset_column_slice(using_copy_on_write, using_array_manager, dtype): tm.assert_frame_equal(df, df_orig) +@pytest.mark.parametrize( + "dtype", ["int64", "float64"], ids=["single-block", "mixed-block"] +) +@pytest.mark.parametrize( + "row_indexer", + [slice(1, 2), np.array([False, True, True]), np.array([1, 2])], + ids=["slice", "mask", "array"], +) +@pytest.mark.parametrize( + "column_indexer", + [slice("b", "c"), np.array([False, True, True]), ["b", "c"]], + ids=["slice", "mask", "array"], +) +def test_subset_loc_rows_columns( + dtype, row_indexer, column_indexer, using_array_manager +): + # Case: taking a subset of the rows+columns of a DataFrame using .loc + # + afterwards modifying the subset + # Generic test for several combinations of row/column indexers, not all + # of those could actually return a view / need CoW (so this test is not + # checking memory sharing, only ensuring subsequent mutation 
doesn't + # affect the parent dataframe) + df = DataFrame( + {"a": [1, 2, 3], "b": [4, 5, 6], "c": np.array([7, 8, 9], dtype=dtype)} + ) + df_orig = df.copy() + + subset = df.loc[row_indexer, column_indexer] + + # modifying the subset never modifies the parent + subset.iloc[0, 0] = 0 + + expected = DataFrame( + {"b": [0, 6], "c": np.array([8, 9], dtype=dtype)}, index=range(1, 3) + ) + tm.assert_frame_equal(subset, expected) + # a few corner cases _do_ actually modify the parent (with both row and column + # slice, and in case of ArrayManager or BlockManager with single block) + if ( + isinstance(row_indexer, slice) + and isinstance(column_indexer, slice) + and (using_array_manager or dtype == "int64") + ): + df_orig.iloc[1, 1] = 0 + tm.assert_frame_equal(df, df_orig) + + +@pytest.mark.parametrize( + "dtype", ["int64", "float64"], ids=["single-block", "mixed-block"] +) +@pytest.mark.parametrize( + "row_indexer", + [slice(1, 3), np.array([False, True, True]), np.array([1, 2])], + ids=["slice", "mask", "array"], +) +@pytest.mark.parametrize( + "column_indexer", + [slice(1, 3), np.array([False, True, True]), [1, 2]], + ids=["slice", "mask", "array"], +) +def test_subset_iloc_rows_columns( + dtype, row_indexer, column_indexer, using_array_manager +): + # Case: taking a subset of the rows+columns of a DataFrame using .iloc + # + afterwards modifying the subset + # Generic test for several combinations of row/column indexers, not all + # of those could actually return a view / need CoW (so this test is not + # checking memory sharing, only ensuring subsequent mutation doesn't + # affect the parent dataframe) + df = DataFrame( + {"a": [1, 2, 3], "b": [4, 5, 6], "c": np.array([7, 8, 9], dtype=dtype)} + ) + df_orig = df.copy() + + subset = df.iloc[row_indexer, column_indexer] + + # modifying the subset never modifies the parent + subset.iloc[0, 0] = 0 + + expected = DataFrame( + {"b": [0, 6], "c": np.array([8, 9], dtype=dtype)}, index=range(1, 3) + ) + 
tm.assert_frame_equal(subset, expected) + # a few corner cases _do_ actually modify the parent (with both row and column + # slice, and in case of ArrayManager or BlockManager with single block) + if ( + isinstance(row_indexer, slice) + and isinstance(column_indexer, slice) + and (using_array_manager or dtype == "int64") + ): + df_orig.iloc[1, 1] = 0 + tm.assert_frame_equal(df, df_orig) + + @pytest.mark.parametrize( "indexer", [slice(0, 2), np.array([True, True, False]), np.array([0, 1])], From 0d523ef6562b82f940bad228d41bc2ce078ffd42 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Mon, 23 May 2022 15:24:37 +0200 Subject: [PATCH 10/13] user helper to get column values to assert np.shares_memory --- pandas/tests/copy_view/__init__.py | 0 pandas/tests/copy_view/test_indexing.py | 31 ++++++++++--------- pandas/tests/copy_view/test_methods.py | 41 +++++++++++++------------ pandas/tests/copy_view/util.py | 11 +++++++ 4 files changed, 48 insertions(+), 35 deletions(-) create mode 100644 pandas/tests/copy_view/__init__.py create mode 100644 pandas/tests/copy_view/util.py diff --git a/pandas/tests/copy_view/__init__.py b/pandas/tests/copy_view/__init__.py new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/pandas/tests/copy_view/test_indexing.py b/pandas/tests/copy_view/test_indexing.py index 9157d9825c69a..98945e33a7494 100644 --- a/pandas/tests/copy_view/test_indexing.py +++ b/pandas/tests/copy_view/test_indexing.py @@ -8,6 +8,7 @@ ) import pandas._testing as tm import pandas.core.common as com +from pandas.tests.copy_view.util import get_array # ----------------------------------------------------------------------------- # Indexing operations taking subset + modifying the subset/parent @@ -23,17 +24,17 @@ def test_subset_column_selection(using_copy_on_write): if using_copy_on_write: # the subset shares memory ... 
- assert np.shares_memory(subset["a"].values, df["a"].values) + assert np.shares_memory(get_array(subset, "a"), get_array(df, "a")) # ... but uses CoW when being modified subset.iloc[0, 0] = 0 else: - assert not np.shares_memory(subset["a"].values, df["a"].values) + assert not np.shares_memory(get_array(subset, "a"), get_array(df, "a")) # INFO this no longer raise warning since pandas 1.4 # with pd.option_context("chained_assignment", "warn"): # with tm.assert_produces_warning(com.SettingWithCopyWarning): subset.iloc[0, 0] = 0 - assert not np.shares_memory(subset["a"].values, df["a"].values) + assert not np.shares_memory(get_array(subset, "a"), get_array(df, "a")) expected = DataFrame({"a": [0, 2, 3], "c": [0.1, 0.2, 0.3]}) tm.assert_frame_equal(subset, expected) @@ -48,14 +49,14 @@ def test_subset_column_selection_modify_parent(using_copy_on_write): subset = df[["a", "c"]] if using_copy_on_write: # the subset shares memory ... - assert np.shares_memory(subset["a"].values, df["a"].values) + assert np.shares_memory(get_array(subset, "a"), get_array(df, "a")) # ... 
but parent uses CoW parent when it is modified df.iloc[0, 0] = 0 - assert not np.shares_memory(subset["a"].values, df["a"].values) + assert not np.shares_memory(get_array(subset, "a"), get_array(df, "a")) if using_copy_on_write: # different column/block still shares memory - assert np.shares_memory(subset["c"].values, df["c"].values) + assert np.shares_memory(get_array(subset, "c"), get_array(df, "c")) expected = DataFrame({"a": [1, 2, 3], "c": [0.1, 0.2, 0.3]}) tm.assert_frame_equal(subset, expected) @@ -70,11 +71,11 @@ def test_subset_row_slice(using_copy_on_write): subset = df[1:3] subset._mgr._verify_integrity() - assert np.shares_memory(subset["a"].values, df["a"].values) + assert np.shares_memory(get_array(subset, "a"), get_array(df, "a")) if using_copy_on_write: subset.iloc[0, 0] = 0 - assert not np.shares_memory(subset["a"].values, df["a"].values) + assert not np.shares_memory(get_array(subset, "a"), get_array(df, "a")) else: # INFO this no longer raise warning since pandas 1.4 @@ -111,10 +112,10 @@ def test_subset_column_slice(using_copy_on_write, using_array_manager, dtype): subset._mgr._verify_integrity() if using_copy_on_write: - assert np.shares_memory(subset["b"].values, df["b"].values) + assert np.shares_memory(get_array(subset, "b"), get_array(df, "b")) subset.iloc[0, 0] = 0 - assert not np.shares_memory(subset["b"].values, df["b"].values) + assert not np.shares_memory(get_array(subset, "b"), get_array(df, "b")) else: # we only get a warning in case of a single block @@ -507,11 +508,11 @@ def test_del_frame(using_copy_on_write): df_orig = df.copy() df2 = df[:] - assert np.shares_memory(df["a"].values, df2["a"].values) + assert np.shares_memory(get_array(df, "a"), get_array(df2, "a")) del df2["b"] - assert np.shares_memory(df["a"].values, df2["a"].values) + assert np.shares_memory(get_array(df, "a"), get_array(df2, "a")) tm.assert_frame_equal(df, df_orig) tm.assert_frame_equal(df2, df_orig[["a", "c"]]) df2._mgr._verify_integrity() @@ -557,7 +558,7 @@ 
def test_column_as_series(using_copy_on_write, using_array_manager): s = df["a"] - assert np.shares_memory(s.values, df["a"].values) + assert np.shares_memory(s.values, get_array(df, "a")) if using_copy_on_write or using_array_manager: s[0] = 0 @@ -569,7 +570,7 @@ def test_column_as_series(using_copy_on_write, using_array_manager): expected = Series([0, 2, 3], name="a") tm.assert_series_equal(s, expected) if using_copy_on_write: - # assert not np.shares_memory(s.values, df["a"].values) + # assert not np.shares_memory(s.values, get_array(df, "a")) tm.assert_frame_equal(df, df_orig) # ensure cached series on getitem is not the changed series tm.assert_series_equal(df["a"], df_orig["a"]) @@ -616,7 +617,7 @@ def test_dataframe_add_column_from_series(): s = Series([10, 11, 12]) df["new"] = s - assert not np.shares_memory(df["new"].values, s.values) + assert not np.shares_memory(get_array(df, "new"), s.values) # editing series -> doesn't modify column in frame s[0] = 0 diff --git a/pandas/tests/copy_view/test_methods.py b/pandas/tests/copy_view/test_methods.py index a149b8db915d0..9783a3469bcfa 100644 --- a/pandas/tests/copy_view/test_methods.py +++ b/pandas/tests/copy_view/test_methods.py @@ -2,6 +2,7 @@ from pandas import DataFrame import pandas._testing as tm +from pandas.tests.copy_view.util import get_array def test_copy(using_copy_on_write): @@ -9,7 +10,7 @@ def test_copy(using_copy_on_write): df_copy = df.copy() # the deep copy doesn't share memory - assert not np.shares_memory(df_copy["a"].values, df["a"].values) + assert not np.shares_memory(get_array(df_copy, "a"), get_array(df, "a")) if using_copy_on_write: assert df_copy._mgr.refs is None # type: ignore[union-attr] @@ -23,7 +24,7 @@ def test_copy_shallow(using_copy_on_write): df_copy = df.copy(deep=False) # the shallow copy still shares memory - assert np.shares_memory(df_copy["a"].values, df["a"].values) + assert np.shares_memory(get_array(df_copy, "a"), get_array(df, "a")) if using_copy_on_write: assert 
df_copy._mgr.refs is not None # type: ignore[union-attr] @@ -32,15 +33,15 @@ def test_copy_shallow(using_copy_on_write): df_copy.iloc[0, 0] = 0 assert df.iloc[0, 0] == 1 # mutating triggered a copy-on-write -> no longer shares memory - assert not np.shares_memory(df_copy["a"].values, df["a"].values) + assert not np.shares_memory(get_array(df_copy, "a"), get_array(df, "a")) # but still shares memory for the other columns/blocks - assert np.shares_memory(df_copy["c"].values, df["c"].values) + assert np.shares_memory(get_array(df_copy, "c"), get_array(df, "c")) else: # mutating shallow copy does mutate original df_copy.iloc[0, 0] = 0 assert df.iloc[0, 0] == 0 # and still shares memory - assert np.shares_memory(df_copy["a"].values, df["a"].values) + assert np.shares_memory(get_array(df_copy, "a"), get_array(df, "a")) # ----------------------------------------------------------------------------- @@ -59,13 +60,13 @@ def test_reset_index(using_copy_on_write): if using_copy_on_write: # still shares memory (df2 is a shallow copy) - assert np.shares_memory(df2["b"].values, df["b"].values) - assert np.shares_memory(df2["c"].values, df["c"].values) + assert np.shares_memory(get_array(df2, "b"), get_array(df, "b")) + assert np.shares_memory(get_array(df2, "c"), get_array(df, "c")) # mutating df2 triggers a copy-on-write for that column / block df2.iloc[0, 2] = 0 - assert not np.shares_memory(df2["b"].values, df["b"].values) + assert not np.shares_memory(get_array(df2, "b"), get_array(df, "b")) if using_copy_on_write: - assert np.shares_memory(df2["c"].values, df["c"].values) + assert np.shares_memory(get_array(df2, "c"), get_array(df, "c")) tm.assert_frame_equal(df, df_orig) @@ -77,11 +78,11 @@ def test_rename_columns(using_copy_on_write): df2 = df.rename(columns=str.upper) if using_copy_on_write: - assert np.shares_memory(df2["A"].values, df["a"].values) + assert np.shares_memory(get_array(df2, "A"), get_array(df, "a")) df2.iloc[0, 0] = 0 - assert not 
np.shares_memory(df2["A"].values, df["a"].values) + assert not np.shares_memory(get_array(df2, "A"), get_array(df, "a")) if using_copy_on_write: - assert np.shares_memory(df2["C"].values, df["c"].values) + assert np.shares_memory(get_array(df2, "C"), get_array(df, "c")) expected = DataFrame({"A": [0, 2, 3], "B": [4, 5, 6], "C": [0.1, 0.2, 0.3]}) tm.assert_frame_equal(df2, expected) tm.assert_frame_equal(df, df_orig) @@ -95,13 +96,13 @@ def test_rename_columns_modify_parent(using_copy_on_write): df2_orig = df2.copy() if using_copy_on_write: - assert np.shares_memory(df2["A"].values, df["a"].values) + assert np.shares_memory(get_array(df2, "A"), get_array(df, "a")) else: - assert not np.shares_memory(df2["A"].values, df["a"].values) + assert not np.shares_memory(get_array(df2, "A"), get_array(df, "a")) df.iloc[0, 0] = 0 - assert not np.shares_memory(df2["A"].values, df["a"].values) + assert not np.shares_memory(get_array(df2, "A"), get_array(df, "a")) if using_copy_on_write: - assert np.shares_memory(df2["C"].values, df["c"].values) + assert np.shares_memory(get_array(df2, "C"), get_array(df, "c")) expected = DataFrame({"a": [0, 2, 3], "b": [4, 5, 6], "c": [0.1, 0.2, 0.3]}) tm.assert_frame_equal(df, expected) tm.assert_frame_equal(df2, df2_orig) @@ -116,12 +117,12 @@ def test_reindex_columns(using_copy_on_write): if using_copy_on_write: # still shares memory (df2 is a shallow copy) - assert np.shares_memory(df2["a"].values, df["a"].values) + assert np.shares_memory(get_array(df2, "a"), get_array(df, "a")) else: - assert not np.shares_memory(df2["a"].values, df["a"].values) + assert not np.shares_memory(get_array(df2, "a"), get_array(df, "a")) # mutating df2 triggers a copy-on-write for that column df2.iloc[0, 0] = 0 - assert not np.shares_memory(df2["a"].values, df["a"].values) + assert not np.shares_memory(get_array(df2, "a"), get_array(df, "a")) if using_copy_on_write: - assert np.shares_memory(df2["c"].values, df["c"].values) + assert 
np.shares_memory(get_array(df2, "c"), get_array(df, "c")) tm.assert_frame_equal(df, df_orig) diff --git a/pandas/tests/copy_view/util.py b/pandas/tests/copy_view/util.py new file mode 100644 index 0000000000000..9e358c7eec749 --- /dev/null +++ b/pandas/tests/copy_view/util.py @@ -0,0 +1,11 @@ +def get_array(df, col): + """ + Helper method to get array for a DataFrame column. + + Equivalent of df[col].values, but without going through normal getitem, + which triggers tracking references / CoW (and we might be testing that + this is done by some other operation). + """ + icol = df.columns.get_loc(col) + assert isinstance(icol, int) + return df._get_column_array(icol) From c41ad6ae6c803ef9392b6cd63212a94ee7344e69 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Mon, 23 May 2022 19:17:22 +0200 Subject: [PATCH 11/13] fix typing --- pandas/tests/copy_view/test_methods.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/tests/copy_view/test_methods.py b/pandas/tests/copy_view/test_methods.py index 9783a3469bcfa..1ed458e95b78e 100644 --- a/pandas/tests/copy_view/test_methods.py +++ b/pandas/tests/copy_view/test_methods.py @@ -12,7 +12,7 @@ def test_copy(using_copy_on_write): # the deep copy doesn't share memory assert not np.shares_memory(get_array(df_copy, "a"), get_array(df, "a")) if using_copy_on_write: - assert df_copy._mgr.refs is None # type: ignore[union-attr] + assert df_copy._mgr.refs is None # mutating copy doesn't mutate original df_copy.iloc[0, 0] = 0 @@ -26,7 +26,7 @@ def test_copy_shallow(using_copy_on_write): # the shallow copy still shares memory assert np.shares_memory(get_array(df_copy, "a"), get_array(df, "a")) if using_copy_on_write: - assert df_copy._mgr.refs is not None # type: ignore[union-attr] + assert df_copy._mgr.refs is not None if using_copy_on_write: # mutating shallow copy doesn't mutate original From 1d4d476e7a9db6e142d0829619c7e259b1e18237 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Wed, 
25 May 2022 19:01:19 +0200 Subject: [PATCH 12/13] handle new iloc inplace deprecation warnings --- pandas/tests/copy_view/test_indexing.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/pandas/tests/copy_view/test_indexing.py b/pandas/tests/copy_view/test_indexing.py index 98945e33a7494..3e8d0faca7ee4 100644 --- a/pandas/tests/copy_view/test_indexing.py +++ b/pandas/tests/copy_view/test_indexing.py @@ -428,7 +428,11 @@ def test_subset_set_with_column_indexer( subset.loc[:, indexer] = 0 else: with pd.option_context("chained_assignment", "warn"): - with tm.assert_produces_warning(com.SettingWithCopyWarning): + # The (i)loc[:, col] inplace deprecation gets triggered here, ignore those + # warnings and only assert the SettingWithCopyWarning + with tm.assert_produces_warning( + com.SettingWithCopyWarning, raise_on_extra_warnings=False + ): subset.loc[:, indexer] = 0 subset._mgr._verify_integrity() From 4bf82b0c20ef8702ae9c4d7d9560a9fcfc88ce48 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Tue, 31 May 2022 03:12:53 +0200 Subject: [PATCH 13/13] handle new iloc deprecation warnings for ArrayManager --- pandas/tests/copy_view/test_indexing.py | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) diff --git a/pandas/tests/copy_view/test_indexing.py b/pandas/tests/copy_view/test_indexing.py index 3e8d0faca7ee4..16cd72cc1cb06 100644 --- a/pandas/tests/copy_view/test_indexing.py +++ b/pandas/tests/copy_view/test_indexing.py @@ -336,7 +336,13 @@ def test_subset_set_column_with_loc(using_copy_on_write, using_array_manager, dt subset.loc[:, "a"] = np.array([10, 11], dtype="int64") else: with pd.option_context("chained_assignment", "warn"): - with tm.assert_produces_warning(com.SettingWithCopyWarning): + # The (i)loc[:, col] inplace deprecation gets triggered here, ignore those + # warnings and only assert the SettingWithCopyWarning + raise_on_extra_warnings = False if using_array_manager else True + with 
tm.assert_produces_warning( + com.SettingWithCopyWarning, + raise_on_extra_warnings=raise_on_extra_warnings, + ): subset.loc[:, "a"] = np.array([10, 11], dtype="int64") subset._mgr._verify_integrity() @@ -367,7 +373,13 @@ def test_subset_set_column_with_loc2(using_copy_on_write, using_array_manager): subset.loc[:, "a"] = 0 else: with pd.option_context("chained_assignment", "warn"): - with tm.assert_produces_warning(com.SettingWithCopyWarning): + # The (i)loc[:, col] inplace deprecation gets triggered here, ignore those + # warnings and only assert the SettingWithCopyWarning + raise_on_extra_warnings = False if using_array_manager else True + with tm.assert_produces_warning( + com.SettingWithCopyWarning, + raise_on_extra_warnings=raise_on_extra_warnings, + ): subset.loc[:, "a"] = 0 subset._mgr._verify_integrity()