From 8e46e4e47a11d37cb7b2b49079307b5a49c57488 Mon Sep 17 00:00:00 2001 From: Anatoly Myachev Date: Mon, 11 Mar 2024 16:31:25 +0100 Subject: [PATCH 1/7] TEST-#7049: Add some sanity tests with pyarrow-backed pandas dataframes Signed-off-by: Anatoly Myachev --- modin/core/dataframe/algebra/binary.py | 1 + modin/tests/pandas/test_series.py | 88 ++++++++++++++++++++++++++ 2 files changed, 89 insertions(+) diff --git a/modin/core/dataframe/algebra/binary.py b/modin/core/dataframe/algebra/binary.py index b2ffc2e6788..75474bb3f08 100644 --- a/modin/core/dataframe/algebra/binary.py +++ b/modin/core/dataframe/algebra/binary.py @@ -250,6 +250,7 @@ def try_compute_new_dtypes( try: if infer_dtypes == "bool" or is_bool_dtype(result_dtype): + # can be `pandas.api.types.pandas_dtype("bool[pyarrow]")` depending on the data dtypes = maybe_build_dtypes_series( first, second, dtype=pandas.api.types.pandas_dtype(bool) ) diff --git a/modin/tests/pandas/test_series.py b/modin/tests/pandas/test_series.py index b079ce586dd..3530a2268d9 100644 --- a/modin/tests/pandas/test_series.py +++ b/modin/tests/pandas/test_series.py @@ -1387,6 +1387,94 @@ def test_constructor_arrow_extension_array(): df_equals(md_ser.dtypes, pd_ser.dtypes) +def test_pyarrow_constructor(): + pa = pytest.importorskip("pyarrow") + data = list("abcd") + _ = pd.Series(data, dtype="string[pyarrow]") + _ = pd.Series(data, dtype=pd.ArrowDtype(pa.string())) + + list_str_type = pa.list_(pa.string()) + _ = pd.Series([["hello"], ["there"]], dtype=pd.ArrowDtype(list_str_type)) + + from datetime import time + + _ = pd.Index([time(12, 30), None], dtype=pd.ArrowDtype(pa.time64("us"))) + + from decimal import Decimal + + decimal_type = pd.ArrowDtype(pa.decimal128(3, scale=2)) + + data = [[Decimal("3.19"), None], [None, Decimal("-1.23")]] + + _ = pd.DataFrame(data, dtype=decimal_type) + + +def test_pyarrow_array_retrieve(): + pa = pytest.importorskip("pyarrow") + modin_series, pandas_series = create_test_series( + [1, 2, None], dtype="uint8[pyarrow]" + ) + eval_general( + modin_series, + pandas_series, + lambda ser: pa.array(ser), + raising_exceptions=(Exception,), + ) + + +def test_pyarrow_functions(): + pytest.importorskip("pyarrow") + modin_series, pandas_series = create_test_series( + [-1.545, 0.211, None], dtype="float32[pyarrow]" + ) + df_equals(modin_series.mean(), pandas_series.mean()) + + def comparator(df1, df2): + df_equals(df1, df2) + df_equals(df1.dtypes, df2.dtypes) + + eval_general( + modin_series, + pandas_series, + lambda ser: ser + + (modin_series if isinstance(ser, pd.Series) else pandas_series), + comparator=comparator, + raising_exceptions=(Exception,), + ) + + eval_general( + modin_series, + pandas_series, + lambda ser: ser > (ser + 1), + comparator=comparator, + raising_exceptions=(Exception,), + ) + + eval_general( + modin_series, + pandas_series, + lambda ser: ser.dropna(), + comparator=comparator, + raising_exceptions=(Exception,), + ) + + eval_general( + modin_series, + pandas_series, + lambda ser: ser.isna(), + comparator=comparator, + raising_exceptions=(Exception,), + ) + + eval_general( + modin_series, + pandas_series, + lambda ser: ser.fillna(0), + comparator=comparator, + raising_exceptions=(Exception,), + ) + + @pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) def test_copy(data): modin_series, pandas_series = create_test_series(data) From 6814c6eddee2b1004d8b8f24495e8c410cb16c7c Mon Sep 17 00:00:00 2001 From: Anatoly Myachev Date: Tue, 2 Apr 2024 15:59:51 +0200 Subject: [PATCH 2/7] fixes Signed-off-by: Anatoly Myachev --- modin/tests/pandas/test_series.py | 6 ------ 1 file changed, 6 deletions(-) diff --git a/modin/tests/pandas/test_series.py b/modin/tests/pandas/test_series.py index 3530a2268d9..36844772ada 100644 --- a/modin/tests/pandas/test_series.py +++ b/modin/tests/pandas/test_series.py @@ -1418,7 +1418,6 @@ def test_pyarrow_array_retrieve(): modin_series, pandas_series, lambda ser: pa.array(ser), - raising_exceptions=(Exception,), ) @@ -1439,7 +1438,6 @@ def comparator(df1, df2): lambda ser: ser + (modin_series if isinstance(ser, pd.Series) else pandas_series), comparator=comparator, - raising_exceptions=(Exception,), ) eval_general( @@ -1447,7 +1445,6 @@ def comparator(df1, df2): pandas_series, lambda ser: ser > (ser + 1), comparator=comparator, - raising_exceptions=(Exception,), ) eval_general( @@ -1455,7 +1452,6 @@ def comparator(df1, df2): pandas_series, lambda ser: ser.dropna(), comparator=comparator, - raising_exceptions=(Exception,), ) eval_general( @@ -1463,7 +1459,6 @@ def comparator(df1, df2): pandas_series, lambda ser: ser.isna(), comparator=comparator, - raising_exceptions=(Exception,), ) eval_general( @@ -1471,7 +1466,6 @@ def comparator(df1, df2): pandas_series, lambda ser: ser.fillna(0), comparator=comparator, - raising_exceptions=(Exception,), ) From e1dbc69f0572a0db2fc41d37bd574ab786326672 Mon Sep 17 00:00:00 2001 From: Anatoly Myachev Date: Fri, 19 Apr 2024 12:50:02 +0200 Subject: [PATCH 3/7] fix Signed-off-by: Anatoly Myachev --- modin/core/dataframe/algebra/binary.py | 2 +- modin/tests/pandas/test_series.py | 12 ------------ 2 files changed, 1 insertion(+), 13 deletions(-) diff --git a/modin/core/dataframe/algebra/binary.py b/modin/core/dataframe/algebra/binary.py index 75474bb3f08..a5fc89dd573 100644 --- a/modin/core/dataframe/algebra/binary.py +++ b/modin/core/dataframe/algebra/binary.py @@ -250,7 +250,7 @@ def try_compute_new_dtypes( try: if infer_dtypes == "bool" or is_bool_dtype(result_dtype): - # can be `pandas.api.types.pandas_dtype("bool[pyarrow]")` depending on the data + # FIXME: can be `pandas.api.types.pandas_dtype("bool[pyarrow]")` depending on the data dtypes = maybe_build_dtypes_series( first, second, dtype=pandas.api.types.pandas_dtype(bool) ) diff --git a/modin/tests/pandas/test_series.py b/modin/tests/pandas/test_series.py index 36844772ada..8b4ec960de2 100644 --- a/modin/tests/pandas/test_series.py +++ b/modin/tests/pandas/test_series.py @@ -1409,18 +1409,6 @@ def test_pyarrow_constructor(): _ = pd.DataFrame(data, dtype=decimal_type) -def test_pyarrow_array_retrieve(): - pa = pytest.importorskip("pyarrow") - modin_series, pandas_series = create_test_series( - [1, 2, None], dtype="uint8[pyarrow]" - ) - eval_general( - modin_series, - pandas_series, - lambda ser: pa.array(ser), - ) - - def test_pyarrow_functions(): pytest.importorskip("pyarrow") modin_series, pandas_series = create_test_series( From 7b925a50c5f85ff8df8deae41b76295b56946beb Mon Sep 17 00:00:00 2001 From: Anatoly Myachev Date: Fri, 19 Apr 2024 13:15:33 +0200 Subject: [PATCH 4/7] cleanup Signed-off-by: Anatoly Myachev --- .../pandas/dataframe/test_map_metadata.py | 9 ++++++ modin/tests/pandas/test_series.py | 32 +++++++------------ 2 files changed, 20 insertions(+), 21 deletions(-) diff --git a/modin/tests/pandas/dataframe/test_map_metadata.py b/modin/tests/pandas/dataframe/test_map_metadata.py index b6dc1686ff8..ab7a7fa4a31 100644 --- a/modin/tests/pandas/dataframe/test_map_metadata.py +++ b/modin/tests/pandas/dataframe/test_map_metadata.py @@ -11,6 +11,8 @@ # ANY KIND, either express or implied. See the License for the specific language # governing permissions and limitations under the License. +from decimal import Decimal + import matplotlib import numpy as np import pandas @@ -1797,6 +1799,13 @@ def test_constructor(data): df_equals(pandas_df, modin_df) +def test_pyarrow_constructor(): + pa = pytest.importorskip("pyarrow") + + data = [[Decimal("3.19"), None], [None, Decimal("-1.23")]] + df_equals(*create_test_dfs(data, dtype=pd.ArrowDtype(pa.decimal128(3, scale=2)))) + + @pytest.mark.parametrize( "data", [ diff --git a/modin/tests/pandas/test_series.py b/modin/tests/pandas/test_series.py index 0d17823bf61..e5ffad9a7ee 100644 --- a/modin/tests/pandas/test_series.py +++ b/modin/tests/pandas/test_series.py @@ -1390,23 +1390,12 @@ def test_constructor_arrow_extension_array(): def test_pyarrow_constructor(): pa = pytest.importorskip("pyarrow") data = list("abcd") - _ = pd.Series(data, dtype="string[pyarrow]") - _ = pd.Series(data, dtype=pd.ArrowDtype(pa.string())) + df_equals(*create_test_series(data, dtype="string[pyarrow]")) + df_equals(*create_test_series(data, dtype=pd.ArrowDtype(pa.string()))) + data = [["hello"], ["there"]] list_str_type = pa.list_(pa.string()) - _ = pd.Series([["hello"], ["there"]], dtype=pd.ArrowDtype(list_str_type)) - - from datetime import time - - _ = pd.Index([time(12, 30), None], dtype=pd.ArrowDtype(pa.time64("us"))) - - from decimal import Decimal - - decimal_type = pd.ArrowDtype(pa.decimal128(3, scale=2)) - - data = [[Decimal("3.19"), None], [None, Decimal("-1.23")]] - - _ = pd.DataFrame(data, dtype=decimal_type) + df_equals(*create_test_series(data, dtype=pd.ArrowDtype(list_str_type))) def test_pyarrow_functions(): @@ -1428,12 +1417,13 @@ def comparator(df1, df2): comparator=comparator, ) - eval_general( - modin_series, - pandas_series, - lambda ser: ser > (ser + 1), - comparator=comparator, - ) + # FIXME: https://github.com/modin-project/modin/issues/7203 + # eval_general( + # modin_series, + # pandas_series, + # lambda ser: ser > (ser + 1), + # comparator=comparator, + # ) eval_general( modin_series, From 23003c580487971ae5ec82d8b9a7bfb58825f76b Mon Sep 17 00:00:00 2001 From: Anatoly Myachev Date: Fri, 19 Apr 2024 13:18:01 +0200 Subject: [PATCH 5/7] fix comment Signed-off-by: Anatoly Myachev --- modin/core/dataframe/algebra/binary.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/modin/core/dataframe/algebra/binary.py b/modin/core/dataframe/algebra/binary.py index a5fc89dd573..6af31ab826c 100644 --- a/modin/core/dataframe/algebra/binary.py +++ b/modin/core/dataframe/algebra/binary.py @@ -250,7 +250,8 @@ def try_compute_new_dtypes( try: if infer_dtypes == "bool" or is_bool_dtype(result_dtype): - # FIXME: can be `pandas.api.types.pandas_dtype("bool[pyarrow]")` depending on the data + # FIXME: https://github.com/modin-project/modin/issues/7203 + # can be `pandas.api.types.pandas_dtype("bool[pyarrow]")` depending on the data dtypes = maybe_build_dtypes_series( first, second, dtype=pandas.api.types.pandas_dtype(bool) ) From cc2a5ab8a4c14a2bd736cc390b4dc15b1003e328 Mon Sep 17 00:00:00 2001 From: Anatoly Myachev Date: Fri, 19 Apr 2024 14:06:55 +0200 Subject: [PATCH 6/7] skip some cases for HDK Signed-off-by: Anatoly Myachev --- modin/tests/pandas/test_series.py | 30 +++++++++++++++++------------- 1 file changed, 17 insertions(+), 13 deletions(-) diff --git a/modin/tests/pandas/test_series.py b/modin/tests/pandas/test_series.py index e5ffad9a7ee..fe3e3bc2e33 100644 --- a/modin/tests/pandas/test_series.py +++ b/modin/tests/pandas/test_series.py @@ -1409,13 +1409,15 @@ def comparator(df1, df2): df_equals(df1, df2) df_equals(df1.dtypes, df2.dtypes) - eval_general( - modin_series, - pandas_series, - lambda ser: ser - + (modin_series if isinstance(ser, pd.Series) else pandas_series), - comparator=comparator, - ) + if StorageFormat.get() != "Hdk": + # FIXME: HDK should also work in this case + eval_general( + modin_series, + pandas_series, + lambda ser: ser + + (modin_series if isinstance(ser, pd.Series) else pandas_series), + comparator=comparator, + ) # FIXME: https://github.com/modin-project/modin/issues/7203 # eval_general( @@ -1439,12 +1441,14 @@ def comparator(df1, df2): comparator=comparator, ) - eval_general( - modin_series, - pandas_series, - lambda ser: ser.fillna(0), - comparator=comparator, - ) + if StorageFormat.get() != "Hdk": + # FIXME: HDK should also work in this case + eval_general( + modin_series, + pandas_series, + lambda ser: ser.fillna(0), + comparator=comparator, + ) def test_pyarrow_array_retrieve(): From c3cc95ad411cb64c26b2225c483c8cae3c83cbff Mon Sep 17 00:00:00 2001 From: Anatoly Myachev Date: Mon, 22 Apr 2024 18:54:46 +0200 Subject: [PATCH 7/7] Apply suggestions from code review Co-authored-by: Iaroslav Igoshev --- modin/tests/pandas/test_series.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/modin/tests/pandas/test_series.py b/modin/tests/pandas/test_series.py index fe3e3bc2e33..152dd9bb013 100644 --- a/modin/tests/pandas/test_series.py +++ b/modin/tests/pandas/test_series.py @@ -1387,7 +1387,7 @@ def test_constructor_arrow_extension_array(): df_equals(md_ser.dtypes, pd_ser.dtypes) -def test_pyarrow_constructor(): +def test_pyarrow_backed_constructor(): pa = pytest.importorskip("pyarrow") data = list("abcd") df_equals(*create_test_series(data, dtype="string[pyarrow]")) @@ -1398,7 +1398,7 @@ def test_pyarrow_constructor(): df_equals(*create_test_series(data, dtype=pd.ArrowDtype(list_str_type))) -def test_pyarrow_functions(): +def test_pyarrow_backed_functions(): pytest.importorskip("pyarrow") modin_series, pandas_series = create_test_series( [-1.545, 0.211, None], dtype="float32[pyarrow]"