Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

TEST-#7049: Add some sanity tests with pyarrow-backed pandas dataframes #7199

Merged
merged 8 commits into from
Apr 22, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions modin/core/dataframe/algebra/binary.py
Original file line number Diff line number Diff line change
Expand Up @@ -250,6 +250,8 @@ def try_compute_new_dtypes(

try:
if infer_dtypes == "bool" or is_bool_dtype(result_dtype):
# FIXME: https://github.com/modin-project/modin/issues/7203
# can be `pandas.api.types.pandas_dtype("bool[pyarrow]")` depending on the data
dtypes = maybe_build_dtypes_series(
first, second, dtype=pandas.api.types.pandas_dtype(bool)
)
Expand Down
9 changes: 9 additions & 0 deletions modin/tests/pandas/dataframe/test_map_metadata.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,8 @@
# ANY KIND, either express or implied. See the License for the specific language
# governing permissions and limitations under the License.

from decimal import Decimal

import matplotlib
import numpy as np
import pandas
Expand Down Expand Up @@ -1797,6 +1799,13 @@ def test_constructor(data):
df_equals(pandas_df, modin_df)


def test_pyarrow_constructor():
    """Check DataFrame construction with a pyarrow ``decimal128`` dtype.

    The test is skipped when pyarrow cannot be imported. The objects returned
    by ``create_test_dfs`` (presumably the Modin and pandas frames built from
    the same input -- confirm against the helper) are compared directly with
    ``df_equals``.
    """
    pa = pytest.importorskip("pyarrow")

    # Two rows mixing Decimal values with missing entries.
    rows = [[Decimal("3.19"), None], [None, Decimal("-1.23")]]
    decimal_dtype = pd.ArrowDtype(pa.decimal128(3, scale=2))
    frames = create_test_dfs(rows, dtype=decimal_dtype)
    df_equals(*frames)
Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

`df_equals` is used here deliberately: `eval_general` offers no benefit in this case, because it is the results of the constructors themselves that are being compared.



@pytest.mark.parametrize(
"data",
[
Expand Down
64 changes: 64 additions & 0 deletions modin/tests/pandas/test_series.py
Original file line number Diff line number Diff line change
Expand Up @@ -1387,6 +1387,70 @@ def test_constructor_arrow_extension_array():
df_equals(md_ser.dtypes, pd_ser.dtypes)


def test_pyarrow_backed_constructor():
    """Check Series construction with pyarrow-backed dtypes.

    The test is skipped when pyarrow cannot be imported. Three dtypes are
    exercised in turn: the ``"string[pyarrow]"`` alias, an ``ArrowDtype``
    wrapping ``pa.string()``, and an ``ArrowDtype`` wrapping a list of
    strings. Each time, the objects returned by ``create_test_series`` are
    compared with ``df_equals``.
    """
    pa = pytest.importorskip("pyarrow")

    # Flat string data, first via the dtype alias, then via an explicit ArrowDtype.
    letters = list("abcd")
    alias_pair = create_test_series(letters, dtype="string[pyarrow]")
    df_equals(*alias_pair)
    arrow_string_dtype = pd.ArrowDtype(pa.string())
    arrow_pair = create_test_series(letters, dtype=arrow_string_dtype)
    df_equals(*arrow_pair)

    # Nested data: every element is itself a list of strings.
    nested = [["hello"], ["there"]]
    list_str_type = pa.list_(pa.string())
    nested_dtype = pd.ArrowDtype(list_str_type)
    nested_pair = create_test_series(nested, dtype=nested_dtype)
    df_equals(*nested_pair)


def test_pyarrow_backed_functions():
pytest.importorskip("pyarrow")
modin_series, pandas_series = create_test_series(
[-1.545, 0.211, None], dtype="float32[pyarrow]"
)
df_equals(modin_series.mean(), pandas_series.mean())
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

eval_general?

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

mean() returns a floating point number, so I don't see the need for it


def comparator(df1, df2):
df_equals(df1, df2)
df_equals(df1.dtypes, df2.dtypes)

if StorageFormat.get() != "Hdk":
# FIXME: HDK should also work in this case
eval_general(
modin_series,
pandas_series,
lambda ser: ser
+ (modin_series if isinstance(ser, pd.Series) else pandas_series),
comparator=comparator,
)

# FIXME: https://github.com/modin-project/modin/issues/7203
# eval_general(
# modin_series,
# pandas_series,
# lambda ser: ser > (ser + 1),
# comparator=comparator,
# )

eval_general(
modin_series,
pandas_series,
lambda ser: ser.dropna(),
comparator=comparator,
)

eval_general(
modin_series,
pandas_series,
lambda ser: ser.isna(),
comparator=comparator,
)

if StorageFormat.get() != "Hdk":
# FIXME: HDK should also work in this case
eval_general(
modin_series,
pandas_series,
lambda ser: ser.fillna(0),
comparator=comparator,
)


def test_pyarrow_array_retrieve():
pa = pytest.importorskip("pyarrow")
modin_series, pandas_series = create_test_series(
Expand Down
Loading