Skip to content

Commit

Permalink
TEST-modin-project#1961: speed up TestDataFrameReduction_A test
Browse files Browse the repository at this point in the history
Signed-off-by: Anatoly Myachev <anatoly.myachev@intel.com>
  • Loading branch information
anmyachev committed Aug 27, 2020
1 parent 81cf80c commit 833c1c2
Show file tree
Hide file tree
Showing 3 changed files with 49 additions and 174 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@ jobs:
with:
python-version: "3.7.x"
architecture: "x64"
- run: pip install black
- run: pip install "black==19.10b0"
- run: black --check --diff modin/

lint-flake8:
Expand Down
208 changes: 38 additions & 170 deletions modin/pandas/test/test_dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -62,6 +62,7 @@
udf_func_values,
udf_func_keys,
generate_multiindex,
test_bool_data,
)

pd.DEFAULT_NPARTITIONS = 4
Expand Down Expand Up @@ -3005,190 +3006,57 @@ def test_hasattr_sparse(self, data):

class TestDataFrameReduction_A:
@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys)
@pytest.mark.parametrize("axis", axis_values, ids=axis_keys)
@pytest.mark.parametrize(
"skipna", bool_arg_values, ids=arg_keys("skipna", bool_arg_keys)
)
@pytest.mark.parametrize(
"bool_only", bool_arg_values, ids=arg_keys("bool_only", bool_arg_keys)
)
def test_all(self, data, axis, skipna, bool_only):
modin_df = pd.DataFrame(data)
pandas_df = pandas.DataFrame(data)

try:
pandas_result = pandas_df.all(axis=axis, skipna=skipna, bool_only=bool_only)
except Exception as e:
with pytest.raises(type(e)):
modin_df.all(axis=axis, skipna=skipna, bool_only=bool_only)
else:
modin_result = modin_df.all(axis=axis, skipna=skipna, bool_only=bool_only)
df_equals(modin_result, pandas_result)

# Test when axis is None. This will get repeated but easier than using list in parameterize decorator
try:
pandas_result = pandas_df.all(axis=None, skipna=skipna, bool_only=bool_only)
except Exception as e:
with pytest.raises(type(e)):
modin_df.all(axis=None, skipna=skipna, bool_only=bool_only)
else:
modin_result = modin_df.all(axis=None, skipna=skipna, bool_only=bool_only)
df_equals(modin_result, pandas_result)

try:
pandas_result = pandas_df.T.all(
axis=axis, skipna=skipna, bool_only=bool_only
)
except Exception as e:
with pytest.raises(type(e)):
modin_df.T.all(axis=axis, skipna=skipna, bool_only=bool_only)
else:
modin_result = modin_df.T.all(axis=axis, skipna=skipna, bool_only=bool_only)
df_equals(modin_result, pandas_result)

# Test when axis is None. This will get repeated but easier than using list in parameterize decorator
try:
pandas_result = pandas_df.T.all(
axis=None, skipna=skipna, bool_only=bool_only
)
except Exception as e:
with pytest.raises(type(e)):
modin_df.T.all(axis=None, skipna=skipna, bool_only=bool_only)
else:
modin_result = modin_df.T.all(axis=None, skipna=skipna, bool_only=bool_only)
df_equals(modin_result, pandas_result)

# test level
modin_df_multi_level = modin_df.copy()
pandas_df_multi_level = pandas_df.copy()
axis = modin_df._get_axis_number(axis) if axis is not None else 0
levels = 3
axis_names_list = [["a", "b", "c"], None]
for axis_names in axis_names_list:
if axis == 0:
new_idx = pandas.MultiIndex.from_tuples(
[(i // 4, i // 2, i) for i in range(len(modin_df.index))],
names=axis_names,
)
modin_df_multi_level.index = new_idx
pandas_df_multi_level.index = new_idx
else:
new_col = pandas.MultiIndex.from_tuples(
[(i // 4, i // 2, i) for i in range(len(modin_df.columns))],
names=axis_names,
)
modin_df_multi_level.columns = new_col
pandas_df_multi_level.columns = new_col

for level in list(range(levels)) + (axis_names if axis_names else []):
try:
pandas_multi_level_result = pandas_df_multi_level.all(
axis=axis, bool_only=bool_only, level=level, skipna=skipna
)

except Exception as e:
with pytest.raises(type(e)):
modin_df_multi_level.all(
axis=axis, bool_only=bool_only, level=level, skipna=skipna
)
else:
modin_multi_level_result = modin_df_multi_level.all(
axis=axis, bool_only=bool_only, level=level, skipna=skipna
)
@pytest.mark.parametrize("axis", [None, 0, 1])
@pytest.mark.parametrize("method", ["all", "any"])
def test_all_any_default(self, data, axis, method):
modin_df, pandas_df = pd.DataFrame(data), pandas.DataFrame(data)

df_equals(modin_multi_level_result, pandas_multi_level_result)
eval_general(
modin_df,
pandas_df,
lambda df: getattr(df, method)(axis=axis, skipna=True, bool_only=None),
)

@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys)
@pytest.mark.parametrize("axis", axis_values, ids=axis_keys)
@pytest.mark.parametrize(
"skipna", bool_arg_values, ids=arg_keys("skipna", bool_arg_keys)
)
@pytest.mark.parametrize(
"bool_only", bool_arg_values, ids=arg_keys("bool_only", bool_arg_keys)
)
def test_any(self, data, axis, skipna, bool_only):
modin_df = pd.DataFrame(data)
pandas_df = pandas.DataFrame(data)
@pytest.mark.parametrize("method", ["all", "any"])
@pytest.mark.parametrize("transpose", [False, True])
@pytest.mark.parametrize("data", [test_data["dense_nan_data"], test_bool_data])
def test_all_any(self, data, skipna, bool_only, method, transpose):
modin_df, pandas_df = pd.DataFrame(data), pandas.DataFrame(data)

try:
pandas_result = pandas_df.any(axis=axis, skipna=skipna, bool_only=bool_only)
except Exception as e:
with pytest.raises(type(e)):
modin_df.any(axis=axis, skipna=skipna, bool_only=bool_only)
else:
modin_result = modin_df.any(axis=axis, skipna=skipna, bool_only=bool_only)
df_equals(modin_result, pandas_result)
if transpose:
modin_df, pandas_df = modin_df.T, pandas_df.T

try:
pandas_result = pandas_df.any(axis=None, skipna=skipna, bool_only=bool_only)
except Exception as e:
with pytest.raises(type(e)):
modin_df.any(axis=None, skipna=skipna, bool_only=bool_only)
else:
modin_result = modin_df.any(axis=None, skipna=skipna, bool_only=bool_only)
df_equals(modin_result, pandas_result)
eval_general(
modin_df,
pandas_df,
lambda df: getattr(df, method)(skipna=skipna, bool_only=bool_only),
)

try:
pandas_result = pandas_df.T.any(
axis=axis, skipna=skipna, bool_only=bool_only
)
except Exception as e:
with pytest.raises(type(e)):
modin_df.T.any(axis=axis, skipna=skipna, bool_only=bool_only)
else:
modin_result = modin_df.T.any(axis=axis, skipna=skipna, bool_only=bool_only)
df_equals(modin_result, pandas_result)
@pytest.mark.parametrize("axis", [0, 1])
@pytest.mark.parametrize("level", [-1, 0, 1])
@pytest.mark.parametrize("method", ["all", "any"])
def test_all_level(self, axis, level, method):
data = test_data_values[0]
modin_df, pandas_df = pd.DataFrame(data), pandas.DataFrame(data)

try:
pandas_result = pandas_df.T.any(
axis=None, skipna=skipna, bool_only=bool_only
)
except Exception as e:
with pytest.raises(type(e)):
modin_df.T.any(axis=None, skipna=skipna, bool_only=bool_only)
if axis == 0:
new_idx = generate_multiindex(len(modin_df.index))
modin_df.index = new_idx
pandas_df.index = new_idx
else:
modin_result = modin_df.T.any(axis=None, skipna=skipna, bool_only=bool_only)
df_equals(modin_result, pandas_result)
new_col = generate_multiindex(len(modin_df.columns))
modin_df.columns = new_col
pandas_df.columns = new_col

# test level
modin_df_multi_level = modin_df.copy()
pandas_df_multi_level = pandas_df.copy()
axis = modin_df._get_axis_number(axis) if axis is not None else 0
levels = 3
axis_names_list = [["a", "b", "c"], None]
for axis_names in axis_names_list:
if axis == 0:
new_idx = pandas.MultiIndex.from_tuples(
[(i // 4, i // 2, i) for i in range(len(modin_df.index))],
names=axis_names,
)
modin_df_multi_level.index = new_idx
pandas_df_multi_level.index = new_idx
else:
new_col = pandas.MultiIndex.from_tuples(
[(i // 4, i // 2, i) for i in range(len(modin_df.columns))],
names=axis_names,
)
modin_df_multi_level.columns = new_col
pandas_df_multi_level.columns = new_col

for level in list(range(levels)) + (axis_names if axis_names else []):
try:
pandas_multi_level_result = pandas_df_multi_level.any(
axis=axis, bool_only=bool_only, level=level, skipna=skipna
)

except Exception as e:
with pytest.raises(type(e)):
modin_df_multi_level.any(
axis=axis, bool_only=bool_only, level=level, skipna=skipna
)
else:
modin_multi_level_result = modin_df_multi_level.any(
axis=axis, bool_only=bool_only, level=level, skipna=skipna
)

df_equals(modin_multi_level_result, pandas_multi_level_result)
eval_general(
modin_df, pandas_df, lambda df: getattr(df, method)(axis=axis, level=level),
)


class TestDataFrameReduction_B:
Expand Down
13 changes: 10 additions & 3 deletions modin/pandas/test/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -127,6 +127,13 @@
test_data_values = list(test_data.values())
test_data_keys = list(test_data.keys())

# Boolean-only test dataset: one entry per i in range(NCOLS), each holding
# NROWS values drawn by random_state.choice from [True, False].
# The key expression turns i into a column number in 1..NCOLS, offset by
# NCOLS / 2; it is the same naming expression the "no_duplicates" entry of
# test_data_with_duplicates uses.
# NOTE(review): the draws happen at import time from the module-level
# random_state, which presumably is a shared stateful generator -- if so,
# this definition changes the values later random_state calls produce;
# confirm no test relies on those exact values.
test_bool_data = {
"col{}".format(int((i - NCOLS / 2) % NCOLS + 1)): random_state.choice(
[True, False], size=(NROWS)
)
for i in range(NCOLS)
}

test_data_with_duplicates = {
"no_duplicates": {
"col{}".format(int((i - NCOLS / 2) % NCOLS + 1)): range(NROWS)
Expand Down Expand Up @@ -681,10 +688,10 @@ def generate_multiindex(index):
return df1, df2


def generate_multiindex(cols_number):
def generate_multiindex(elements_number):
arrays = [
random_state.choice(["bar", "baz", "foo", "qux"], cols_number),
random_state.choice(["one", "two"], cols_number),
random_state.choice(["bar", "baz", "foo", "qux"], elements_number),
random_state.choice(["one", "two"], elements_number),
]
return pd.MultiIndex.from_tuples(list(zip(*arrays)), names=["first", "second"])

Expand Down

0 comments on commit 833c1c2

Please sign in to comment.