[PYSPARK] Re-balance pyspark test
panbingkun committed Jun 28, 2024
Parent: 8cd095f, commit: 06e90fb
Showing 4 changed files with 43 additions and 24 deletions.

.github/workflows/build_and_test.yml (3 additions, 0 deletions)

@@ -385,6 +385,8 @@ jobs:
           pyspark-pandas-connect-part2
         - >-
           pyspark-pandas-connect-part3
+        - >-
+          pyspark-pandas-connect-part4
       exclude:
         # Always run if pyspark-pandas == 'true', even if infra-image is skipped (such as in non-master jobs)
         # In practice, the build will run in individual PRs, but not against individual commits
@@ -395,6 +397,7 @@
         - modules: ${{ fromJson(needs.precondition.outputs.required).pyspark-pandas != 'true' && 'pyspark-pandas-connect-part1' }}
         - modules: ${{ fromJson(needs.precondition.outputs.required).pyspark-pandas != 'true' && 'pyspark-pandas-connect-part2' }}
         - modules: ${{ fromJson(needs.precondition.outputs.required).pyspark-pandas != 'true' && 'pyspark-pandas-connect-part3' }}
+        - modules: ${{ fromJson(needs.precondition.outputs.required).pyspark-pandas != 'true' && 'pyspark-pandas-connect-part4' }}
     env:
       MODULES_TO_TEST: ${{ matrix.modules }}
       PYTHON_TO_TEST: 'python3.11'
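
The exclude entries above rely on how GitHub Actions evaluates expressions: `A && B` yields `B` when `A` is truthy and `false` otherwise, so each entry only names a module to exclude when `pyspark-pandas` is not `'true'`, and an exclude entry of `false` matches no matrix combination. A minimal Python sketch of the same short-circuit filtering follows; the flag value and module list are illustrative stand-ins, not anything read from the workflow:

# Illustrative stand-in for the workflow's matrix-exclude evaluation;
# the real logic lives in GitHub Actions expressions, not Python.
MODULES = [f"pyspark-pandas-connect-part{i}" for i in range(5)]

def exclude_entry(pyspark_pandas: str, module: str):
    # Mirrors `${{ ... != 'true' && '<module>' }}`: short-circuit `and`
    # yields the module name (a matching exclude) or False (no match).
    return pyspark_pandas != "true" and module

def effective_matrix(pyspark_pandas: str) -> list:
    excluded = {exclude_entry(pyspark_pandas, m) for m in MODULES}
    return [m for m in MODULES if m not in excluded]

print(effective_matrix("true"))   # all five parts stay in the matrix
print(effective_matrix("false"))  # every part drops out
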

.github/workflows/build_python_connect.yml (1 addition, 1 deletion)

@@ -93,7 +93,7 @@ jobs:
           # Several tests related to the catalog require running them sequentially, e.g., writing a table in a listener.
           ./python/run-tests --parallelism=1 --python-executables=python3 --modules pyspark-connect,pyspark-ml-connect
           # None of the tests in the Pandas API on Spark depend on each other, so run them in parallel
-          ./python/run-tests --parallelism=4 --python-executables=python3 --modules pyspark-pandas-connect-part0,pyspark-pandas-connect-part1,pyspark-pandas-connect-part2,pyspark-pandas-connect-part3
+          ./python/run-tests --parallelism=4 --python-executables=python3 --modules pyspark-pandas-connect-part0,pyspark-pandas-connect-part1,pyspark-pandas-connect-part2,pyspark-pandas-connect-part3,pyspark-pandas-connect-part4
           # Stop Spark Connect server.
           ./sbin/stop-connect-server.sh

dev/sparktestsupport/modules.py (32 additions, 16 deletions)

@@ -1317,17 +1317,6 @@ def __hash__(self):
"pyspark.pandas.tests.connect.diff_frames_ops.test_parity_series",
"pyspark.pandas.tests.connect.diff_frames_ops.test_parity_setitem_frame",
"pyspark.pandas.tests.connect.diff_frames_ops.test_parity_setitem_series",
"pyspark.pandas.tests.connect.groupby.test_parity_index",
"pyspark.pandas.tests.connect.groupby.test_parity_describe",
"pyspark.pandas.tests.connect.groupby.test_parity_head_tail",
"pyspark.pandas.tests.connect.groupby.test_parity_groupby",
"pyspark.pandas.tests.connect.groupby.test_parity_grouping",
"pyspark.pandas.tests.connect.groupby.test_parity_missing",
"pyspark.pandas.tests.connect.groupby.test_parity_nlargest_nsmallest",
"pyspark.pandas.tests.connect.groupby.test_parity_raises",
"pyspark.pandas.tests.connect.groupby.test_parity_rank",
"pyspark.pandas.tests.connect.groupby.test_parity_size",
"pyspark.pandas.tests.connect.groupby.test_parity_value_counts",
],
excluded_python_implementations=[
"PyPy" # Skip these tests under PyPy since they require numpy, pandas, and pyarrow and
@@ -1351,11 +1340,6 @@ def __hash__(self):
"pyspark.pandas.tests.connect.io.test_parity_dataframe_conversion",
"pyspark.pandas.tests.connect.io.test_parity_dataframe_spark_io",
"pyspark.pandas.tests.connect.io.test_parity_series_conversion",
"pyspark.pandas.tests.connect.groupby.test_parity_stat",
"pyspark.pandas.tests.connect.groupby.test_parity_stat_adv",
"pyspark.pandas.tests.connect.groupby.test_parity_stat_ddof",
"pyspark.pandas.tests.connect.groupby.test_parity_stat_func",
"pyspark.pandas.tests.connect.groupby.test_parity_stat_prod",
"pyspark.pandas.tests.connect.indexes.test_parity_append",
"pyspark.pandas.tests.connect.indexes.test_parity_intersection",
"pyspark.pandas.tests.connect.indexes.test_parity_monotonic",
@@ -1405,6 +1389,38 @@ def __hash__(self):
 )


+pyspark_pandas_connect_part4 = Module(
+    name="pyspark-pandas-connect-part4",
+    dependencies=[pyspark_connect, pyspark_pandas, pyspark_pandas_slow],
+    source_file_regexes=[
+        "python/pyspark/pandas",
+    ],
+    python_test_goals=[
+        # pandas-on-Spark unittests
+        "pyspark.pandas.tests.connect.groupby.test_parity_index",
+        "pyspark.pandas.tests.connect.groupby.test_parity_describe",
+        "pyspark.pandas.tests.connect.groupby.test_parity_head_tail",
+        "pyspark.pandas.tests.connect.groupby.test_parity_groupby",
+        "pyspark.pandas.tests.connect.groupby.test_parity_grouping",
+        "pyspark.pandas.tests.connect.groupby.test_parity_missing",
+        "pyspark.pandas.tests.connect.groupby.test_parity_nlargest_nsmallest",
+        "pyspark.pandas.tests.connect.groupby.test_parity_raises",
+        "pyspark.pandas.tests.connect.groupby.test_parity_rank",
+        "pyspark.pandas.tests.connect.groupby.test_parity_size",
+        "pyspark.pandas.tests.connect.groupby.test_parity_value_counts",
+        "pyspark.pandas.tests.connect.groupby.test_parity_stat",
+        "pyspark.pandas.tests.connect.groupby.test_parity_stat_adv",
+        "pyspark.pandas.tests.connect.groupby.test_parity_stat_ddof",
+        "pyspark.pandas.tests.connect.groupby.test_parity_stat_func",
+        "pyspark.pandas.tests.connect.groupby.test_parity_stat_prod",
+    ],
+    excluded_python_implementations=[
+        "PyPy"  # Skip these tests under PyPy since they require numpy, pandas, and pyarrow and
+        # they aren't available there
+    ],
+)
+
+
 pyspark_errors = Module(
     name="pyspark-errors",
     dependencies=[],
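
Since the commit only moves the groupby parity suites out of the earlier parts and into part4, a quick local check that the re-balance dropped or duplicated nothing is to pool python_test_goals across the five part modules. A hedged sketch, assuming the dev directory is on sys.path so the sparktestsupport package resolves, and that part0 through part3 follow the same naming pattern as pyspark_pandas_connect_part4:

# Hedged sanity check: re-balancing should only move test goals between
# parts, never drop or duplicate one. The import path and the part0-part3
# attribute names are assumptions based on the part4 definition above.
from sparktestsupport import modules

parts = [
    modules.pyspark_pandas_connect_part0,
    modules.pyspark_pandas_connect_part1,
    modules.pyspark_pandas_connect_part2,
    modules.pyspark_pandas_connect_part3,
    modules.pyspark_pandas_connect_part4,
]
all_goals = [goal for part in parts for goal in part.python_test_goals]
assert len(all_goals) == len(set(all_goals)), "a test goal is listed in two parts"
print(f"{len(all_goals)} test goals spread across {len(parts)} parts")
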

dev/sparktestsupport/utils.py (7 additions, 7 deletions)

@@ -111,26 +111,26 @@ def determine_modules_to_test(changed_modules, deduplicated=True):
     ['avro', 'connect', 'docker-integration-tests', 'examples', 'hive', 'hive-thriftserver',
     'mllib', 'protobuf', 'pyspark-connect', 'pyspark-ml', 'pyspark-ml-connect', 'pyspark-mllib',
     'pyspark-pandas', 'pyspark-pandas-connect-part0', 'pyspark-pandas-connect-part1',
-    'pyspark-pandas-connect-part2', 'pyspark-pandas-connect-part3', 'pyspark-pandas-slow',
-    'pyspark-sql', 'pyspark-testing', 'repl', 'sparkr', 'sql', 'sql-kafka-0-10']
+    'pyspark-pandas-connect-part2', 'pyspark-pandas-connect-part3', 'pyspark-pandas-connect-part4',
+    'pyspark-pandas-slow', 'pyspark-sql', 'pyspark-testing', 'repl', 'sparkr', 'sql', 'sql-kafka-0-10']
     >>> sorted([x.name for x in determine_modules_to_test(
     ... [modules.sparkr, modules.sql], deduplicated=False)])
     ... # doctest: +NORMALIZE_WHITESPACE
     ['avro', 'connect', 'docker-integration-tests', 'examples', 'hive', 'hive-thriftserver',
     'mllib', 'protobuf', 'pyspark-connect', 'pyspark-ml', 'pyspark-ml-connect', 'pyspark-mllib',
     'pyspark-pandas', 'pyspark-pandas-connect-part0', 'pyspark-pandas-connect-part1',
-    'pyspark-pandas-connect-part2', 'pyspark-pandas-connect-part3', 'pyspark-pandas-slow',
-    'pyspark-sql', 'pyspark-testing', 'repl', 'sparkr', 'sql', 'sql-kafka-0-10']
+    'pyspark-pandas-connect-part2', 'pyspark-pandas-connect-part3', 'pyspark-pandas-connect-part4',
+    'pyspark-pandas-slow', 'pyspark-sql', 'pyspark-testing', 'repl', 'sparkr', 'sql', 'sql-kafka-0-10']
     >>> sorted([x.name for x in determine_modules_to_test(
     ... [modules.sql, modules.core], deduplicated=False)])
     ... # doctest: +NORMALIZE_WHITESPACE
     ['avro', 'catalyst', 'connect', 'core', 'docker-integration-tests', 'examples', 'graphx',
     'hive', 'hive-thriftserver', 'mllib', 'mllib-local', 'protobuf', 'pyspark-connect',
     'pyspark-core', 'pyspark-ml', 'pyspark-ml-connect', 'pyspark-mllib', 'pyspark-pandas',
     'pyspark-pandas-connect-part0', 'pyspark-pandas-connect-part1', 'pyspark-pandas-connect-part2',
-    'pyspark-pandas-connect-part3', 'pyspark-pandas-slow', 'pyspark-resource', 'pyspark-sql',
-    'pyspark-streaming', 'pyspark-testing', 'repl', 'root', 'sparkr', 'sql', 'sql-kafka-0-10',
-    'streaming', 'streaming-kafka-0-10', 'streaming-kinesis-asl']
+    'pyspark-pandas-connect-part3', 'pyspark-pandas-connect-part4', 'pyspark-pandas-slow',
+    'pyspark-resource', 'pyspark-sql', 'pyspark-streaming', 'pyspark-testing', 'repl', 'root',
+    'sparkr', 'sql', 'sql-kafka-0-10', 'streaming', 'streaming-kafka-0-10', 'streaming-kinesis-asl']
"""
modules_to_test = set()
for module in changed_modules:
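
The updated doctests double as usage examples. To spot-check locally that the new part is selected whenever python/pyspark/pandas changes, something along these lines should work; the import paths again assume the dev directory is on sys.path:

# Hedged spot check that changing the pandas-on-Spark sources pulls in the
# new module; pyspark_pandas is the Module covering python/pyspark/pandas.
from sparktestsupport import modules
from sparktestsupport.utils import determine_modules_to_test

selected = sorted(m.name for m in determine_modules_to_test([modules.pyspark_pandas]))
assert "pyspark-pandas-connect-part4" in selected
print(selected)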
