Skip to content

Commit

Permalink
[SPARK-44961][PYTHON][CONNECT][TESTS] Make PySpark (pyspark-connect module) tests passing without any dependency
Browse files Browse the repository at this point in the history

### What changes were proposed in this pull request?

This PR proposes to fix the tests to properly run or skip when optional dependencies are not installed.

### Why are the changes needed?

Currently, it fails as below:

```
./python/run-tests --python-executables=python3 --modules=pyspark-connect
...
2c5289024a/python3__pyspark.sql.connect.window__nvbbzy7q.log)
Finished test(python3): pyspark.sql.connect.session (0s)
Traceback (most recent call last):
  File "/Users/hyukjin.kwon/workspace/forked/spark/python/pyspark/sql/pandas/utils.py", line 27, in require_minimum_pandas_version
    import pandas
ModuleNotFoundError: No module named 'pandas'
```

PySpark tests should pass without optional dependencies.

### Does this PR introduce _any_ user-facing change?

No, test-only.

### How was this patch tested?

Manually ran as described above.

### Was this patch authored or co-authored using generative AI tooling?

No.

Closes apache#42676 from HyukjinKwon/spark-connect-test.

Authored-by: Hyukjin Kwon <gurwls223@apache.org>
Signed-off-by: Dongjoon Hyun <dhyun@apple.com>
  • Loading branch information
HyukjinKwon authored and dongjoon-hyun committed Aug 25, 2023
1 parent 632dd33 commit 12f3c81
Show file tree
Hide file tree
Showing 9 changed files with 26 additions and 24 deletions.
4 changes: 4 additions & 0 deletions python/pyspark/sql/connect/conf.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,10 @@
# See the License for the specific language governing permissions and
# limitations under the License.
#
from pyspark.sql.connect.utils import check_dependencies

check_dependencies(__name__)

from typing import Any, Optional, Union, cast
import warnings

Expand Down
3 changes: 2 additions & 1 deletion python/pyspark/sql/connect/session.py
Original file line number Diff line number Diff line change
Expand Up @@ -64,7 +64,8 @@
CachedRemoteRelation,
)
from pyspark.sql.connect.readwriter import DataFrameReader
from pyspark.sql.connect.streaming import DataStreamReader, StreamingQueryManager
from pyspark.sql.connect.streaming.readwriter import DataStreamReader
from pyspark.sql.connect.streaming.query import StreamingQueryManager
from pyspark.sql.pandas.serializers import ArrowStreamPandasSerializer
from pyspark.sql.pandas.types import to_arrow_schema, to_arrow_type, _deduplicate_field_names
from pyspark.sql.session import classproperty, SparkSession as PySparkSession
Expand Down
6 changes: 0 additions & 6 deletions python/pyspark/sql/connect/streaming/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,9 +14,3 @@
# See the License for the specific language governing permissions and
# limitations under the License.
#

from pyspark.sql.connect.streaming.query import StreamingQuery # noqa: F401
from pyspark.sql.connect.streaming.readwriter import DataStreamReader # noqa: F401
from pyspark.sql.connect.streaming.readwriter import DataStreamWriter # noqa: F401
from pyspark.sql.connect.streaming.query import StreamingQueryManager # noqa: F401
from pyspark.errors import StreamingQueryException # noqa: F401
5 changes: 3 additions & 2 deletions python/pyspark/sql/connect/streaming/query.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,9 @@
# See the License for the specific language governing permissions and
# limitations under the License.
#
from pyspark.sql.connect.utils import check_dependencies

check_dependencies(__name__)

import json
import sys
Expand All @@ -35,8 +38,6 @@
)
from pyspark.errors import PySparkPicklingError

__all__ = ["StreamingQuery", "StreamingQueryManager"]

if TYPE_CHECKING:
from pyspark.sql.connect.session import SparkSession

Expand Down
3 changes: 0 additions & 3 deletions python/pyspark/sql/connect/streaming/readwriter.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,6 @@
# See the License for the specific language governing permissions and
# limitations under the License.
#

from pyspark.sql.connect.utils import check_dependencies

check_dependencies(__name__)
Expand Down Expand Up @@ -42,8 +41,6 @@
from pyspark.sql.connect.dataframe import DataFrame
from pyspark.sql._typing import SupportsProcess

__all__ = ["DataStreamReader", "DataStreamWriter"]


class DataStreamReader(OptionUtils):
def __init__(self, client: "SparkSession") -> None:
Expand Down
9 changes: 4 additions & 5 deletions python/pyspark/sql/tests/connect/client/test_client.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,16 +19,15 @@
import uuid
from typing import Optional

from pyspark.sql.connect.client import SparkConnectClient, ChannelBuilder
import pyspark.sql.connect.proto as proto
from pyspark.testing.connectutils import should_test_connect, connect_requirement_message

from pyspark.sql.connect.client.core import Retrying
from pyspark.sql.connect.client.reattach import RetryException

if should_test_connect:
import pandas as pd
import pyarrow as pa
from pyspark.sql.connect.client import SparkConnectClient, ChannelBuilder
from pyspark.sql.connect.client.core import Retrying
from pyspark.sql.connect.client.reattach import RetryException
import pyspark.sql.connect.proto as proto


@unittest.skipIf(not should_test_connect, connect_requirement_message)
Expand Down
8 changes: 5 additions & 3 deletions python/pyspark/sql/tests/connect/test_parity_functions.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,10 +17,12 @@

import unittest

from pyspark.errors.exceptions.connect import SparkConnectException
from pyspark.sql.connect.column import Column
from pyspark.sql.tests.test_functions import FunctionsTestsMixin
from pyspark.testing.connectutils import ReusedConnectTestCase
from pyspark.testing.connectutils import should_test_connect, ReusedConnectTestCase

if should_test_connect:
from pyspark.errors.exceptions.connect import SparkConnectException
from pyspark.sql.connect.column import Column


class FunctionsParityTests(FunctionsTestsMixin, ReusedConnectTestCase):
Expand Down
6 changes: 4 additions & 2 deletions python/pyspark/sql/tests/connect/test_parity_pandas_udf.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,10 +14,12 @@
# See the License for the specific language governing permissions and
# limitations under the License.
#
from pyspark.sql.connect.types import UnparsedDataType
from pyspark.sql.functions import pandas_udf, PandasUDFType
from pyspark.sql.tests.pandas.test_pandas_udf import PandasUDFTestsMixin
from pyspark.testing.connectutils import ReusedConnectTestCase
from pyspark.testing.connectutils import should_test_connect, ReusedConnectTestCase

if should_test_connect:
from pyspark.sql.connect.types import UnparsedDataType


class PandasUDFParityTests(PandasUDFTestsMixin, ReusedConnectTestCase):
Expand Down
6 changes: 4 additions & 2 deletions python/pyspark/sql/tests/connect/test_parity_readwriter.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,9 +16,11 @@
#
import unittest

from pyspark.sql.connect.readwriter import DataFrameWriterV2
from pyspark.sql.tests.test_readwriter import ReadwriterTestsMixin, ReadwriterV2TestsMixin
from pyspark.testing.connectutils import ReusedConnectTestCase
from pyspark.testing.connectutils import should_test_connect, ReusedConnectTestCase

if should_test_connect:
from pyspark.sql.connect.readwriter import DataFrameWriterV2


class ReadwriterParityTests(ReadwriterTestsMixin, ReusedConnectTestCase):
Expand Down

0 comments on commit 12f3c81

Please sign in to comment.