[SPARK-44051][TESTS][PS][CONNECT] Split `pyspark.pandas.tests.connect.data_type_ops.test_parity_num_ops`

### What changes were proposed in this pull request?
Split `pyspark.pandas.tests.connect.data_type_ops.test_parity_num_ops`.

### Why are the changes needed?
It is slow:
```
Starting test(python3.9): pyspark.pandas.tests.connect.data_type_ops.test_parity_num_ops (temp output: /__w/spark/spark/python/target/fd59d461-ba78-4672-a164-5d0790c57fb0/python3.9__pyspark.pandas.tests.connect.data_type_ops.test_parity_num_ops__gsu_twd5.log)
Finished test(python3.9): pyspark.pandas.tests.connect.data_type_ops.test_parity_num_ops (333s) ... 1 tests were skipped
```

### Does this PR introduce _any_ user-facing change?
No, test-only.

### How was this patch tested?
Updated CI.

Closes apache#41591 from zhengruifeng/ps_test_split_num_ops.

Authored-by: Ruifeng Zheng <ruifengz@apache.org>
Signed-off-by: Ruifeng Zheng <ruifengz@apache.org>
czxm · Jun 19, 2023 · a10bf33 · a10bf33
1 parent 8ed8136
commit a10bf33
Show file tree

Hide file tree

Showing 6 changed files with 414 additions and 209 deletions.
diff --git a/dev/sparktestsupport/modules.py b/dev/sparktestsupport/modules.py
@@ -668,6 +668,8 @@ def __hash__(self):
         "pyspark.pandas.tests.data_type_ops.test_datetime_ops",
         "pyspark.pandas.tests.data_type_ops.test_null_ops",
         "pyspark.pandas.tests.data_type_ops.test_num_ops",
+        "pyspark.pandas.tests.data_type_ops.test_num_arithmetic",
+        "pyspark.pandas.tests.data_type_ops.test_num_reverse",
         "pyspark.pandas.tests.data_type_ops.test_string_ops",
         "pyspark.pandas.tests.data_type_ops.test_udt_ops",
         "pyspark.pandas.tests.data_type_ops.test_timedelta_ops",
@@ -885,6 +887,8 @@ def __hash__(self):
         "pyspark.pandas.tests.connect.data_type_ops.test_parity_datetime_ops",
         "pyspark.pandas.tests.connect.data_type_ops.test_parity_null_ops",
         "pyspark.pandas.tests.connect.data_type_ops.test_parity_num_ops",
+        "pyspark.pandas.tests.connect.data_type_ops.test_parity_num_arithmetic",
+        "pyspark.pandas.tests.connect.data_type_ops.test_parity_num_reverse",
         "pyspark.pandas.tests.connect.data_type_ops.test_parity_string_ops",
         "pyspark.pandas.tests.connect.data_type_ops.test_parity_udt_ops",
         "pyspark.pandas.tests.connect.data_type_ops.test_parity_timedelta_ops",

diff --git a/python/pyspark/pandas/tests/connect/data_type_ops/test_parity_num_arithmetic.py b/python/pyspark/pandas/tests/connect/data_type_ops/test_parity_num_arithmetic.py
@@ -0,0 +1,43 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+import unittest
+
+from pyspark import pandas as ps
+from pyspark.pandas.tests.data_type_ops.test_num_arithmetic import ArithmeticTestsMixin
+from pyspark.pandas.tests.connect.data_type_ops.testing_utils import OpsTestBase
+from pyspark.testing.pandasutils import PandasOnSparkTestUtils
+from pyspark.testing.connectutils import ReusedConnectTestCase
+
+
+class ArithmeticParityTests(
+    ArithmeticTestsMixin, PandasOnSparkTestUtils, OpsTestBase, ReusedConnectTestCase
+):
+    """Run the tests of ``ArithmeticTestsMixin`` on top of ``ReusedConnectTestCase``."""
+
+    @property
+    def psdf(self):
+        # Not cached: a new pandas-on-Spark frame is built from ``self.pdf`` on every access.
+        return ps.from_pandas(self.pdf)
+
+
+# Script entry point: run this module's tests when the file is executed directly.
+if __name__ == "__main__":
+    # Star-import this module's own names through its fully qualified package path.
+    from pyspark.pandas.tests.connect.data_type_ops.test_parity_num_arithmetic import *  # noqa
+
+    try:
+        # xmlrunner is optional; when present, XML reports go to target/test-reports.
+        import xmlrunner  # type: ignore[import]
+
+        testRunner = xmlrunner.XMLTestRunner(output="target/test-reports", verbosity=2)
+    except ImportError:
+        # No xmlrunner installed: passing None lets unittest.main pick its default runner.
+        testRunner = None
+    unittest.main(testRunner=testRunner, verbosity=2)
diff --git a/python/pyspark/pandas/tests/connect/data_type_ops/test_parity_num_reverse.py b/python/pyspark/pandas/tests/connect/data_type_ops/test_parity_num_reverse.py
@@ -0,0 +1,43 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+import unittest
+
+from pyspark import pandas as ps
+from pyspark.pandas.tests.data_type_ops.test_num_reverse import ReverseTestsMixin
+from pyspark.pandas.tests.connect.data_type_ops.testing_utils import OpsTestBase
+from pyspark.testing.pandasutils import PandasOnSparkTestUtils
+from pyspark.testing.connectutils import ReusedConnectTestCase
+
+
+class ReverseParityTests(
+    ReverseTestsMixin, PandasOnSparkTestUtils, OpsTestBase, ReusedConnectTestCase
+):
+    """Run the tests of ``ReverseTestsMixin`` on top of ``ReusedConnectTestCase``."""
+
+    @property
+    def psdf(self):
+        # Not cached: a new pandas-on-Spark frame is built from ``self.pdf`` on every access.
+        return ps.from_pandas(self.pdf)
+
+
+# Script entry point: run this module's tests when the file is executed directly.
+if __name__ == "__main__":
+    # Star-import this module's own names through its fully qualified package path.
+    from pyspark.pandas.tests.connect.data_type_ops.test_parity_num_reverse import *  # noqa: F401
+
+    try:
+        # xmlrunner is optional; when present, XML reports go to target/test-reports.
+        import xmlrunner  # type: ignore[import]
+
+        testRunner = xmlrunner.XMLTestRunner(output="target/test-reports", verbosity=2)
+    except ImportError:
+        # No xmlrunner installed: passing None lets unittest.main pick its default runner.
+        testRunner = None
+    unittest.main(testRunner=testRunner, verbosity=2)
diff --git a/python/pyspark/pandas/tests/data_type_ops/test_num_arithmetic.py b/python/pyspark/pandas/tests/data_type_ops/test_num_arithmetic.py
@@ -0,0 +1,184 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+import datetime
+import unittest
+
+import pandas as pd
+import numpy as np
+
+from pyspark import pandas as ps
+from pyspark.pandas.tests.data_type_ops.testing_utils import OpsTestBase
+
+
+class ArithmeticTestsMixin:
+    """Unit tests for arithmetic operations of numeric data types.
+
+    A few test cases are disabled because pandas-on-Spark returns float64 whereas pandas
+    returns float32.
+    The underlying reason is the respective Spark operations return DoubleType always.
+
+    This mixin defines no fixtures of its own: ``pdf``, ``psdf``, ``numeric_df_cols``,
+    ``non_numeric_df_cols``, ``assert_eq`` and ``assertRaises`` are read from ``self`` and
+    must be supplied by the classes it is combined with.
+    """
+
+    @property
+    def float_pser(self):
+        """Return a pandas Series ``[1, 2, 3]`` with float dtype."""
+        return pd.Series([1, 2, 3], dtype=float)
+
+    @property
+    def float_psser(self):
+        """Return the pandas-on-Spark Series built from ``float_pser``."""
+        return ps.from_pandas(self.float_pser)
+
+    # ``+``: every numeric column is added to itself, to 1, to its bool cast and to
+    # True/False, and the pandas-on-Spark result is compared with the pandas result.
+    def test_add(self):
+        pdf, psdf = self.pdf, self.psdf
+        for col in self.numeric_df_cols:
+            pser, psser = pdf[col], psdf[col]
+            self.assert_eq(pser + pser, psser + psser)
+            self.assert_eq(pser + 1, psser + 1)
+            # self.assert_eq(pser + 0.1, psser + 0.1)
+            self.assert_eq(pser + pser.astype(bool), psser + psser.astype(bool))
+            self.assert_eq(pser + True, psser + True)
+            self.assert_eq(pser + False, psser + False)
+
+            # Among the non-numeric columns only "bool" is compared against pandas;
+            # every other one is expected to raise TypeError on the pandas-on-Spark side.
+            for n_col in self.non_numeric_df_cols:
+                if n_col == "bool":
+                    self.assert_eq(pser + pdf[n_col], psser + psdf[n_col])
+                else:
+                    self.assertRaises(TypeError, lambda: psser + psdf[n_col])
+
+    # ``-``: same structure as test_add, using subtraction.
+    def test_sub(self):
+        pdf, psdf = self.pdf, self.psdf
+        for col in self.numeric_df_cols:
+            pser, psser = pdf[col], psdf[col]
+            self.assert_eq(pser - pser, psser - psser)
+            self.assert_eq(pser - 1, psser - 1)
+            # self.assert_eq(pser - 0.1, psser - 0.1)
+            self.assert_eq(pser - pser.astype(bool), psser - psser.astype(bool))
+            self.assert_eq(pser - True, psser - True)
+            self.assert_eq(pser - False, psser - False)
+
+            # Only "bool" is compared against pandas; other non-numeric columns must raise.
+            for n_col in self.non_numeric_df_cols:
+                if n_col == "bool":
+                    self.assert_eq(pser - pdf[n_col], psser - psdf[n_col])
+                else:
+                    self.assertRaises(TypeError, lambda: psser - psdf[n_col])
+
+    # ``*``: numeric columns multiplied by themselves, their bool cast, True/False,
+    # and by the "string", "bool", "datetime", "date" and "categorical" columns.
+    def test_mul(self):
+        pdf, psdf = self.pdf, self.psdf
+        for col in self.numeric_df_cols:
+            pser, psser = pdf[col], psdf[col]
+            self.assert_eq(pser * pser, psser * psser)
+            self.assert_eq(pser * pser.astype(bool), psser * psser.astype(bool))
+            self.assert_eq(pser * True, psser * True)
+            self.assert_eq(pser * False, psser * False)
+
+            # Series * string is compared with pandas only for int / np.int32 dtypes;
+            # any other numeric dtype is expected to raise TypeError.
+            if psser.dtype in [int, np.int32]:
+                self.assert_eq(pser * pdf["string"], psser * psdf["string"])
+            else:
+                self.assertRaises(TypeError, lambda: psser * psdf["string"])
+
+            self.assert_eq(pser * pdf["bool"], psser * psdf["bool"])
+
+            self.assertRaises(TypeError, lambda: psser * psdf["datetime"])
+            self.assertRaises(TypeError, lambda: psser * psdf["date"])
+            self.assertRaises(TypeError, lambda: psser * psdf["categorical"])
+
+    # ``/``: value comparisons run only for float, int and np.int32 columns.
+    def test_truediv(self):
+        pdf, psdf = self.pdf, self.psdf
+        for col in self.numeric_df_cols:
+            pser, psser = pdf[col], psdf[col]
+            if psser.dtype in [float, int, np.int32]:
+                self.assert_eq(pser / pser, psser / psser)
+                self.assert_eq(pser / pser.astype(bool), psser / psser.astype(bool))
+                self.assert_eq(pser / True, psser / True)
+                self.assert_eq(pser / False, psser / False)
+
+            for n_col in self.non_numeric_df_cols:
+                if n_col == "bool":
+                    # NOTE: the bool comparison always uses the "float" column as the
+                    # left operand, independent of the current ``col``.
+                    self.assert_eq(pdf["float"] / pdf[n_col], psdf["float"] / psdf[n_col])
+                else:
+                    self.assertRaises(TypeError, lambda: psser / psdf[n_col])
+
+    # ``//``: value comparisons use the "float" column only; the TypeError checks
+    # below cover every numeric column.
+    def test_floordiv(self):
+        pdf, psdf = self.pdf, self.psdf
+        pser, psser = pdf["float"], psdf["float"]
+        self.assert_eq(pser // pser, psser // psser)
+        self.assert_eq(pser // pser.astype(bool), psser // psser.astype(bool))
+        self.assert_eq(pser // True, psser // True)
+        self.assert_eq(pser // False, psser // False)
+
+        for n_col in self.non_numeric_df_cols:
+            if n_col == "bool":
+                self.assert_eq(pdf["float"] // pdf["bool"], psdf["float"] // psdf["bool"])
+            else:
+                # Rebinds ``psser`` to each numeric column in turn for the TypeError check.
+                for col in self.numeric_df_cols:
+                    psser = psdf[col]
+                    self.assertRaises(TypeError, lambda: psser // psdf[n_col])
+
+    # ``%``: compared with pandas except for ``% False``, whose expected result is
+    # spelled out explicitly instead of being computed with pandas.
+    def test_mod(self):
+        pdf, psdf = self.pdf, self.psdf
+        for col in self.numeric_df_cols:
+            pser, psser = pdf[col], psdf[col]
+            self.assert_eq(pser % pser, psser % psser)
+            self.assert_eq(pser % pser.astype(bool), psser % psser.astype(bool))
+            self.assert_eq(pser % True, psser % True)
+            # Expected ``psser % False`` is an all-NaN Series named after the column:
+            # float dtype for "int"/"int32", the column's own dtype otherwise.
+            # The literal has three entries — presumably every fixture column has
+            # three rows; confirm against the class providing ``pdf``.
+            if col in ["int", "int32"]:
+                self.assert_eq(
+                    pd.Series([np.nan, np.nan, np.nan], dtype=float, name=col), psser % False
+                )
+            else:
+                self.assert_eq(
+                    pd.Series([np.nan, np.nan, np.nan], dtype=pser.dtype, name=col), psser % False
+                )
+
+            for n_col in self.non_numeric_df_cols:
+                if n_col == "bool":
+                    # As in test_truediv, the left operand here is always the "float" column.
+                    self.assert_eq(pdf["float"] % pdf[n_col], psdf["float"] % psdf[n_col])
+                else:
+                    self.assertRaises(TypeError, lambda: psser % psdf[n_col])
+
+    # ``**``: value comparisons run only for the "float" and "float_w_nan" columns.
+    def test_pow(self):
+        pdf, psdf = self.pdf, self.psdf
+        for col in self.numeric_df_cols:
+            pser, psser = pdf[col], psdf[col]
+            if col in ["float", "float_w_nan"]:
+                self.assert_eq(pser**pser, psser**psser)
+                self.assert_eq(pser ** pser.astype(bool), psser ** psser.astype(bool))
+                self.assert_eq(pser**True, psser**True)
+                self.assert_eq(pser**False, psser**False)
+                self.assert_eq(pser**1, psser**1)
+                self.assert_eq(pser**0, psser**0)
+
+            for n_col in self.non_numeric_df_cols:
+                if n_col == "bool":
+                    # The bool comparison always uses the "float" column as the base.
+                    self.assert_eq(pdf["float"] ** pdf[n_col], psdf["float"] ** psdf[n_col])
+                else:
+                    self.assertRaises(TypeError, lambda: psser ** psdf[n_col])
+
+
+class ArithmeticTests(ArithmeticTestsMixin, OpsTestBase):
+    """Concrete test case combining ``ArithmeticTestsMixin`` with ``OpsTestBase``."""
+
+    pass
+
+
+# Script entry point: run this module's tests when the file is executed directly.
+if __name__ == "__main__":
+    # Star-import this module's own names through its fully qualified package path.
+    from pyspark.pandas.tests.data_type_ops.test_num_arithmetic import *  # noqa: F401
+
+    try:
+        # xmlrunner is optional; when present, XML reports go to target/test-reports.
+        import xmlrunner
+
+        testRunner = xmlrunner.XMLTestRunner(output="target/test-reports", verbosity=2)
+    except ImportError:
+        # No xmlrunner installed: passing None lets unittest.main pick its default runner.
+        testRunner = None
+    unittest.main(testRunner=testRunner, verbosity=2)