Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[SPARK-48561][PS][CONNECT] Throw PandasNotImplementedError for unsupported plotting functions #46911

Closed
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions dev/sparktestsupport/modules.py
Original file line number Diff line number Diff line change
Expand Up @@ -1102,6 +1102,8 @@ def __hash__(self):
"python/pyspark/pandas",
],
python_test_goals=[
# unittests dedicated for Spark Connect
"pyspark.pandas.tests.connect.test_connect_plotting",
# pandas-on-Spark unittests
"pyspark.pandas.tests.connect.test_parity_categorical",
"pyspark.pandas.tests.connect.test_parity_config",
Expand Down
13 changes: 12 additions & 1 deletion python/pyspark/pandas/plot/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@
from pandas.core.dtypes.inference import is_integer

from pyspark.sql import functions as F
from pyspark.sql.utils import is_remote
from pyspark.pandas.missing import unsupported_function
from pyspark.pandas.config import get_option
from pyspark.pandas.utils import name_like_string
Expand Down Expand Up @@ -571,10 +572,14 @@ def _get_plot_backend(backend=None):
return module

def __call__(self, kind="line", backend=None, **kwargs):
kind = {"density": "kde"}.get(kind, kind)

if is_remote() and kind in ["hist", "kde"]:
return unsupported_function(class_name="pd.DataFrame", method_name=kind)()

plot_backend = PandasOnSparkPlotAccessor._get_plot_backend(backend)
plot_data = self.data

kind = {"density": "kde"}.get(kind, kind)
if hasattr(plot_backend, "plot_pandas_on_spark"):
# use if there's pandas-on-Spark specific method.
return plot_backend.plot_pandas_on_spark(plot_data, kind=kind, **kwargs)
Expand Down Expand Up @@ -948,6 +953,9 @@ def hist(self, bins=10, **kwds):
>>> df = ps.from_pandas(df)
>>> df.plot.hist(bins=12, alpha=0.5) # doctest: +SKIP
"""
if is_remote():
return unsupported_function(class_name="pd.DataFrame", method_name="hist")()

return self(kind="hist", bins=bins, **kwds)

def kde(self, bw_method=None, ind=None, **kwargs):
Expand Down Expand Up @@ -1023,6 +1031,9 @@ def kde(self, bw_method=None, ind=None, **kwargs):
... })
>>> df.plot.kde(ind=[1, 2, 3, 4, 5, 6], bw_method=0.3) # doctest: +SKIP
"""
if is_remote():
return unsupported_function(class_name="pd.DataFrame", method_name="kde")()

return self(kind="kde", bw_method=bw_method, ind=ind, **kwargs)

density = kde
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,10 @@
class SeriesPlotMatplotlibParityTests(
SeriesPlotMatplotlibTestsMixin, PandasOnSparkTestUtils, TestUtils, ReusedConnectTestCase
):
@unittest.skip("Test depends on Spark ML which is not supported from Spark Connect.")
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It previously failed with "Empty 'DataFrame': no numeric data to plot"; it now fails with PandasNotImplementedError.

def test_empty_hist(self):
super().test_empty_hist()

@unittest.skip("Test depends on Spark ML which is not supported from Spark Connect.")
def test_hist(self):
super().test_hist()
Expand Down
124 changes: 124 additions & 0 deletions python/pyspark/pandas/tests/connect/test_connect_plotting.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,124 @@
#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import unittest

import pandas as pd

from pyspark import pandas as ps
from pyspark.pandas.exceptions import PandasNotImplementedError
from pyspark.testing.connectutils import ReusedConnectTestCase
from pyspark.testing.pandasutils import PandasOnSparkTestUtils, TestUtils


class ConnectPlottingTests(PandasOnSparkTestUtils, TestUtils, ReusedConnectTestCase):
    """Verify that hist/kde/density plots raise ``PandasNotImplementedError``.

    The same matrix of calls is exercised through two entry points: the
    dedicated accessor methods (``.plot.hist()``) and the generic
    ``.plot(kind=...)`` call, on both a DataFrame and one of its Series.
    Every combination runs inside its own ``subTest`` so that one failure
    does not hide the remaining ones.
    """

    # (plot name, extra keyword arguments tried in addition to the bare call)
    _UNSUPPORTED_PLOTS = (
        ("hist", {"bins": 3}),
        ("kde", {"bw_method": 3}),
        ("density", {"bw_method": 3}),
    )

    @property
    def pdf1(self):
        """Return a freshly built 3x2 pandas DataFrame used as test input."""
        return pd.DataFrame(
            [[1, 2], [4, 5], [7, 8]],
            index=["cobra", "viper", None],
            columns=["max_speed", "shield"],
        )

    @property
    def psdf1(self):
        """Return ``pdf1`` converted with ``ps.from_pandas``."""
        return ps.from_pandas(self.pdf1)

    def _assert_all_unsupported(self, invoke):
        """Assert that ``invoke`` raises for every owner/plot/kwargs combination.

        Parameters
        ----------
        invoke : callable
            Called as ``invoke(plot_accessor, plot_name, kwargs)``; it must
            trigger the plot so that the expected error can be observed.
        """
        # The accessors are resolved lazily, inside assertRaises, so that the
        # whole expression (frame creation included) stays under the check.
        owners = (
            ("DataFrame", lambda: self.psdf1.plot),
            ("Series", lambda: self.psdf1.shield.plot),
        )
        for owner, get_accessor in owners:
            for plot_name, extra_kwargs in self._UNSUPPORTED_PLOTS:
                # Bare call first, then the call with plot-specific options.
                for kwargs in ({}, extra_kwargs):
                    with self.subTest(owner=owner, plot=plot_name, kwargs=kwargs):
                        with self.assertRaises(PandasNotImplementedError):
                            invoke(get_accessor(), plot_name, kwargs)

    def test_unsupported_functions(self):
        """``.plot.<name>(...)`` must raise for every unsupported plot."""
        self._assert_all_unsupported(
            lambda accessor, plot_name, kwargs: getattr(accessor, plot_name)(**kwargs)
        )

    def test_unsupported_kinds(self):
        """``.plot(kind=<name>, ...)`` must raise for every unsupported plot."""
        self._assert_all_unsupported(
            lambda accessor, plot_name, kwargs: accessor(kind=plot_name, **kwargs)
        )


if __name__ == "__main__":
    # Re-import this module's tests under their package name for unittest.main().
    from pyspark.pandas.tests.connect.test_connect_plotting import *  # noqa: F401

    # Default to unittest's built-in runner; switch to XML reports only when
    # the optional xmlrunner package can be imported.
    testRunner = None
    try:
        import xmlrunner  # type: ignore[import]

        testRunner = xmlrunner.XMLTestRunner(output="target/test-reports", verbosity=2)
    except ImportError:
        pass
    unittest.main(testRunner=testRunner, verbosity=2)