From 13b5b57843bb601d87cc7dae103e982d0214e4ee Mon Sep 17 00:00:00 2001
From: Ruifeng Zheng
Date: Mon, 26 Aug 2024 15:29:51 +0900
Subject: [PATCH] [SPARK-49387][PYTHON] Fix type hint for `accuracy` in `percentile_approx` and `approx_percentile`

### What changes were proposed in this pull request?
Fix the type hint for `accuracy` in `percentile_approx` and `approx_percentile`.

### Why are the changes needed?
A float `accuracy` is not supported:

```
In [9]: df.select(approx_percentile("value", [0.25, 0.5, 0.75], 1.1).alias("quantiles")).show()
...
AnalysisException: [DATATYPE_MISMATCH.UNEXPECTED_INPUT_TYPE] Cannot resolve "approx_percentile(value, array(0.25, 0.5, 0.75), 1.1)" due to data type mismatch: The third parameter requires the "INTEGRAL" type, however "1.1" has the type "DOUBLE". SQLSTATE: 42K09;
```

### Does this PR introduce _any_ user-facing change?
Yes, a minor doc change.

### How was this patch tested?
CI

### Was this patch authored or co-authored using generative AI tooling?
No

Closes #47869 from zhengruifeng/py_approx_percentile_acc.

Authored-by: Ruifeng Zheng Signed-off-by: Hyukjin Kwon --- python/pyspark/sql/connect/functions/builtin.py | 4 ++-- python/pyspark/sql/functions/builtin.py | 8 ++++---- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/python/pyspark/sql/connect/functions/builtin.py b/python/pyspark/sql/connect/functions/builtin.py index e28c9815a83c9..db3680f5cd42f 100644 --- a/python/pyspark/sql/connect/functions/builtin.py +++ b/python/pyspark/sql/connect/functions/builtin.py @@ -1223,7 +1223,7 @@ def percentile( def percentile_approx( col: "ColumnOrName", percentage: Union[Column, float, Sequence[float], Tuple[float]], - accuracy: Union[Column, float] = 10000, + accuracy: Union[Column, int] = 10000, ) -> Column: percentage = lit(list(percentage)) if isinstance(percentage, (list, tuple)) else lit(percentage) return _invoke_function_over_columns("percentile_approx", col, percentage, lit(accuracy)) @@ -1235,7 +1235,7 @@ def percentile_approx( def approx_percentile( col: "ColumnOrName", percentage: Union[Column, float, Sequence[float], Tuple[float]], - accuracy: Union[Column, float] = 10000, + accuracy: Union[Column, int] = 10000, ) -> Column: percentage = lit(list(percentage)) if isinstance(percentage, (list, tuple)) else lit(percentage) return _invoke_function_over_columns("approx_percentile", col, percentage, lit(accuracy)) diff --git a/python/pyspark/sql/functions/builtin.py b/python/pyspark/sql/functions/builtin.py index 387a039758f1e..bae80c59c5785 100644 --- a/python/pyspark/sql/functions/builtin.py +++ b/python/pyspark/sql/functions/builtin.py @@ -6339,7 +6339,7 @@ def percentile( def percentile_approx( col: "ColumnOrName", percentage: Union[Column, float, Sequence[float], Tuple[float]], - accuracy: Union[Column, float] = 10000, + accuracy: Union[Column, int] = 10000, ) -> Column: """Returns the approximate `percentile` of the numeric column `col` which is the smallest value in the ordered `col` values (sorted from least to greatest) such that no more than 
`percentage` @@ -6360,7 +6360,7 @@ def percentile_approx( When percentage is an array, each value of the percentage array must be between 0.0 and 1.0. In this case, returns the approximate percentile array of column col at the given percentage array. - accuracy : :class:`~pyspark.sql.Column` or float + accuracy : :class:`~pyspark.sql.Column` or int is a positive numeric literal which controls approximation accuracy at the cost of memory. Higher value of accuracy yields better accuracy, 1.0/accuracy is the relative error of the approximation. (default: 10000). @@ -6397,7 +6397,7 @@ def percentile_approx( def approx_percentile( col: "ColumnOrName", percentage: Union[Column, float, Sequence[float], Tuple[float]], - accuracy: Union[Column, float] = 10000, + accuracy: Union[Column, int] = 10000, ) -> Column: """Returns the approximate `percentile` of the numeric column `col` which is the smallest value in the ordered `col` values (sorted from least to greatest) such that no more than `percentage` @@ -6414,7 +6414,7 @@ def approx_percentile( When percentage is an array, each value of the percentage array must be between 0.0 and 1.0. In this case, returns the approximate percentile array of column col at the given percentage array. - accuracy : :class:`~pyspark.sql.Column` or float + accuracy : :class:`~pyspark.sql.Column` or int is a positive numeric literal which controls approximation accuracy at the cost of memory. Higher value of accuracy yields better accuracy, 1.0/accuracy is the relative error of the approximation. (default: 10000).