Skip to content

Commit

Permalink
[SPARK-28270][SQL][FOLLOW-UP] Explicitly cast into int/long/decimal i…
Browse files Browse the repository at this point in the history
…n udf-aggregates_part1.sql to avoid Python float limitation

## What changes were proposed in this pull request?

The tests added at #25069 seem flaky in some environments. See #25069 (comment)

Python's string representation of floats can make the tests flaky. See https://docs.python.org/3/tutorial/floatingpoint.html.

I think it's just better to explicitly cast everywhere udf returns a float (or a double) to stay safe. (note that we're not targeting the Python <> Scala value conversions - there are inevitable differences between Python and Scala; therefore, other languages' UDFs cannot guarantee the same results between Python and Scala).

This PR proposes to cast cases to long, integer and decimal explicitly to make the test cases robust.

<details><summary>Diff comparing to 'pgSQL/aggregates_part1.sql'</summary>
<p>

```diff
diff --git a/sql/core/src/test/resources/sql-tests/results/pgSQL/aggregates_part1.sql.out b/sql/core/src/test/resources/sql-tests/results/udf/pgSQL/udf-aggregates_part1.sql.out
index 51ca1d55869..734634b7388 100644
--- a/sql/core/src/test/resources/sql-tests/results/pgSQL/aggregates_part1.sql.out
+++ b/sql/core/src/test/resources/sql-tests/results/udf/pgSQL/udf-aggregates_part1.sql.out
 -3,23 +3,23

 -- !query 0
-SELECT avg(four) AS avg_1 FROM onek
+SELECT CAST(avg(udf(four)) AS decimal(10,3)) AS avg_1 FROM onek
 -- !query 0 schema
-struct<avg_1:double>
+struct<avg_1:decimal(10,3)>
 -- !query 0 output
 1.5

 -- !query 1
-SELECT avg(a) AS avg_32 FROM aggtest WHERE a < 100
+SELECT CAST(udf(avg(a)) AS decimal(10,3)) AS avg_32 FROM aggtest WHERE a < 100
 -- !query 1 schema
-struct<avg_32:double>
+struct<avg_32:decimal(10,3)>
 -- !query 1 output
-32.666666666666664
+32.667

 -- !query 2
-select CAST(avg(b) AS Decimal(10,3)) AS avg_107_943 FROM aggtest
+select CAST(avg(udf(b)) AS Decimal(10,3)) AS avg_107_943 FROM aggtest
 -- !query 2 schema
 struct<avg_107_943:decimal(10,3)>
 -- !query 2 output
 -27,39 +27,39  struct<avg_107_943:decimal(10,3)>

 -- !query 3
-SELECT sum(four) AS sum_1500 FROM onek
+SELECT CAST(sum(udf(four)) AS int) AS sum_1500 FROM onek
 -- !query 3 schema
-struct<sum_1500:bigint>
+struct<sum_1500:int>
 -- !query 3 output
 1500

 -- !query 4
-SELECT sum(a) AS sum_198 FROM aggtest
+SELECT udf(sum(a)) AS sum_198 FROM aggtest
 -- !query 4 schema
-struct<sum_198:bigint>
+struct<sum_198:string>
 -- !query 4 output
 198

 -- !query 5
-SELECT sum(b) AS avg_431_773 FROM aggtest
+SELECT CAST(udf(udf(sum(b))) AS decimal(10,3)) AS avg_431_773 FROM aggtest
 -- !query 5 schema
-struct<avg_431_773:double>
+struct<avg_431_773:decimal(10,3)>
 -- !query 5 output
-431.77260909229517
+431.773

 -- !query 6
-SELECT max(four) AS max_3 FROM onek
+SELECT udf(max(four)) AS max_3 FROM onek
 -- !query 6 schema
-struct<max_3:int>
+struct<max_3:string>
 -- !query 6 output
 3

 -- !query 7
-SELECT max(a) AS max_100 FROM aggtest
+SELECT max(CAST(udf(a) AS int)) AS max_100 FROM aggtest
 -- !query 7 schema
 struct<max_100:int>
 -- !query 7 output
 -67,245 +67,246  struct<max_100:int>

 -- !query 8
-SELECT max(aggtest.b) AS max_324_78 FROM aggtest
+SELECT CAST(udf(udf(max(aggtest.b))) AS decimal(10,3)) AS max_324_78 FROM aggtest
 -- !query 8 schema
-struct<max_324_78:float>
+struct<max_324_78:decimal(10,3)>
 -- !query 8 output
 324.78

 -- !query 9
-SELECT stddev_pop(b) FROM aggtest
+SELECT CAST(stddev_pop(udf(b)) AS decimal(10,3)) FROM aggtest
 -- !query 9 schema
-struct<stddev_pop(CAST(b AS DOUBLE)):double>
+struct<CAST(stddev_pop(CAST(udf(b) AS DOUBLE)) AS DECIMAL(10,3)):decimal(10,3)>
 -- !query 9 output
-131.10703231895047
+131.107

 -- !query 10
-SELECT stddev_samp(b) FROM aggtest
+SELECT CAST(udf(stddev_samp(b)) AS decimal(10,3)) FROM aggtest
 -- !query 10 schema
-struct<stddev_samp(CAST(b AS DOUBLE)):double>
+struct<CAST(udf(stddev_samp(cast(b as double))) AS DECIMAL(10,3)):decimal(10,3)>
 -- !query 10 output
-151.38936080399804
+151.389

 -- !query 11
-SELECT var_pop(b) FROM aggtest
+SELECT CAST(var_pop(udf(b)) AS decimal(10,3)) FROM aggtest
 -- !query 11 schema
-struct<var_pop(CAST(b AS DOUBLE)):double>
+struct<CAST(var_pop(CAST(udf(b) AS DOUBLE)) AS DECIMAL(10,3)):decimal(10,3)>
 -- !query 11 output
-17189.053923482323
+17189.054

 -- !query 12
-SELECT var_samp(b) FROM aggtest
+SELECT CAST(udf(var_samp(b)) AS decimal(10,3)) FROM aggtest
 -- !query 12 schema
-struct<var_samp(CAST(b AS DOUBLE)):double>
+struct<CAST(udf(var_samp(cast(b as double))) AS DECIMAL(10,3)):decimal(10,3)>
 -- !query 12 output
-22918.738564643096
+22918.739

 -- !query 13
-SELECT stddev_pop(CAST(b AS Decimal(38,0))) FROM aggtest
+SELECT CAST(udf(stddev_pop(CAST(b AS Decimal(38,0)))) AS decimal(10,3)) FROM aggtest
 -- !query 13 schema
-struct<stddev_pop(CAST(CAST(b AS DECIMAL(38,0)) AS DOUBLE)):double>
+struct<CAST(udf(stddev_pop(cast(cast(b as decimal(38,0)) as double))) AS DECIMAL(10,3)):decimal(10,3)>
 -- !query 13 output
-131.18117242958306
+131.181

 -- !query 14
-SELECT stddev_samp(CAST(b AS Decimal(38,0))) FROM aggtest
+SELECT CAST(stddev_samp(CAST(udf(b) AS Decimal(38,0))) AS decimal(10,3)) FROM aggtest
 -- !query 14 schema
-struct<stddev_samp(CAST(CAST(b AS DECIMAL(38,0)) AS DOUBLE)):double>
+struct<CAST(stddev_samp(CAST(CAST(udf(b) AS DECIMAL(38,0)) AS DOUBLE)) AS DECIMAL(10,3)):decimal(10,3)>
 -- !query 14 output
-151.47497042966097
+151.475

 -- !query 15
-SELECT var_pop(CAST(b AS Decimal(38,0))) FROM aggtest
+SELECT CAST(udf(var_pop(CAST(b AS Decimal(38,0)))) AS decimal(10,3)) FROM aggtest
 -- !query 15 schema
-struct<var_pop(CAST(CAST(b AS DECIMAL(38,0)) AS DOUBLE)):double>
+struct<CAST(udf(var_pop(cast(cast(b as decimal(38,0)) as double))) AS DECIMAL(10,3)):decimal(10,3)>
 -- !query 15 output
 17208.5

 -- !query 16
-SELECT var_samp(CAST(b AS Decimal(38,0))) FROM aggtest
+SELECT CAST(var_samp(udf(CAST(b AS Decimal(38,0)))) AS decimal(10,3)) FROM aggtest
 -- !query 16 schema
-struct<var_samp(CAST(CAST(b AS DECIMAL(38,0)) AS DOUBLE)):double>
+struct<CAST(var_samp(CAST(udf(cast(b as decimal(38,0))) AS DOUBLE)) AS DECIMAL(10,3)):decimal(10,3)>
 -- !query 16 output
-22944.666666666668
+22944.667

 -- !query 17
-SELECT var_pop(1.0), var_samp(2.0)
+SELECT CAST(udf(var_pop(1.0)) AS int), var_samp(udf(2.0))
 -- !query 17 schema
-struct<var_pop(CAST(1.0 AS DOUBLE)):double,var_samp(CAST(2.0 AS DOUBLE)):double>
+struct<CAST(udf(var_pop(cast(1.0 as double))) AS INT):int,var_samp(CAST(udf(2.0) AS DOUBLE)):double>
 -- !query 17 output
-0.0    NaN
+0      NaN

 -- !query 18
-SELECT stddev_pop(CAST(3.0 AS Decimal(38,0))), stddev_samp(CAST(4.0 AS Decimal(38,0)))
+SELECT CAST(stddev_pop(udf(CAST(3.0 AS Decimal(38,0)))) AS int), stddev_samp(CAST(udf(4.0) AS Decimal(38,0)))
 -- !query 18 schema
-struct<stddev_pop(CAST(CAST(3.0 AS DECIMAL(38,0)) AS DOUBLE)):double,stddev_samp(CAST(CAST(4.0 AS DECIMAL(38,0)) AS DOUBLE)):double>
+struct<CAST(stddev_pop(CAST(udf(cast(3.0 as decimal(38,0))) AS DOUBLE)) AS INT):int,stddev_samp(CAST(CAST(udf(4.0) AS DECIMAL(38,0)) AS DOUBLE)):double>
 -- !query 18 output
-0.0    NaN
+0      NaN

 -- !query 19
-select sum(CAST(null AS int)) from range(1,4)
+select sum(udf(CAST(null AS int))) from range(1,4)
 -- !query 19 schema
-struct<sum(CAST(NULL AS INT)):bigint>
+struct<sum(CAST(udf(cast(null as int)) AS DOUBLE)):double>
 -- !query 19 output
 NULL

 -- !query 20
-select sum(CAST(null AS long)) from range(1,4)
+select sum(udf(CAST(null AS long))) from range(1,4)
 -- !query 20 schema
-struct<sum(CAST(NULL AS BIGINT)):bigint>
+struct<sum(CAST(udf(cast(null as bigint)) AS DOUBLE)):double>
 -- !query 20 output
 NULL

 -- !query 21
-select sum(CAST(null AS Decimal(38,0))) from range(1,4)
+select sum(udf(CAST(null AS Decimal(38,0)))) from range(1,4)
 -- !query 21 schema
-struct<sum(CAST(NULL AS DECIMAL(38,0))):decimal(38,0)>
+struct<sum(CAST(udf(cast(null as decimal(38,0))) AS DOUBLE)):double>
 -- !query 21 output
 NULL

 -- !query 22
-select sum(CAST(null AS DOUBLE)) from range(1,4)
+select sum(udf(CAST(null AS DOUBLE))) from range(1,4)
 -- !query 22 schema
-struct<sum(CAST(NULL AS DOUBLE)):double>
+struct<sum(CAST(udf(cast(null as double)) AS DOUBLE)):double>
 -- !query 22 output
 NULL

 -- !query 23
-select avg(CAST(null AS int)) from range(1,4)
+select avg(udf(CAST(null AS int))) from range(1,4)
 -- !query 23 schema
-struct<avg(CAST(NULL AS INT)):double>
+struct<avg(CAST(udf(cast(null as int)) AS DOUBLE)):double>
 -- !query 23 output
 NULL

 -- !query 24
-select avg(CAST(null AS long)) from range(1,4)
+select avg(udf(CAST(null AS long))) from range(1,4)
 -- !query 24 schema
-struct<avg(CAST(NULL AS BIGINT)):double>
+struct<avg(CAST(udf(cast(null as bigint)) AS DOUBLE)):double>
 -- !query 24 output
 NULL

 -- !query 25
-select avg(CAST(null AS Decimal(38,0))) from range(1,4)
+select avg(udf(CAST(null AS Decimal(38,0)))) from range(1,4)
 -- !query 25 schema
-struct<avg(CAST(NULL AS DECIMAL(38,0))):decimal(38,4)>
+struct<avg(CAST(udf(cast(null as decimal(38,0))) AS DOUBLE)):double>
 -- !query 25 output
 NULL

 -- !query 26
-select avg(CAST(null AS DOUBLE)) from range(1,4)
+select avg(udf(CAST(null AS DOUBLE))) from range(1,4)
 -- !query 26 schema
-struct<avg(CAST(NULL AS DOUBLE)):double>
+struct<avg(CAST(udf(cast(null as double)) AS DOUBLE)):double>
 -- !query 26 output
 NULL

 -- !query 27
-select sum(CAST('NaN' AS DOUBLE)) from range(1,4)
+select sum(CAST(udf('NaN') AS DOUBLE)) from range(1,4)
 -- !query 27 schema
-struct<sum(CAST(NaN AS DOUBLE)):double>
+struct<sum(CAST(udf(NaN) AS DOUBLE)):double>
 -- !query 27 output
 NaN

 -- !query 28
-select avg(CAST('NaN' AS DOUBLE)) from range(1,4)
+select avg(CAST(udf('NaN') AS DOUBLE)) from range(1,4)
 -- !query 28 schema
-struct<avg(CAST(NaN AS DOUBLE)):double>
+struct<avg(CAST(udf(NaN) AS DOUBLE)):double>
 -- !query 28 output
 NaN

 -- !query 30
-SELECT avg(CAST(x AS DOUBLE)), var_pop(CAST(x AS DOUBLE))
+SELECT avg(CAST(udf(x) AS DOUBLE)), var_pop(CAST(udf(x) AS DOUBLE))
 FROM (VALUES ('Infinity'), ('1')) v(x)
 -- !query 30 schema
-struct<avg(CAST(x AS DOUBLE)):double,var_pop(CAST(x AS DOUBLE)):double>
+struct<avg(CAST(udf(x) AS DOUBLE)):double,var_pop(CAST(udf(x) AS DOUBLE)):double>
 -- !query 30 output
 Infinity       NaN

 -- !query 31
-SELECT avg(CAST(x AS DOUBLE)), var_pop(CAST(x AS DOUBLE))
+SELECT avg(CAST(udf(x) AS DOUBLE)), var_pop(CAST(udf(x) AS DOUBLE))
 FROM (VALUES ('Infinity'), ('Infinity')) v(x)
 -- !query 31 schema
-struct<avg(CAST(x AS DOUBLE)):double,var_pop(CAST(x AS DOUBLE)):double>
+struct<avg(CAST(udf(x) AS DOUBLE)):double,var_pop(CAST(udf(x) AS DOUBLE)):double>
 -- !query 31 output
 Infinity       NaN

 -- !query 32
-SELECT avg(CAST(x AS DOUBLE)), var_pop(CAST(x AS DOUBLE))
+SELECT avg(CAST(udf(x) AS DOUBLE)), var_pop(CAST(udf(x) AS DOUBLE))
 FROM (VALUES ('-Infinity'), ('Infinity')) v(x)
 -- !query 32 schema
-struct<avg(CAST(x AS DOUBLE)):double,var_pop(CAST(x AS DOUBLE)):double>
+struct<avg(CAST(udf(x) AS DOUBLE)):double,var_pop(CAST(udf(x) AS DOUBLE)):double>
 -- !query 32 output
 NaN    NaN

 -- !query 33
-SELECT avg(CAST(x AS DOUBLE)), var_pop(CAST(x AS DOUBLE))
+SELECT CAST(avg(udf(CAST(x AS DOUBLE))) AS int), CAST(udf(var_pop(CAST(x AS DOUBLE))) AS decimal(10,3))
 FROM (VALUES (100000003), (100000004), (100000006), (100000007)) v(x)
 -- !query 33 schema
-struct<avg(CAST(x AS DOUBLE)):double,var_pop(CAST(x AS DOUBLE)):double>
+struct<CAST(avg(CAST(udf(cast(x as double)) AS DOUBLE)) AS INT):int,CAST(udf(var_pop(cast(x as double))) AS DECIMAL(10,3)):decimal(10,3)>
 -- !query 33 output
-1.00000005E8   2.5
+100000005      2.5

 -- !query 34
-SELECT avg(CAST(x AS DOUBLE)), var_pop(CAST(x AS DOUBLE))
+SELECT CAST(avg(udf(CAST(x AS DOUBLE))) AS long), CAST(udf(var_pop(CAST(x AS DOUBLE))) AS decimal(10,3))
 FROM (VALUES (7000000000005), (7000000000007)) v(x)
 -- !query 34 schema
-struct<avg(CAST(x AS DOUBLE)):double,var_pop(CAST(x AS DOUBLE)):double>
+struct<CAST(avg(CAST(udf(cast(x as double)) AS DOUBLE)) AS BIGINT):bigint,CAST(udf(var_pop(cast(x as double))) AS DECIMAL(10,3)):decimal(10,3)>
 -- !query 34 output
-7.000000000006E12      1.0
+7000000000006  1

 -- !query 35
-SELECT covar_pop(b, a), covar_samp(b, a) FROM aggtest
+SELECT CAST(udf(covar_pop(b, udf(a))) AS decimal(10,3)), CAST(covar_samp(udf(b), a) as decimal(10,3)) FROM aggtest
 -- !query 35 schema
-struct<covar_pop(CAST(b AS DOUBLE), CAST(a AS DOUBLE)):double,covar_samp(CAST(b AS DOUBLE), CAST(a AS DOUBLE)):double>
+struct<CAST(udf(covar_pop(cast(b as double), cast(udf(a) as double))) AS DECIMAL(10,3)):decimal(10,3),CAST(covar_samp(CAST(udf(b) AS DOUBLE), CAST(a AS DOUBLE)) AS DECIMAL(10,3)):decimal(10,3)>
 -- !query 35 output
-653.6289553875104      871.5052738500139
+653.629        871.505

 -- !query 36
-SELECT corr(b, a) FROM aggtest
+SELECT CAST(corr(b, udf(a)) AS decimal(10,3)) FROM aggtest
 -- !query 36 schema
-struct<corr(CAST(b AS DOUBLE), CAST(a AS DOUBLE)):double>
+struct<CAST(corr(CAST(b AS DOUBLE), CAST(udf(a) AS DOUBLE)) AS DECIMAL(10,3)):decimal(10,3)>
 -- !query 36 output
-0.1396345165178734
+0.14

 -- !query 37
-SELECT count(four) AS cnt_1000 FROM onek
+SELECT count(udf(four)) AS cnt_1000 FROM onek
 -- !query 37 schema
 struct<cnt_1000:bigint>
 -- !query 37 output
 -313,18 +314,18  struct<cnt_1000:bigint>

 -- !query 38
-SELECT count(DISTINCT four) AS cnt_4 FROM onek
+SELECT udf(count(DISTINCT four)) AS cnt_4 FROM onek
 -- !query 38 schema
-struct<cnt_4:bigint>
+struct<cnt_4:string>
 -- !query 38 output
 4

 -- !query 39
-select ten, count(*), sum(four) from onek
+select ten, udf(count(*)), CAST(sum(udf(four)) AS int) from onek
 group by ten order by ten
 -- !query 39 schema
-struct<ten:int,count(1):bigint,sum(four):bigint>
+struct<ten:int,udf(count(1)):string,CAST(sum(CAST(udf(four) AS DOUBLE)) AS INT):int>
 -- !query 39 output
 0      100     100
 1      100     200
 -339,10 +340,10  struct<ten:int,count(1):bigint,sum(four):bigint>

 -- !query 40
-select ten, count(four), sum(DISTINCT four) from onek
+select ten, count(udf(four)), udf(sum(DISTINCT four)) from onek
 group by ten order by ten
 -- !query 40 schema
-struct<ten:int,count(four):bigint,sum(DISTINCT four):bigint>
+struct<ten:int,count(udf(four)):bigint,udf(sum(distinct cast(four as bigint))):string>
 -- !query 40 output
 0      100     2
 1      100     4
 -357,11 +358,11  struct<ten:int,count(four):bigint,sum(DISTINCT four):bigint>

 -- !query 41
-select ten, sum(distinct four) from onek a
+select ten, udf(sum(distinct four)) from onek a
 group by ten
-having exists (select 1 from onek b where sum(distinct a.four) = b.four)
+having exists (select 1 from onek b where udf(sum(distinct a.four)) = b.four)
 -- !query 41 schema
-struct<ten:int,sum(DISTINCT four):bigint>
+struct<ten:int,udf(sum(distinct cast(four as bigint))):string>
 -- !query 41 output
 0      2
 2      2
 -374,23 +375,23  struct<ten:int,sum(DISTINCT four):bigint>
 select ten, sum(distinct four) from onek a
 group by ten
 having exists (select 1 from onek b
-               where sum(distinct a.four + b.four) = b.four)
+               where sum(distinct a.four + b.four) = udf(b.four))
 -- !query 42 schema
 struct<>
 -- !query 42 output
 org.apache.spark.sql.AnalysisException

 Aggregate/Window/Generate expressions are not valid in where clause of the query.
-Expression in where clause: [(sum(DISTINCT CAST((outer() + b.`four`) AS BIGINT)) = CAST(b.`four` AS BIGINT))]
+Expression in where clause: [(sum(DISTINCT CAST((outer() + b.`four`) AS BIGINT)) = CAST(udf(four) AS BIGINT))]
 Invalid expressions: [sum(DISTINCT CAST((outer() + b.`four`) AS BIGINT))];

 -- !query 43
 select
-  (select max((select i.unique2 from tenk1 i where i.unique1 = o.unique1)))
+  (select udf(max((select i.unique2 from tenk1 i where i.unique1 = o.unique1))))
 from tenk1 o
 -- !query 43 schema
 struct<>
 -- !query 43 output
 org.apache.spark.sql.AnalysisException
-cannot resolve '`o.unique1`' given input columns: [i.even, i.fivethous, i.four, i.hundred, i.odd, i.string4, i.stringu1, i.stringu2, i.ten, i.tenthous, i.thousand, i.twenty, i.two, i.twothousand, i.unique1, i.unique2]; line 2 pos 63
+cannot resolve '`o.unique1`' given input columns: [i.even, i.fivethous, i.four, i.hundred, i.odd, i.string4, i.stringu1, i.stringu2, i.ten, i.tenthous, i.thousand, i.twenty, i.two, i.twothousand, i.unique1, i.unique2]; line 2 pos 67
```

</p>
</details>

## How was this patch tested?

Manually tested in local.

Also, with JDK 11:

```
Using /.../jdk-11.0.3.jdk/Contents/Home as default JAVA_HOME.
Note, this will be overridden by -java-home if it is set.
[info] Loading project definition from /.../spark/project
[info] Updating {file:/.../spark/project/}spark-build...
...
[info] SQLQueryTestSuite:
...
[info] - udf/pgSQL/udf-aggregates_part1.sql - Scala UDF (17 seconds, 228 milliseconds)
[info] - udf/pgSQL/udf-aggregates_part1.sql - Regular Python UDF (36 seconds, 170 milliseconds)
[info] - udf/pgSQL/udf-aggregates_part1.sql - Scalar Pandas UDF (41 seconds, 132 milliseconds)
...
```

Closes #25110 from HyukjinKwon/SPARK-28270-1.

Authored-by: HyukjinKwon <gurwls223@apache.org>
Signed-off-by: HyukjinKwon <gurwls223@apache.org>
  • Loading branch information
HyukjinKwon committed Jul 11, 2019
1 parent 7021588 commit 19bcce1
Show file tree
Hide file tree
Showing 2 changed files with 91 additions and 91 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -12,9 +12,9 @@
-- Note that currently registered UDF returns a string. So there are some differences, for instance
-- in string cast within UDF in Scala and Python.

SELECT avg(udf(four)) AS avg_1 FROM onek;
SELECT CAST(avg(udf(four)) AS decimal(10,3)) AS avg_1 FROM onek;

SELECT udf(avg(a)) AS avg_32 FROM aggtest WHERE a < 100;
SELECT CAST(udf(avg(a)) AS decimal(10,3)) AS avg_32 FROM aggtest WHERE a < 100;

-- In 7.1, avg(float4) is computed using float8 arithmetic.
-- Round the result to 3 digits to avoid platform-specific results.
Expand All @@ -23,32 +23,32 @@ select CAST(avg(udf(b)) AS Decimal(10,3)) AS avg_107_943 FROM aggtest;
-- `student` has a column with data type POINT, which is not supported by Spark [SPARK-27766]
-- SELECT avg(gpa) AS avg_3_4 FROM ONLY student;

SELECT sum(udf(four)) AS sum_1500 FROM onek;
SELECT CAST(sum(udf(four)) AS int) AS sum_1500 FROM onek;
SELECT udf(sum(a)) AS sum_198 FROM aggtest;
SELECT udf(udf(sum(b))) AS avg_431_773 FROM aggtest;
SELECT CAST(udf(udf(sum(b))) AS decimal(10,3)) AS avg_431_773 FROM aggtest;
-- `student` has a column with data type POINT, which is not supported by Spark [SPARK-27766]
-- SELECT sum(gpa) AS avg_6_8 FROM ONLY student;

SELECT udf(max(four)) AS max_3 FROM onek;
SELECT max(udf(a)) AS max_100 FROM aggtest;
SELECT CAST(udf(udf(max(aggtest.b))) AS int) AS max_324_78 FROM aggtest;
SELECT max(CAST(udf(a) AS int)) AS max_100 FROM aggtest;
SELECT CAST(udf(udf(max(aggtest.b))) AS decimal(10,3)) AS max_324_78 FROM aggtest;
-- `student` has a column with data type POINT, which is not supported by Spark [SPARK-27766]
-- SELECT max(student.gpa) AS max_3_7 FROM student;

SELECT CAST(stddev_pop(udf(b)) AS int) FROM aggtest;
SELECT udf(stddev_samp(b)) FROM aggtest;
SELECT CAST(var_pop(udf(b)) as int) FROM aggtest;
SELECT udf(var_samp(b)) FROM aggtest;
SELECT CAST(stddev_pop(udf(b)) AS decimal(10,3)) FROM aggtest;
SELECT CAST(udf(stddev_samp(b)) AS decimal(10,3)) FROM aggtest;
SELECT CAST(var_pop(udf(b)) AS decimal(10,3)) FROM aggtest;
SELECT CAST(udf(var_samp(b)) AS decimal(10,3)) FROM aggtest;

SELECT udf(stddev_pop(CAST(b AS Decimal(38,0)))) FROM aggtest;
SELECT stddev_samp(CAST(udf(b) AS Decimal(38,0))) FROM aggtest;
SELECT udf(var_pop(CAST(b AS Decimal(38,0)))) FROM aggtest;
SELECT var_samp(udf(CAST(b AS Decimal(38,0)))) FROM aggtest;
SELECT CAST(udf(stddev_pop(CAST(b AS Decimal(38,0)))) AS decimal(10,3)) FROM aggtest;
SELECT CAST(stddev_samp(CAST(udf(b) AS Decimal(38,0))) AS decimal(10,3)) FROM aggtest;
SELECT CAST(udf(var_pop(CAST(b AS Decimal(38,0)))) AS decimal(10,3)) FROM aggtest;
SELECT CAST(var_samp(udf(CAST(b AS Decimal(38,0)))) AS decimal(10,3)) FROM aggtest;

-- population variance is defined for a single tuple, sample variance
-- is not
SELECT udf(var_pop(1.0)), var_samp(udf(2.0));
SELECT stddev_pop(udf(CAST(3.0 AS Decimal(38,0)))), stddev_samp(CAST(udf(4.0) AS Decimal(38,0)));
SELECT CAST(udf(var_pop(1.0)) AS int), var_samp(udf(2.0));
SELECT CAST(stddev_pop(udf(CAST(3.0 AS Decimal(38,0)))) AS int), stddev_samp(CAST(udf(4.0) AS Decimal(38,0)));


-- verify correct results for null and NaN inputs
Expand Down Expand Up @@ -76,9 +76,9 @@ FROM (VALUES ('-Infinity'), ('Infinity')) v(x);


-- test accuracy with a large input offset
SELECT avg(udf(CAST(x AS DOUBLE))), udf(var_pop(CAST(x AS DOUBLE)))
SELECT CAST(avg(udf(CAST(x AS DOUBLE))) AS int), CAST(udf(var_pop(CAST(x AS DOUBLE))) AS decimal(10,3))
FROM (VALUES (100000003), (100000004), (100000006), (100000007)) v(x);
SELECT avg(udf(CAST(x AS DOUBLE))), udf(var_pop(CAST(x AS DOUBLE)))
SELECT CAST(avg(udf(CAST(x AS DOUBLE))) AS long), CAST(udf(var_pop(CAST(x AS DOUBLE))) AS decimal(10,3))
FROM (VALUES (7000000000005), (7000000000007)) v(x);

-- SQL2003 binary aggregates [SPARK-23907]
Expand All @@ -89,8 +89,8 @@ FROM (VALUES (7000000000005), (7000000000007)) v(x);
-- SELECT regr_avgx(b, a), regr_avgy(b, a) FROM aggtest;
-- SELECT regr_r2(b, a) FROM aggtest;
-- SELECT regr_slope(b, a), regr_intercept(b, a) FROM aggtest;
SELECT CAST(udf(covar_pop(b, udf(a))) AS int), CAST(covar_samp(udf(b), a) as int) FROM aggtest;
SELECT corr(b, udf(a)) FROM aggtest;
SELECT CAST(udf(covar_pop(b, udf(a))) AS decimal(10,3)), CAST(covar_samp(udf(b), a) as decimal(10,3)) FROM aggtest;
SELECT CAST(corr(b, udf(a)) AS decimal(10,3)) FROM aggtest;


-- test accum and combine functions directly [SPARK-23907]
Expand Down Expand Up @@ -122,7 +122,7 @@ SELECT corr(b, udf(a)) FROM aggtest;
SELECT count(udf(four)) AS cnt_1000 FROM onek;
SELECT udf(count(DISTINCT four)) AS cnt_4 FROM onek;

select ten, udf(count(*)), sum(udf(four)) from onek
select ten, udf(count(*)), CAST(sum(udf(four)) AS int) from onek
group by ten order by ten;

select ten, count(udf(four)), udf(sum(DISTINCT four)) from onek
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -3,19 +3,19 @@


-- !query 0
SELECT avg(udf(four)) AS avg_1 FROM onek
SELECT CAST(avg(udf(four)) AS decimal(10,3)) AS avg_1 FROM onek
-- !query 0 schema
struct<avg_1:double>
struct<avg_1:decimal(10,3)>
-- !query 0 output
1.5


-- !query 1
SELECT udf(avg(a)) AS avg_32 FROM aggtest WHERE a < 100
SELECT CAST(udf(avg(a)) AS decimal(10,3)) AS avg_32 FROM aggtest WHERE a < 100
-- !query 1 schema
struct<avg_32:string>
struct<avg_32:decimal(10,3)>
-- !query 1 output
32.666666666666664
32.667


-- !query 2
Expand All @@ -27,11 +27,11 @@ struct<avg_107_943:decimal(10,3)>


-- !query 3
SELECT sum(udf(four)) AS sum_1500 FROM onek
SELECT CAST(sum(udf(four)) AS int) AS sum_1500 FROM onek
-- !query 3 schema
struct<sum_1500:double>
struct<sum_1500:int>
-- !query 3 output
1500.0
1500


-- !query 4
Expand All @@ -43,11 +43,11 @@ struct<sum_198:string>


-- !query 5
SELECT udf(udf(sum(b))) AS avg_431_773 FROM aggtest
SELECT CAST(udf(udf(sum(b))) AS decimal(10,3)) AS avg_431_773 FROM aggtest
-- !query 5 schema
struct<avg_431_773:string>
struct<avg_431_773:decimal(10,3)>
-- !query 5 output
431.77260909229517
431.773


-- !query 6
Expand All @@ -59,99 +59,99 @@ struct<max_3:string>


-- !query 7
SELECT max(udf(a)) AS max_100 FROM aggtest
SELECT max(CAST(udf(a) AS int)) AS max_100 FROM aggtest
-- !query 7 schema
struct<max_100:string>
struct<max_100:int>
-- !query 7 output
56
100


-- !query 8
SELECT CAST(udf(udf(max(aggtest.b))) AS int) AS max_324_78 FROM aggtest
SELECT CAST(udf(udf(max(aggtest.b))) AS decimal(10,3)) AS max_324_78 FROM aggtest
-- !query 8 schema
struct<max_324_78:int>
struct<max_324_78:decimal(10,3)>
-- !query 8 output
324
324.78


-- !query 9
SELECT CAST(stddev_pop(udf(b)) AS int) FROM aggtest
SELECT CAST(stddev_pop(udf(b)) AS decimal(10,3)) FROM aggtest
-- !query 9 schema
struct<CAST(stddev_pop(CAST(udf(b) AS DOUBLE)) AS INT):int>
struct<CAST(stddev_pop(CAST(udf(b) AS DOUBLE)) AS DECIMAL(10,3)):decimal(10,3)>
-- !query 9 output
131
131.107


-- !query 10
SELECT udf(stddev_samp(b)) FROM aggtest
SELECT CAST(udf(stddev_samp(b)) AS decimal(10,3)) FROM aggtest
-- !query 10 schema
struct<udf(stddev_samp(cast(b as double))):string>
struct<CAST(udf(stddev_samp(cast(b as double))) AS DECIMAL(10,3)):decimal(10,3)>
-- !query 10 output
151.38936080399804
151.389


-- !query 11
SELECT CAST(var_pop(udf(b)) as int) FROM aggtest
SELECT CAST(var_pop(udf(b)) AS decimal(10,3)) FROM aggtest
-- !query 11 schema
struct<CAST(var_pop(CAST(udf(b) AS DOUBLE)) AS INT):int>
struct<CAST(var_pop(CAST(udf(b) AS DOUBLE)) AS DECIMAL(10,3)):decimal(10,3)>
-- !query 11 output
17189
17189.054


-- !query 12
SELECT udf(var_samp(b)) FROM aggtest
SELECT CAST(udf(var_samp(b)) AS decimal(10,3)) FROM aggtest
-- !query 12 schema
struct<udf(var_samp(cast(b as double))):string>
struct<CAST(udf(var_samp(cast(b as double))) AS DECIMAL(10,3)):decimal(10,3)>
-- !query 12 output
22918.738564643096
22918.739


-- !query 13
SELECT udf(stddev_pop(CAST(b AS Decimal(38,0)))) FROM aggtest
SELECT CAST(udf(stddev_pop(CAST(b AS Decimal(38,0)))) AS decimal(10,3)) FROM aggtest
-- !query 13 schema
struct<udf(stddev_pop(cast(cast(b as decimal(38,0)) as double))):string>
struct<CAST(udf(stddev_pop(cast(cast(b as decimal(38,0)) as double))) AS DECIMAL(10,3)):decimal(10,3)>
-- !query 13 output
131.18117242958306
131.181


-- !query 14
SELECT stddev_samp(CAST(udf(b) AS Decimal(38,0))) FROM aggtest
SELECT CAST(stddev_samp(CAST(udf(b) AS Decimal(38,0))) AS decimal(10,3)) FROM aggtest
-- !query 14 schema
struct<stddev_samp(CAST(CAST(udf(b) AS DECIMAL(38,0)) AS DOUBLE)):double>
struct<CAST(stddev_samp(CAST(CAST(udf(b) AS DECIMAL(38,0)) AS DOUBLE)) AS DECIMAL(10,3)):decimal(10,3)>
-- !query 14 output
151.47497042966097
151.475


-- !query 15
SELECT udf(var_pop(CAST(b AS Decimal(38,0)))) FROM aggtest
SELECT CAST(udf(var_pop(CAST(b AS Decimal(38,0)))) AS decimal(10,3)) FROM aggtest
-- !query 15 schema
struct<udf(var_pop(cast(cast(b as decimal(38,0)) as double))):string>
struct<CAST(udf(var_pop(cast(cast(b as decimal(38,0)) as double))) AS DECIMAL(10,3)):decimal(10,3)>
-- !query 15 output
17208.5


-- !query 16
SELECT var_samp(udf(CAST(b AS Decimal(38,0)))) FROM aggtest
SELECT CAST(var_samp(udf(CAST(b AS Decimal(38,0)))) AS decimal(10,3)) FROM aggtest
-- !query 16 schema
struct<var_samp(CAST(udf(cast(b as decimal(38,0))) AS DOUBLE)):double>
struct<CAST(var_samp(CAST(udf(cast(b as decimal(38,0))) AS DOUBLE)) AS DECIMAL(10,3)):decimal(10,3)>
-- !query 16 output
22944.666666666668
22944.667


-- !query 17
SELECT udf(var_pop(1.0)), var_samp(udf(2.0))
SELECT CAST(udf(var_pop(1.0)) AS int), var_samp(udf(2.0))
-- !query 17 schema
struct<udf(var_pop(cast(1.0 as double))):string,var_samp(CAST(udf(2.0) AS DOUBLE)):double>
struct<CAST(udf(var_pop(cast(1.0 as double))) AS INT):int,var_samp(CAST(udf(2.0) AS DOUBLE)):double>
-- !query 17 output
0.0 NaN
0 NaN


-- !query 18
SELECT stddev_pop(udf(CAST(3.0 AS Decimal(38,0)))), stddev_samp(CAST(udf(4.0) AS Decimal(38,0)))
SELECT CAST(stddev_pop(udf(CAST(3.0 AS Decimal(38,0)))) AS int), stddev_samp(CAST(udf(4.0) AS Decimal(38,0)))
-- !query 18 schema
struct<stddev_pop(CAST(udf(cast(3.0 as decimal(38,0))) AS DOUBLE)):double,stddev_samp(CAST(CAST(udf(4.0) AS DECIMAL(38,0)) AS DOUBLE)):double>
struct<CAST(stddev_pop(CAST(udf(cast(3.0 as decimal(38,0))) AS DOUBLE)) AS INT):int,stddev_samp(CAST(CAST(udf(4.0) AS DECIMAL(38,0)) AS DOUBLE)):double>
-- !query 18 output
0.0 NaN
0 NaN


-- !query 19
Expand Down Expand Up @@ -262,37 +262,37 @@ NaN NaN


-- !query 32
SELECT avg(udf(CAST(x AS DOUBLE))), udf(var_pop(CAST(x AS DOUBLE)))
SELECT CAST(avg(udf(CAST(x AS DOUBLE))) AS int), CAST(udf(var_pop(CAST(x AS DOUBLE))) AS decimal(10,3))
FROM (VALUES (100000003), (100000004), (100000006), (100000007)) v(x)
-- !query 32 schema
struct<avg(CAST(udf(cast(x as double)) AS DOUBLE)):double,udf(var_pop(cast(x as double))):string>
struct<CAST(avg(CAST(udf(cast(x as double)) AS DOUBLE)) AS INT):int,CAST(udf(var_pop(cast(x as double))) AS DECIMAL(10,3)):decimal(10,3)>
-- !query 32 output
1.00000005E8 2.5
100000005 2.5


-- !query 33
SELECT avg(udf(CAST(x AS DOUBLE))), udf(var_pop(CAST(x AS DOUBLE)))
SELECT CAST(avg(udf(CAST(x AS DOUBLE))) AS long), CAST(udf(var_pop(CAST(x AS DOUBLE))) AS decimal(10,3))
FROM (VALUES (7000000000005), (7000000000007)) v(x)
-- !query 33 schema
struct<avg(CAST(udf(cast(x as double)) AS DOUBLE)):double,udf(var_pop(cast(x as double))):string>
struct<CAST(avg(CAST(udf(cast(x as double)) AS DOUBLE)) AS BIGINT):bigint,CAST(udf(var_pop(cast(x as double))) AS DECIMAL(10,3)):decimal(10,3)>
-- !query 33 output
7.000000000006E12 1.0
7000000000006 1


-- !query 34
SELECT CAST(udf(covar_pop(b, udf(a))) AS int), CAST(covar_samp(udf(b), a) as int) FROM aggtest
SELECT CAST(udf(covar_pop(b, udf(a))) AS decimal(10,3)), CAST(covar_samp(udf(b), a) as decimal(10,3)) FROM aggtest
-- !query 34 schema
struct<CAST(udf(covar_pop(cast(b as double), cast(udf(a) as double))) AS INT):int,CAST(covar_samp(CAST(udf(b) AS DOUBLE), CAST(a AS DOUBLE)) AS INT):int>
struct<CAST(udf(covar_pop(cast(b as double), cast(udf(a) as double))) AS DECIMAL(10,3)):decimal(10,3),CAST(covar_samp(CAST(udf(b) AS DOUBLE), CAST(a AS DOUBLE)) AS DECIMAL(10,3)):decimal(10,3)>
-- !query 34 output
653 871
653.629 871.505


-- !query 35
SELECT corr(b, udf(a)) FROM aggtest
SELECT CAST(corr(b, udf(a)) AS decimal(10,3)) FROM aggtest
-- !query 35 schema
struct<corr(CAST(b AS DOUBLE), CAST(udf(a) AS DOUBLE)):double>
struct<CAST(corr(CAST(b AS DOUBLE), CAST(udf(a) AS DOUBLE)) AS DECIMAL(10,3)):decimal(10,3)>
-- !query 35 output
0.1396345165178734
0.14


-- !query 36
Expand All @@ -312,21 +312,21 @@ struct<cnt_4:string>


-- !query 38
select ten, udf(count(*)), sum(udf(four)) from onek
select ten, udf(count(*)), CAST(sum(udf(four)) AS int) from onek
group by ten order by ten
-- !query 38 schema
struct<ten:int,udf(count(1)):string,sum(CAST(udf(four) AS DOUBLE)):double>
struct<ten:int,udf(count(1)):string,CAST(sum(CAST(udf(four) AS DOUBLE)) AS INT):int>
-- !query 38 output
0 100 100.0
1 100 200.0
2 100 100.0
3 100 200.0
4 100 100.0
5 100 200.0
6 100 100.0
7 100 200.0
8 100 100.0
9 100 200.0
0 100 100
1 100 200
2 100 100
3 100 200
4 100 100
5 100 200
6 100 100
7 100 200
8 100 100
9 100 200


-- !query 39
Expand Down

0 comments on commit 19bcce1

Please sign in to comment.