
Fix Spark UT issues in RapidsDataFrameAggregateSuite #10943

Merged

2 commits merged on Jun 8, 2024

Changes from all commits
@@ -19,12 +19,67 @@
spark-rapids-shim-json-lines ***/
package org.apache.spark.sql.rapids.suites

import org.apache.spark.sql.{DataFrameAggregateSuite, Row}
import org.apache.spark.sql.functions._
import org.apache.spark.sql.rapids.utils.RapidsSQLTestsTrait
import org.apache.spark.sql.types._

class RapidsDataFrameAggregateSuite extends DataFrameAggregateSuite with RapidsSQLTestsTrait {
  import testImplicits._

  testRapids("collect functions") {
    val df = Seq((1, 2), (2, 2), (3, 4)).toDF("a", "b")
    checkAnswer(
      df.select(sort_array(collect_list($"a")), sort_array(collect_list($"b"))),
      Seq(Row(Seq(1, 2, 3), Seq(2, 2, 4)))
    )
    checkAnswer(
      df.select(sort_array(collect_set($"a")), sort_array(collect_set($"b"))),
      Seq(Row(Seq(1, 2, 3), Seq(2, 4)))
    )

    checkDataset(
      df.select(sort_array(collect_set($"a")).as("aSet")).as[Set[Int]],
      Set(1, 2, 3))
    checkDataset(
      df.select(sort_array(collect_set($"b")).as("bSet")).as[Set[Int]],
      Set(2, 4))
    checkDataset(
      df.select(sort_array(collect_set($"a")), sort_array(collect_set($"b")))
        .as[(Set[Int], Set[Int])], Seq(Set(1, 2, 3) -> Set(2, 4)): _*)
  }

  testRapids("collect functions structs") {
    val df = Seq((1, 2, 2), (2, 2, 2), (3, 4, 1))
      .toDF("a", "x", "y")
      .select($"a", struct($"x", $"y").as("b"))
    checkAnswer(
      df.select(sort_array(collect_list($"a")), sort_array(collect_list($"b"))),
      Seq(Row(Seq(1, 2, 3), Seq(Row(2, 2), Row(2, 2), Row(4, 1))))
    )
    checkAnswer(
      df.select(sort_array(collect_set($"a")), sort_array(collect_set($"b"))),
      Seq(Row(Seq(1, 2, 3), Seq(Row(2, 2), Row(4, 1))))
    )
  }

  testRapids("SPARK-17641: collect functions should not collect null values") {
    val df = Seq(("1", 2), (null, 2), ("1", 4)).toDF("a", "b")
    checkAnswer(
      df.select(sort_array(collect_list($"a")), sort_array(collect_list($"b"))),
      Seq(Row(Seq("1", "1"), Seq(2, 2, 4)))
    )
    checkAnswer(
      df.select(sort_array(collect_set($"a")), sort_array(collect_set($"b"))),
      Seq(Row(Seq("1"), Seq(2, 4)))
    )
  }

  testRapids("collect functions should be able to cast to array type with no null values") {
    val df = Seq(1, 2).toDF("a")
    checkAnswer(df.select(sort_array(collect_list("a")) cast ArrayType(IntegerType, false)),
      Seq(Row(Seq(1, 2))))
    checkAnswer(df.select(sort_array(collect_set("a")) cast ArrayType(FloatType, false)),
      Seq(Row(Seq(1.0, 2.0))))
  }
}
@@ -83,6 +83,7 @@ abstract class BackendTestSettings {
// or a description like "This simply can't work on GPU".
// It should never be "unknown" or "need investigation"
case class KNOWN_ISSUE(reason: String) extends ExcludeReason
case class ADJUST_UT(reason: String) extends ExcludeReason
Contributor:
What's the purpose of this vs. KNOWN_ISSUE? Are we intending to fix these? If so, we should file a tracking issue and use KNOWN_ISSUE with the issue URL as the description. If we're not intending to fix these, why not use KNOWN_ISSUE with the same description?

thirtiseven (Collaborator, author), Jun 5, 2024:
It marks test cases where the Spark UT case itself has an issue (something is wrong or does not work for the plugin), but we still want to test what it was meant to test by adjusting the case. Maybe a better name would be INVALID_CASE or SPARK_UT_ISSUE?

cc @binmahone

Collaborator:

I agree KNOWN_ISSUE is good enough. Why not file an issue for tolerating non-determinism by sort and reference it?

binmahone (Collaborator), Jun 6, 2024:

ADJUST_UT in our context means "the Spark test case will work for RAPIDS; there is no bug in RAPIDS, but the test case itself needs some modification". For example, a Spark test case might look like:

test("testcase1") {
  val x = spark.sql("select sum(x), y from testdata group by y").collect()
  // assertions hard-code a specific row order
  assert(x(0) == Row(100, "x0"))
  assert(x(1) == Row(200, "x1"))
}

Notice the operations and assertions are hard-coded in the test case.
We know that RAPIDS may return results in a different order, so the test case will fail for us.
From the framework's perspective, we have no way to ask it to sort before asserting results.
However, the test case is still meaningful, and we should enable it to increase test coverage.

This is where ADJUST_UT can help. we can adjust the above test case to a new one (and at the same time exclude the old test case with reason ADJUST_UT):

test("NEW testcase1") {
  val x = spark.sql("select sum(x), y from testdata group by y")
  // sort x to make the result deterministic
  ...
  // make assertions based on the deterministic result
  ...
}

By doing this, the test case testcase1 is considered "solved"; there will be NO follow-up issue, so it's not a known issue.

Based on our experience with Gluten, this type of case is very common, so I think it's necessary to add the ADJUST_UT enum.

What do you think? @jlowe @gerashegalov
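To make the pattern concrete, here is a Spark-free sketch of the normalization step such an adjusted test would perform. The object, case class, and sample values are hypothetical illustrations, not code from this PR:

```scala
// Hypothetical sketch of the ADJUST_UT pattern: normalize a result whose row
// order is engine-dependent before asserting on it. QueryRow and the sample
// data below stand in for the Spark query output in the comment above.
object AdjustUtSketch {
  final case class QueryRow(sumX: Long, y: String)

  // Sort by the group key `y` so the assertion no longer depends on row order.
  def deterministic(rows: Seq[QueryRow]): Seq[QueryRow] = rows.sortBy(_.y)

  def main(args: Array[String]): Unit = {
    // The same logical result, produced in two different orders by two engines.
    val cpuResult = Seq(QueryRow(100L, "x0"), QueryRow(200L, "x1"))
    val gpuResult = Seq(QueryRow(200L, "x1"), QueryRow(100L, "x0"))
    assert(deterministic(cpuResult) == deterministic(gpuResult))
    println("results match after normalization")
  }
}
```

The same idea underlies the sort_array calls in the suite above: sorting both sides removes the only legitimate source of divergence before comparing.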

Collaborator:

I am ok with this, I like the ADJUST_UT label. We can always go back and look at all the tests we adjusted (audit them).

Collaborator:

> I am ok with this, I like the ADJUST_UT label. We can always go back and look at all the tests we adjusted (audit them).

thx Alessandro

Collaborator:

I see, this makes sense to me. We can go with this for this PR.

I wonder: if we define test("testcase1") in our code, does it override the test in the base class? If so, we could just do the override and not need a special ADJUST_UT exclude tag.

Currently the framework does not have this feature (everything has to be explicit now). But I agree with you that this is a good idea. @HaoYang670 please raise a framework feature request if you also see potential value in this.

Another idea for collect_list-style issues with SQL: we could probably register our own UDAF as collect_list, which would either be a simple delegate to the real collect_list, or collect_list followed by a sort.

Yeah, this would be a test case where ADJUST_UT is ultimately NOT needed. However, the extra sort will be a performance hit. In my view, we can tolerate minor result differences from Vanilla Spark, as long as both are correct answers under the ANSI SQL standard. Do you think our team can reach consensus on this? @gerashegalov


gerashegalov (Collaborator), Jun 7, 2024:

> However, the extra sort will be a performance hit. In my view, we can tolerate minor result differences from Vanilla Spark, as long as both are correct answers under the ANSI SQL standard.

What I mean is overriding collect_list only in the test code, injecting the sort only in specially tagged tests where we know that the order variance is permissible. We don't need to pay an unnecessary performance penalty in production code. I am bringing this up as an idea to discuss for follow-up work. This PR is fine.
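Outside of Spark, that follow-up idea might look roughly like this. Everything here (the object, the function name, the tagging flag) is a hypothetical sketch of the delegate-or-sort behavior, not an implementation from the plugin:

```scala
// Hypothetical sketch: a test-only collect wrapper that returns the raw result
// untouched, or sorts it when the test is tagged as order-insensitive.
object TestCollectSketch {
  // Delegate to the raw result, or sort it when order variance is permissible.
  def collectForTest[T](values: Seq[T], orderInsensitive: Boolean)
                       (implicit ord: Ordering[T]): Seq[T] =
    if (orderInsensitive) values.sorted else values

  def main(args: Array[String]): Unit = {
    val gpuOrder = Seq(3, 1, 2) // rows as a GPU plan might emit them
    // Tagged test: compare against a sorted expectation.
    assert(collectForTest(gpuOrder, orderInsensitive = true) == Seq(1, 2, 3))
    // Untagged test: the raw order is preserved and asserted as-is.
    assert(collectForTest(gpuOrder, orderInsensitive = false) == Seq(3, 1, 2))
    println("ok")
  }
}
```

The design choice is that only tagged tests pay the sort cost, so production code paths and order-sensitive tests are unaffected.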

Collaborator:

hi @gerashegalov, I can roughly imagine what you're suggesting. Still, it would be good if you could bring it up as a formal proposal (for further discussion). For now, I'm not quite sure to what extent your idea can solve the inconsistency problem.

case class WONT_FIX_ISSUE(reason: String) extends ExcludeReason


@@ -35,11 +35,11 @@ class RapidsTestSettings extends BackendTestSettings {
   .exclude("casting to fixed-precision decimals", KNOWN_ISSUE("https://github.com/NVIDIA/spark-rapids/issues/10771"))
   .exclude("SPARK-32828: cast from a derived user-defined type to a base type", KNOWN_ISSUE("https://github.com/NVIDIA/spark-rapids/issues/10771"))
 enableSuite[RapidsDataFrameAggregateSuite]
-  .exclude("collect functions", KNOWN_ISSUE("https://github.com/NVIDIA/spark-rapids/issues/10772"))
-  .exclude("collect functions structs", KNOWN_ISSUE("https://github.com/NVIDIA/spark-rapids/issues/10772"))
-  .exclude("collect functions should be able to cast to array type with no null values", KNOWN_ISSUE("https://github.com/NVIDIA/spark-rapids/issues/10772"))
-  .exclude("SPARK-17641: collect functions should not collect null values", KNOWN_ISSUE("https://github.com/NVIDIA/spark-rapids/issues/10772"))
-  .exclude("SPARK-19471: AggregationIterator does not initialize the generated result projection before using it", KNOWN_ISSUE("https://github.com/NVIDIA/spark-rapids/issues/10772"))
+  .exclude("collect functions", ADJUST_UT("order of elements in the array is non-deterministic in collect"))
+  .exclude("collect functions structs", ADJUST_UT("order of elements in the array is non-deterministic in collect"))
+  .exclude("collect functions should be able to cast to array type with no null values", ADJUST_UT("order of elements in the array is non-deterministic in collect"))
+  .exclude("SPARK-17641: collect functions should not collect null values", ADJUST_UT("order of elements in the array is non-deterministic in collect"))
+  .exclude("SPARK-19471: AggregationIterator does not initialize the generated result projection before using it", WONT_FIX_ISSUE("Codegen related UT, not applicable for GPU"))
   .exclude("SPARK-24788: RelationalGroupedDataset.toString with unresolved exprs should not fail", KNOWN_ISSUE("https://github.com/NVIDIA/spark-rapids/issues/10801"))
 enableSuite[RapidsJsonExpressionsSuite]
   .exclude("from_json - invalid data", KNOWN_ISSUE("https://github.com/NVIDIA/spark-rapids/issues/10849"))