[SPARK-31937][SQL] Support processing ArrayType/MapType/StructType data using no-serde mode script transform #30957

Status: Closed (wants to merge 25 commits)

Commits (25), all by AngersZhuuuu:
adc9ded (Dec 29, 2020) [SPARK-31937][SQL] Support processing array and map type using spark …
6a7438b (Dec 29, 2020) Update CatalystTypeConverters.scala
d3b9cec (Dec 29, 2020) fix failed UT
fdd5225 (Dec 29, 2020) Update SparkScriptTransformationSuite.scala
aa16c8f (Dec 29, 2020) Update BaseScriptTransformationSuite.scala
092c927 (Dec 29, 2020) Update BaseScriptTransformationExec.scala
9761c0e (Dec 29, 2020) Merge branch 'master' into SPARK-31937
28ad7fa (Dec 29, 2020) Update BaseScriptTransformationSuite.scala
9ac75fc (Jan 4, 2021) Merge branch 'master' into SPARK-31937
33d8b5b (Jan 4, 2021) Update BaseScriptTransformationExec.scala
63f07eb (Feb 4, 2021) follow comment
b631b70 (Feb 4, 2021) Update BaseScriptTransformationExec.scala
b7e7f92 (Feb 5, 2021) follow comment
8dec5a1 (Feb 5, 2021) follow comment
529d54d (Feb 6, 2021) Update BaseScriptTransformationExec.scala
4f0e78f (Feb 6, 2021) Avoid construct JsonToStructs repeated
ed8c54c (Feb 6, 2021) remove unused UT
520f4b8 (Apr 16, 2021) Update sql/core/src/main/scala/org/apache/spark/sql/execution/BaseScr…
97f9d58 (Apr 16, 2021) Merge branch 'master' into SPARK-31937
b5a4268 (Apr 18, 2021) [SPARK-35097][SQL] Add column name to SparkUpgradeException about anc…
76a746e (Apr 18, 2021) Revert "[SPARK-35097][SQL] Add column name to SparkUpgradeException a…
6aa05fc (Apr 19, 2021) fix UT
9e3f808 (Apr 19, 2021) Revert "fix UT"
3f51d27 (Apr 19, 2021) fix UT
adf8a66 (Apr 19, 2021) Update sql-migration-guide.md
sql/core/src/main/scala/org/apache/spark/sql/execution/BaseScriptTransformationExec.scala

@@ -30,7 +30,7 @@ import org.apache.spark.{SparkException, SparkFiles, TaskContext}
 import org.apache.spark.internal.Logging
 import org.apache.spark.rdd.RDD
 import org.apache.spark.sql.catalyst.{CatalystTypeConverters, InternalRow}
-import org.apache.spark.sql.catalyst.expressions.{Attribute, AttributeSet, Cast, Expression, GenericInternalRow, UnsafeProjection}
+import org.apache.spark.sql.catalyst.expressions.{Attribute, AttributeSet, Cast, Expression, GenericInternalRow, JsonToStructs, Literal, StructsToJson, UnsafeProjection}
 import org.apache.spark.sql.catalyst.plans.logical.ScriptInputOutputSchema
 import org.apache.spark.sql.catalyst.plans.physical.Partitioning
 import org.apache.spark.sql.catalyst.util.{DateTimeUtils, IntervalUtils}

@@ -47,7 +47,14 @@ trait BaseScriptTransformationExec extends UnaryExecNode {
   def ioschema: ScriptTransformationIOSchema

   protected lazy val inputExpressionsWithoutSerde: Seq[Expression] = {
-    input.map(Cast(_, StringType).withTimeZone(conf.sessionLocalTimeZone))
+    input.map { in =>
+      in.dataType match {
+        case _: ArrayType | _: MapType | _: StructType =>
+          new StructsToJson(ioschema.inputSerdeProps.toMap, in)
+            .withTimeZone(conf.sessionLocalTimeZone)
+        case _ => Cast(in, StringType).withTimeZone(conf.sessionLocalTimeZone)
+      }
+    }
   }

   override def producedAttributes: AttributeSet = outputSet -- inputSet
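
The hunk above is the input side of the feature: columns whose type is ArrayType, MapType, or StructType are now serialized with StructsToJson instead of being cast to a string, so the child script receives JSON text. A minimal sketch of the same conversion through the public to_json function (the DataFrame surface over StructsToJson), assuming a spark-shell session on a 3.x build; the column names here are illustrative:

import org.apache.spark.sql.functions.to_json
import spark.implicits._

// One row holding an array, a map, and a struct, mirroring the suite's data.
val df = Seq((Array(1, 2), Map("a" -> 1), (1, "x"))).toDF("arr", "m", "s")

// Roughly what the script reads on stdin for these columns (tab-separated
// under the default no-serde IO schema): [1,2], {"a":1}, {"_1":1,"_2":"x"}.
df.select(to_json($"arr"), to_json($"m"), to_json($"s")).show(truncate = false)
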
@@ -220,6 +227,11 @@
       case CalendarIntervalType => wrapperConvertException(
         data => IntervalUtils.stringToInterval(UTF8String.fromString(data)),
         converter)
+      case _: ArrayType | _: MapType | _: StructType =>
+        val complexTypeFactory = JsonToStructs(attr.dataType,
+          ioschema.outputSerdeProps.toMap, Literal(null), Some(conf.sessionLocalTimeZone))
+        wrapperConvertException(data =>
+          complexTypeFactory.nullSafeEval(UTF8String.fromString(data)), any => any)
       case udt: UserDefinedType[_] =>
         wrapperConvertException(data => udt.deserialize(data), converter)
       case dt =>
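
The output side, in the hunk above, builds one JsonToStructs per complex-typed output attribute (hoisted out of the per-line path by commit 4f0e78f) and evaluates it against every line the script emits. A sketch of that decode step against the same Catalyst internals, assuming spark-catalyst is on the classpath; the schema, time zone, and input line are illustrative:

import org.apache.spark.sql.catalyst.expressions.{JsonToStructs, Literal}
import org.apache.spark.sql.types.{ArrayType, IntegerType}
import org.apache.spark.unsafe.types.UTF8String

// Constructed once per output attribute, not once per row; Literal(null) is a
// placeholder child because the expression is driven through nullSafeEval.
val parser = JsonToStructs(ArrayType(IntegerType), Map.empty[String, String],
  Literal(null), Some("UTC"))

// Applied to each line read back from the script: JSON text in, a Catalyst
// ArrayData holding 1, 2, 3 out.
val parsed = parser.nullSafeEval(UTF8String.fromString("[1,2,3]"))
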
sql/core/src/test/scala/org/apache/spark/sql/execution/BaseScriptTransformationSuite.scala

@@ -302,14 +302,16 @@ abstract class BaseScriptTransformationSuite extends SparkPlanTest with SQLTestUtils
          script = "cat",
          output = Seq(
            AttributeReference("a", CalendarIntervalType)(),
-           AttributeReference("b", StringType)(),
-           AttributeReference("c", StringType)(),
-           AttributeReference("d", StringType)(),
+           AttributeReference("b", ArrayType(IntegerType))(),
+           AttributeReference("c", MapType(StringType, IntegerType))(),
+           AttributeReference("d", StructType(
+             Array(StructField("_1", IntegerType),
+               StructField("_2", IntegerType))))(),
            AttributeReference("e", new SimpleTupleUDT)()),
          child = child,
          ioschema = defaultIOSchema
        ),
-       df.select('a, 'b.cast("string"), 'c.cast("string"), 'd.cast("string"), 'e).collect())
+       df.select('a, 'b, 'c, 'd, 'e).collect())
    }
  }

@@ -471,6 +473,60 @@
     }
   }

+  test("SPARK-31937: Script transform support ArrayType/MapType/StructType (no serde)") {
+    assume(TestUtils.testCommandAvailable("python"))
+    withTempView("v") {
+      val df = Seq(
+        (Array(0, 1, 2), Array(Array(0, 1), Array(2)),
+          Map("a" -> 1), Map("b" -> Array("a", "b"))),
+        (Array(3, 4, 5), Array(Array(3, 4), Array(5)),
+          Map("b" -> 2), Map("c" -> Array("c", "d"))),
+        (Array(6, 7, 8), Array(Array(6, 7), Array(8)),
+          Map("c" -> 3), Map("d" -> Array("e", "f")))
+      ).toDF("a", "b", "c", "d")
+        .select('a, 'b, 'c, 'd,
+          struct('a, 'b).as("e"),
+          struct('a, 'd).as("f"),
+          struct(struct('a, 'b), struct('a, 'd)).as("g")
+        )
+
+      checkAnswer(
+        df,
+        (child: SparkPlan) => createScriptTransformationExec(
+          input = Seq(
+            df.col("a").expr,
+            df.col("b").expr,
+            df.col("c").expr,
+            df.col("d").expr,
+            df.col("e").expr,
+            df.col("f").expr,
+            df.col("g").expr),
+          script = "cat",
+          output = Seq(
+            AttributeReference("a", ArrayType(IntegerType))(),
+            AttributeReference("b", ArrayType(ArrayType(IntegerType)))(),
+            AttributeReference("c", MapType(StringType, IntegerType))(),
+            AttributeReference("d", MapType(StringType, ArrayType(StringType)))(),
+            AttributeReference("e", StructType(
+              Array(StructField("a", ArrayType(IntegerType)),
+                StructField("b", ArrayType(ArrayType(IntegerType))))))(),
+            AttributeReference("f", StructType(
+              Array(StructField("a", ArrayType(IntegerType)),
+                StructField("d", MapType(StringType, ArrayType(StringType))))))(),
+            AttributeReference("g", StructType(
+              Array(StructField("col1", StructType(
+                Array(StructField("a", ArrayType(IntegerType)),
+                  StructField("b", ArrayType(ArrayType(IntegerType)))))),
+                StructField("col2", StructType(
+                  Array(StructField("a", ArrayType(IntegerType)),
+                    StructField("d", MapType(StringType, ArrayType(StringType)))))))))()),
+          child = child,
+          ioschema = defaultIOSchema
+        ),
+        df.select('a, 'b, 'c, 'd, 'e, 'f, 'g).collect())
+    }
+  }
+
   test("SPARK-33934: Add SparkFile's root dir to env property PATH") {
     assume(TestUtils.testCommandAvailable("python"))
     val scriptFilePath = copyAndGetResourceFile("test_script.py", ".py").getAbsoluteFile
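
The new test above asserts a full round trip: complex columns go out to the script as JSON, pass through cat untouched, and are parsed back into the declared output types. A hypothetical spark-shell session on a build that includes this patch (requires cat on PATH) shows the same behavior end to end:

spark.sql(
  """
    |SELECT TRANSFORM(a, b, c)
    |USING 'cat' AS (a array<int>, b map<string, int>, c struct<col1:int, col2:string>)
    |FROM VALUES (array(1, 2), map('k', 1), struct(1, 'x')) t(a, b, c)
  """.stripMargin).show(truncate = false)
// Expected shape of the result (rendering is approximate):
// +------+--------+------+
// |a     |b       |c     |
// +------+--------+------+
// |[1, 2]|{k -> 1}|{1, x}|
// +------+--------+------+
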
sql/core/src/test/scala/org/apache/spark/sql/execution/SparkScriptTransformationSuite.scala

@@ -17,7 +17,7 @@

 package org.apache.spark.sql.execution

-import org.apache.spark.{SparkException, TestUtils}
+import org.apache.spark.TestUtils
 import org.apache.spark.sql.catalyst.expressions.{Attribute, Expression}
 import org.apache.spark.sql.catalyst.parser.ParseException
 import org.apache.spark.sql.test.SharedSparkSession

@@ -59,44 +59,4 @@ class SparkScriptTransformationSuite extends BaseScriptTransformationSuite with SharedSparkSession
       assert(e.contains("TRANSFORM with serde is only supported in hive mode"))
     }
   }
-
test("SPARK-32106: TRANSFORM doesn't support ArrayType/MapType/StructType " +
"as output data type (no serde)") {
assume(TestUtils.testCommandAvailable("/bin/bash"))
// check for ArrayType
val e1 = intercept[SparkException] {
sql(
"""
|SELECT TRANSFORM(a)
|USING 'cat' AS (a array<int>)
|FROM VALUES (array(1, 1), map('1', 1), struct(1, 'a')) t(a, b, c)
""".stripMargin).collect()
}.getMessage
assert(e1.contains("SparkScriptTransformation without serde does not support" +
" ArrayType as output data type"))

// check for MapType
val e2 = intercept[SparkException] {
sql(
"""
|SELECT TRANSFORM(b)
|USING 'cat' AS (b map<int, string>)
|FROM VALUES (array(1, 1), map('1', 1), struct(1, 'a')) t(a, b, c)
""".stripMargin).collect()
}.getMessage
assert(e2.contains("SparkScriptTransformation without serde does not support" +
" MapType as output data type"))

// check for StructType
val e3 = intercept[SparkException] {
sql(
"""
|SELECT TRANSFORM(c)
|USING 'cat' AS (c struct<col1:int, col2:string>)
|FROM VALUES (array(1, 1), map('1', 1), struct(1, 'a')) t(a, b, c)
""".stripMargin).collect()
}.getMessage
assert(e3.contains("SparkScriptTransformation without serde does not support" +
" StructType as output data type"))
}
}
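
These negative tests are deleted because the queries they guarded against now succeed. For example, the ArrayType query above, hypothetically rerun in spark-shell on a build with this patch:

// Previously failed with "SparkScriptTransformation without serde does not
// support ArrayType as output data type"; now returns a typed array column.
sql(
  """
    |SELECT TRANSFORM(a)
    |USING 'cat' AS (a array<int>)
    |FROM VALUES (array(1, 1), map('1', 1), struct(1, 'a')) t(a, b, c)
  """.stripMargin).show()
// +------+
// |     a|
// +------+
// |[1, 1]|
// +------+
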