[SPARK-44409][SQL] Handle char/varchar in Dataset.to to keep consistent with others
### What changes were proposed in this pull request?

This PR replaces user-specified char/varchar types in the `Dataset.to` API, making it consistent with the other Dataset/DataFrame APIs.

### Why are the changes needed?

Currently, `INVALID_COLUMN_OR_FIELD_DATA_TYPE` is thrown even when converting char to char or varchar to varchar.

### Does this PR introduce _any_ user-facing change?

Yes. With `LEGACY_CHAR_VARCHAR_AS_STRING` off, using char/varchar in `Dataset.to` now throws a different error class; with `LEGACY_CHAR_VARCHAR_AS_STRING` on, `to` is able to handle char/varchar.
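As an illustrative sketch of the behavior described above (this assumes a running `SparkSession` named `spark` and a pre-existing table `t` with char/varchar columns; the table and column names are hypothetical):

```scala
import org.apache.spark.sql.types.StructType

// Target schema containing char/varchar types, parsed from DDL.
val newSchema = StructType.fromDDL("v varchar(255), c char(10)")

val df = spark.sql("SELECT * FROM t")

// With spark.sql.legacy.charVarcharAsString=false (the default), the call
// now fails with UNSUPPORTED_CHAR_OR_VARCHAR_AS_STRING, matching the
// behavior of other Dataset/DataFrame APIs:
df.to(newSchema) // throws AnalysisException

// With the legacy flag on, char/varchar are treated as string and the
// conversion succeeds; the resulting schema uses StringType.
spark.conf.set("spark.sql.legacy.charVarcharAsString", "true")
val df1 = df.to(newSchema)
```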

### How was this patch tested?

New tests.

Closes apache#41992 from yaooqinn/SPARK-44409.

Authored-by: Kent Yao <yao@apache.org>
Signed-off-by: Kent Yao <yao@apache.org>
yaooqinn committed Jul 17, 2023
1 parent 9a18c06 commit 85d8d62
Showing 2 changed files with 20 additions and 2 deletions.
5 changes: 3 additions & 2 deletions sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala
@@ -48,7 +48,7 @@
 import org.apache.spark.sql.catalyst.plans._
 import org.apache.spark.sql.catalyst.plans.logical._
 import org.apache.spark.sql.catalyst.trees.TreeNodeTag
 import org.apache.spark.sql.catalyst.types.DataTypeUtils.toAttributes
-import org.apache.spark.sql.catalyst.util.IntervalUtils
+import org.apache.spark.sql.catalyst.util.{CharVarcharUtils, IntervalUtils}
 import org.apache.spark.sql.catalyst.util.TypeUtils.toSQLId
 import org.apache.spark.sql.errors.{QueryCompilationErrors, QueryExecutionErrors}
 import org.apache.spark.sql.execution._
@@ -510,7 +510,8 @@ class Dataset[T] private[sql](
    * @since 3.4.0
    */
   def to(schema: StructType): DataFrame = withPlan {
-    Project.matchSchema(logicalPlan, schema, sparkSession.sessionState.conf)
+    val replaced = CharVarcharUtils.failIfHasCharVarchar(schema).asInstanceOf[StructType]
+    Project.matchSchema(logicalPlan, replaced, sparkSession.sessionState.conf)
   }

   /**
@@ -931,6 +931,23 @@ class BasicCharVarcharTestSuite extends QueryTest with SharedSparkSession {
       }
     }
   }

+  test("SPARK-44409: Handle char/varchar in Dataset.to to keep consistent with others") {
+    val newSchema = StructType.fromDDL("v varchar(255), c char(10)")
+    withTable("t") {
+      sql("CREATE TABLE t(c char(10), v varchar(255)) USING parquet")
+      sql("INSERT INTO t VALUES('spark', 'awesome')")
+      val df = sql("SELECT * FROM t")
+      checkError(exception = intercept[AnalysisException] {
+        df.to(newSchema)
+      }, errorClass = "UNSUPPORTED_CHAR_OR_VARCHAR_AS_STRING", parameters = Map.empty)
+      withSQLConf((SQLConf.LEGACY_CHAR_VARCHAR_AS_STRING.key, "true")) {
+        val df1 = df.to(newSchema)
+        checkAnswer(df1, df.select("v", "c"))
+        assert(df1.schema.last.dataType === StringType)
+      }
+    }
+  }
 }

 class FileSourceCharVarcharTestSuite extends CharVarcharTestSuite with SharedSparkSession {
