From 85d8d62216d3b830cc5af3dec05422a9cda4cea0 Mon Sep 17 00:00:00 2001
From: Kent Yao
Date: Mon, 17 Jul 2023 13:35:20 +0800
Subject: [PATCH] [SPARK-44409][SQL] Handle char/varchar in Dataset.to to keep consistent with others

### What changes were proposed in this pull request?

This PR replaces user-specified char/varchar types in the `Dataset.to` API to make it consistent with the other Dataset/DataFrame APIs.

### Why are the changes needed?

Currently, `INVALID_COLUMN_OR_FIELD_DATA_TYPE` is thrown even when converting char to char or varchar to varchar.

### Does this PR introduce _any_ user-facing change?

Yes. With `LEGACY_CHAR_VARCHAR_AS_STRING` off, using char/varchar in `Dataset.to` now raises `UNSUPPORTED_CHAR_OR_VARCHAR_AS_STRING` instead of `INVALID_COLUMN_OR_FIELD_DATA_TYPE`. With `LEGACY_CHAR_VARCHAR_AS_STRING` on, `to` is able to handle char/varchar.

### How was this patch tested?

New tests.

Closes #41992 from yaooqinn/SPARK-44409.

Authored-by: Kent Yao
Signed-off-by: Kent Yao
---
 .../scala/org/apache/spark/sql/Dataset.scala      |  5 +++--
 .../apache/spark/sql/CharVarcharTestSuite.scala   | 17 +++++++++++++++++
 2 files changed, 20 insertions(+), 2 deletions(-)

diff --git a/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala b/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala
index 76caaabd94278..7eef2e9bbac8f 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala
@@ -48,7 +48,7 @@ import org.apache.spark.sql.catalyst.plans._
 import org.apache.spark.sql.catalyst.plans.logical._
 import org.apache.spark.sql.catalyst.trees.TreeNodeTag
 import org.apache.spark.sql.catalyst.types.DataTypeUtils.toAttributes
-import org.apache.spark.sql.catalyst.util.IntervalUtils
+import org.apache.spark.sql.catalyst.util.{CharVarcharUtils, IntervalUtils}
 import org.apache.spark.sql.catalyst.util.TypeUtils.toSQLId
 import org.apache.spark.sql.errors.{QueryCompilationErrors, QueryExecutionErrors}
 import org.apache.spark.sql.execution._
@@ -510,7 +510,8 @@ class Dataset[T] private[sql](
    * @since 3.4.0
    */
   def to(schema: StructType): DataFrame = withPlan {
-    Project.matchSchema(logicalPlan, schema, sparkSession.sessionState.conf)
+    val replaced = CharVarcharUtils.failIfHasCharVarchar(schema).asInstanceOf[StructType]
+    Project.matchSchema(logicalPlan, replaced, sparkSession.sessionState.conf)
   }
 
   /**
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/CharVarcharTestSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/CharVarcharTestSuite.scala
index 40b31e5850f16..4a7632486c046 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/CharVarcharTestSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/CharVarcharTestSuite.scala
@@ -931,6 +931,23 @@ class BasicCharVarcharTestSuite extends QueryTest with SharedSparkSession {
       }
     }
   }
+
+  test("SPARK-44409: Handle char/varchar in Dataset.to to keep consistent with others") {
+    val newSchema = StructType.fromDDL("v varchar(255), c char(10)")
+    withTable("t") {
+      sql("CREATE TABLE t(c char(10), v varchar(255)) USING parquet")
+      sql("INSERT INTO t VALUES('spark', 'awesome')")
+      val df = sql("SELECT * FROM t")
+      checkError(exception = intercept[AnalysisException] {
+        df.to(newSchema)
+      }, errorClass = "UNSUPPORTED_CHAR_OR_VARCHAR_AS_STRING", parameters = Map.empty)
+      withSQLConf((SQLConf.LEGACY_CHAR_VARCHAR_AS_STRING.key, "true")) {
+        val df1 = df.to(newSchema)
+        checkAnswer(df1, df.select("v", "c"))
+        assert(df1.schema.last.dataType === StringType)
+      }
+    }
+  }
 }
 
 class FileSourceCharVarcharTestSuite extends CharVarcharTestSuite with SharedSparkSession {
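
---

For context, a minimal usage sketch of the behavior this patch establishes, not part of the patch itself. It mirrors the new test; the table name `t`, the local SparkSession setup, and the `printSchema` call are illustrative assumptions:

```scala
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.types.StructType

val spark = SparkSession.builder().master("local[*]").getOrCreate()

// Same shape as the table in the new test (illustrative).
spark.sql("CREATE TABLE t(c char(10), v varchar(255)) USING parquet")
spark.sql("INSERT INTO t VALUES('spark', 'awesome')")

val newSchema = StructType.fromDDL("v varchar(255), c char(10)")
val df = spark.sql("SELECT * FROM t")

// Default (spark.sql.legacy.charVarcharAsString=false): char/varchar in a
// user-specified schema are rejected, consistent with other DataFrame APIs.
// Throws AnalysisException with error class
// UNSUPPORTED_CHAR_OR_VARCHAR_AS_STRING (previously
// INVALID_COLUMN_OR_FIELD_DATA_TYPE).
// df.to(newSchema)

// Legacy mode: char/varchar are treated as string and the conversion works,
// reordering the columns to match the target schema.
spark.conf.set("spark.sql.legacy.charVarcharAsString", "true")
val df1 = df.to(newSchema)
df1.printSchema()  // v: string, c: string
```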