From 85d8d62216d3b830cc5af3dec05422a9cda4cea0 Mon Sep 17 00:00:00 2001
From: Kent Yao
Date: Mon, 17 Jul 2023 13:35:20 +0800
Subject: [PATCH] [SPARK-44409][SQL] Handle char/varchar in Dataset.to to keep consistent with others

### What changes were proposed in this pull request?

This PR replaces user-specified char/varchar types in the `Dataset.to` API to make it consistent with the other Dataset/DataFrame APIs.

### Why are the changes needed?

Currently, `INVALID_COLUMN_OR_FIELD_DATA_TYPE` is thrown even when converting char to char or varchar to varchar.

### Does this PR introduce _any_ user-facing change?

Yes. With `LEGACY_CHAR_VARCHAR_AS_STRING` off, using char/varchar in `Dataset.to` now raises `UNSUPPORTED_CHAR_OR_VARCHAR_AS_STRING` instead of `INVALID_COLUMN_OR_FIELD_DATA_TYPE`. With `LEGACY_CHAR_VARCHAR_AS_STRING` on, `to` is able to handle char/varchar.

### How was this patch tested?

New tests.

Closes #41992 from yaooqinn/SPARK-44409.

Authored-by: Kent Yao
Signed-off-by: Kent Yao
---
 .../scala/org/apache/spark/sql/Dataset.scala      |  5 +++--
 .../apache/spark/sql/CharVarcharTestSuite.scala   | 17 +++++++++++++++++
 2 files changed, 20 insertions(+), 2 deletions(-)

diff --git a/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala b/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala
index 76caaabd94278..7eef2e9bbac8f 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala
@@ -48,7 +48,7 @@ import org.apache.spark.sql.catalyst.plans._
 import org.apache.spark.sql.catalyst.plans.logical._
 import org.apache.spark.sql.catalyst.trees.TreeNodeTag
 import org.apache.spark.sql.catalyst.types.DataTypeUtils.toAttributes
-import org.apache.spark.sql.catalyst.util.IntervalUtils
+import org.apache.spark.sql.catalyst.util.{CharVarcharUtils, IntervalUtils}
 import org.apache.spark.sql.catalyst.util.TypeUtils.toSQLId
 import org.apache.spark.sql.errors.{QueryCompilationErrors, QueryExecutionErrors}
 import org.apache.spark.sql.execution._
@@ -510,7 +510,8 @@ class Dataset[T] private[sql](
    * @since 3.4.0
    */
   def to(schema: StructType): DataFrame = withPlan {
-    Project.matchSchema(logicalPlan, schema, sparkSession.sessionState.conf)
+    val replaced = CharVarcharUtils.failIfHasCharVarchar(schema).asInstanceOf[StructType]
+    Project.matchSchema(logicalPlan, replaced, sparkSession.sessionState.conf)
   }
 
   /**
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/CharVarcharTestSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/CharVarcharTestSuite.scala
index 40b31e5850f16..4a7632486c046 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/CharVarcharTestSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/CharVarcharTestSuite.scala
@@ -931,6 +931,23 @@ class BasicCharVarcharTestSuite extends QueryTest with SharedSparkSession {
       }
     }
   }
+
+  test("SPARK-44409: Handle char/varchar in Dataset.to to keep consistent with others") {
+    val newSchema = StructType.fromDDL("v varchar(255), c char(10)")
+    withTable("t") {
+      sql("CREATE TABLE t(c char(10), v varchar(255)) USING parquet")
+      sql("INSERT INTO t VALUES('spark', 'awesome')")
+      val df = sql("SELECT * FROM t")
+      checkError(exception = intercept[AnalysisException] {
+        df.to(newSchema)
+      }, errorClass = "UNSUPPORTED_CHAR_OR_VARCHAR_AS_STRING", parameters = Map.empty)
+      withSQLConf((SQLConf.LEGACY_CHAR_VARCHAR_AS_STRING.key, "true")) {
+        val df1 = df.to(newSchema)
+        checkAnswer(df1, df.select("v", "c"))
+        assert(df1.schema.last.dataType === StringType)
+      }
+    }
+  }
 }
 
 class FileSourceCharVarcharTestSuite extends CharVarcharTestSuite with SharedSparkSession {
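
---

For context, a minimal usage sketch of the behavior this patch establishes, not part of the patch itself. It mirrors the new test; the table name `t`, the local SparkSession setup, and the `printSchema` call are illustrative assumptions:

```scala
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.types.StructType

val spark = SparkSession.builder().master("local[*]").getOrCreate()

// Same shape as the table in the new test (illustrative).
spark.sql("CREATE TABLE t(c char(10), v varchar(255)) USING parquet")
spark.sql("INSERT INTO t VALUES('spark', 'awesome')")

val newSchema = StructType.fromDDL("v varchar(255), c char(10)")
val df = spark.sql("SELECT * FROM t")

// Default (spark.sql.legacy.charVarcharAsString=false): char/varchar in a
// user-specified schema are rejected, consistent with other DataFrame APIs.
// Throws AnalysisException with error class
// UNSUPPORTED_CHAR_OR_VARCHAR_AS_STRING (previously
// INVALID_COLUMN_OR_FIELD_DATA_TYPE).
// df.to(newSchema)

// Legacy mode: char/varchar are treated as string and the conversion works,
// reordering the columns to match the target schema.
spark.conf.set("spark.sql.legacy.charVarcharAsString", "true")
val df1 = df.to(newSchema)
df1.printSchema()  // v: string, c: string
```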