[SPARK-47101][SQL] Allow comma to be used in top-level column names and remove the nested type definition check in `HiveExternalCatalog.verifyDataSchema`

### What changes were proposed in this pull request?

> In Hive 0.13 and later, column names can contain any [Unicode](http://en.wikipedia.org/wiki/List_of_Unicode_characters) character (see [HIVE-6013](https://issues.apache.org/jira/browse/HIVE-6013)), however, dot (.) and colon (:) yield errors on querying, so they are disallowed in Hive 1.2.0 (see [HIVE-10120](https://issues.apache.org/jira/browse/HIVE-10120)). Any column name that is specified within backticks (`) is treated literally. Within a backtick string, use double backticks (``) to represent a backtick character. Backtick quotation also enables the use of reserved keywords for table and column identifiers.

According to the Hive documentation, column names may contain any character from the Unicode set.
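For illustration, a minimal sketch of backtick quoting from the Spark side (the table name `demo` is hypothetical; the `a,b` column mirrors the new test added in this patch):

```scala
// A column name containing a special character such as a comma must be
// back-quoted when referenced; within a back-quoted identifier, a literal
// backtick is written as ``.
spark.sql("CREATE TABLE demo STORED AS parquet SELECT id AS `a,b` FROM range(1)")
spark.sql("SELECT `a,b` FROM demo").show()
```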

This PR changes `HiveExternalCatalog.verifyDataSchema` to:

- Allow comma to be used in top-level column names.
- Remove the check for invalid characters in nested type definitions. The hard-coded `",:;"` list turned out to be incomplete (for example, `^`, `%`, and others are not allowed either), so validation is now delegated to the Hive API calls instead; see the sketch below.
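A rough before/after sketch of the new behavior, assuming a live `spark` session (the table names `t1` and `t2` are illustrative; the error classes are taken from the tests added in this patch):

```scala
// 1. A comma in a top-level column name is now accepted:
spark.sql("CREATE TABLE t1 STORED AS parquet SELECT id AS `a,b` FROM range(1)")

// 2. Spark no longer pre-validates nested field names against the hard-coded
//    ",:;" list. Invalid characters now surface from the Hive client calls,
//    e.g. as CANNOT_RECOGNIZE_HIVE_TYPE (for ',' and ':') or
//    _LEGACY_ERROR_TEMP_3065 (for ';', '^', '%', ...), instead of the
//    removed INVALID_HIVE_COLUMN_NAME error class.
spark.sql("CREATE TABLE t2 (a array<struct<`abc^xyz`:int>>) USING hive") // fails in the Hive layer
```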

### Why are the changes needed?

Improvement.

### Does this PR introduce _any_ user-facing change?

Yes. Some special characters are now allowed in column names, and errors for some invalid characters are now raised as Spark errors instead of Hive metastore errors.
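For example, a nested field name containing `;` used to fail Spark's pre-check with `INVALID_HIVE_COLUMN_NAME`; after this patch the same statement fails inside the Hive client call (a sketch based on the new test; the exact message may vary by Hive version):

```scala
spark.sql("CREATE TABLE t (a array<struct<`abc;xyz`:int>>) USING hive")
// before: AnalysisException with error class INVALID_HIVE_COLUMN_NAME
// after:  AnalysisException with error class _LEGACY_ERROR_TEMP_3065,
//         wrapping org.apache.hadoop.hive.ql.metadata.HiveException
```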

### How was this patch tested?

New tests.

### Was this patch authored or co-authored using generative AI tooling?

No.

Closes apache#45180 from yaooqinn/SPARK-47101.

Authored-by: Kent Yao <yao@apache.org>
Signed-off-by: Dongjoon Hyun <dhyun@apple.com>
yaooqinn authored and ericm-db committed Mar 5, 2024
1 parent affa59b commit 7f82682
Showing 5 changed files with 33 additions and 125 deletions.
common/utils/src/main/resources/error/error-classes.json (6 changes: 0 additions & 6 deletions)

@@ -2008,12 +2008,6 @@
     },
     "sqlState" : "HY000"
   },
-  "INVALID_HIVE_COLUMN_NAME" : {
-    "message" : [
-      "Cannot create the table <tableName> having the column <columnName> whose name contains invalid characters <invalidChars> in Hive metastore."
-    ],
-    "sqlState" : "42K05"
-  },
   "INVALID_IDENTIFIER" : {
     "message" : [
       "The identifier <ident> is invalid. Please, consider quoting it with back-quotes as `<ident>`."
docs/sql-error-conditions.md (6 changes: 0 additions & 6 deletions)

@@ -1190,12 +1190,6 @@ The handle `<handle>` is invalid.
 
 For more details see [INVALID_HANDLE](sql-error-conditions-invalid-handle-error-class.html)
 
-### INVALID_HIVE_COLUMN_NAME
-
-[SQLSTATE: 42K05](sql-error-conditions-sqlstates.html#class-42-syntax-error-or-access-rule-violation)
-
-Cannot create the table `<tableName>` having the column `<columnName>` whose name contains invalid characters `<invalidChars>` in Hive metastore.
-
 ### INVALID_IDENTIFIER
 
 [SQLSTATE: 42602](sql-error-conditions-sqlstates.html#class-42-syntax-error-or-access-rule-violation)
sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveExternalCatalog.scala

@@ -42,13 +42,12 @@ import org.apache.spark.sql.catalyst.catalog.ExternalCatalogUtils._
 import org.apache.spark.sql.catalyst.expressions._
 import org.apache.spark.sql.catalyst.types.DataTypeUtils
 import org.apache.spark.sql.catalyst.util.{CaseInsensitiveMap, CharVarcharUtils}
-import org.apache.spark.sql.catalyst.util.TypeUtils.{toSQLId, toSQLValue}
 import org.apache.spark.sql.execution.command.DDLUtils
 import org.apache.spark.sql.execution.datasources.{PartitioningUtils, SourceOptions}
 import org.apache.spark.sql.hive.client.HiveClient
 import org.apache.spark.sql.internal.HiveSerDe
 import org.apache.spark.sql.internal.StaticSQLConf._
-import org.apache.spark.sql.types.{AnsiIntervalType, ArrayType, DataType, MapType, StructType, TimestampNTZType}
+import org.apache.spark.sql.types._
 
 /**
  * A persistent implementation of the system catalog using Hive.
@@ -150,51 +149,6 @@ private[spark] class HiveExternalCatalog(conf: SparkConf, hadoopConf: Configurat
     }
   }
 
-  /**
-   * Checks the validity of data column names. Hive metastore disallows the table to use some
-   * special characters (',', ':', and ';') in data column names, including nested column names.
-   * Partition columns do not have such a restriction. Views do not have such a restriction.
-   */
-  private def verifyDataSchema(
-      tableName: TableIdentifier, tableType: CatalogTableType, dataSchema: StructType): Unit = {
-    if (tableType != VIEW) {
-      val invalidChars = Seq(",", ":", ";")
-      def verifyNestedColumnNames(schema: StructType): Unit = schema.foreach { f =>
-        f.dataType match {
-          case st: StructType => verifyNestedColumnNames(st)
-          case _ if invalidChars.exists(f.name.contains) =>
-            val invalidCharsString = invalidChars.map(c => s"'$c'").mkString(", ")
-            throw new AnalysisException(
-              errorClass = "INVALID_HIVE_COLUMN_NAME",
-              messageParameters = Map(
-                "invalidChars" -> invalidCharsString,
-                "tableName" -> toSQLId(tableName.nameParts),
-                "columnName" -> toSQLId(f.name)
-              ))
-          case _ =>
-        }
-      }
-
-      dataSchema.foreach { f =>
-        f.dataType match {
-          // Checks top-level column names
-          case _ if f.name.contains(",") =>
-            throw new AnalysisException(
-              errorClass = "INVALID_HIVE_COLUMN_NAME",
-              messageParameters = Map(
-                "invalidChars" -> toSQLValue(","),
-                "tableName" -> toSQLId(tableName.nameParts),
-                "columnName" -> toSQLId(f.name)
-              ))
-          // Checks nested column names
-          case st: StructType =>
-            verifyNestedColumnNames(st)
-          case _ =>
-        }
-      }
-    }
-  }
-
   // --------------------------------------------------------------------------
   // Databases
   // --------------------------------------------------------------------------
@@ -260,8 +214,6 @@ private[spark] class HiveExternalCatalog(conf: SparkConf, hadoopConf: Configurat
     val table = tableDefinition.identifier.table
     requireDbExists(db)
     verifyTableProperties(tableDefinition)
-    verifyDataSchema(
-      tableDefinition.identifier, tableDefinition.tableType, tableDefinition.dataSchema)
 
     if (tableExists(db, table) && !ignoreIfExists) {
       throw new TableAlreadyExistsException(db = db, table = table)
@@ -711,7 +663,6 @@ private[spark] class HiveExternalCatalog(conf: SparkConf, hadoopConf: Configurat
       newDataSchema: StructType): Unit = withClient {
     requireTableExists(db, table)
     val oldTable = getTable(db, table)
-    verifyDataSchema(oldTable.identifier, oldTable.tableType, newDataSchema)
     val schemaProps =
       tableMetaToTableProps(oldTable, StructType(newDataSchema ++ oldTable.partitionSchema)).toMap
 
sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveDDLSuite.scala

@@ -34,10 +34,10 @@ import org.apache.spark.sql.catalyst.parser.{CatalystSqlParser, ParseException}
 import org.apache.spark.sql.connector.catalog.CatalogManager
 import org.apache.spark.sql.connector.catalog.CatalogManager.SESSION_CATALOG_NAME
 import org.apache.spark.sql.connector.catalog.SupportsNamespaces.PROP_OWNER
+import org.apache.spark.sql.errors.DataTypeErrors.toSQLType
 import org.apache.spark.sql.execution.command.{DDLSuite, DDLUtils}
 import org.apache.spark.sql.execution.datasources.orc.OrcCompressionCodec
 import org.apache.spark.sql.execution.datasources.parquet.{ParquetCompressionCodec, ParquetFooterReader}
-import org.apache.spark.sql.functions._
 import org.apache.spark.sql.hive.{HiveExternalCatalog, HiveUtils}
 import org.apache.spark.sql.hive.HiveUtils.{CONVERT_METASTORE_ORC, CONVERT_METASTORE_PARQUET}
 import org.apache.spark.sql.hive.orc.OrcFileOperator
@@ -2876,22 +2876,39 @@ class HiveDDLSuite
     }
   }
 
-  test("SPARK-24681 checks if nested column names do not include ',', ':', and ';'") {
-    Seq("nested,column", "nested:column", "nested;column").foreach { nestedColumnName =>
+  test("SPARK-47101 checks if nested column names do not include invalid characters") {
+    // delimiter characters
+    Seq(",", ":").foreach { c =>
+      val typ = s"array<struct<`abc${c}xyz`:int>>"
+      // The regex is from HiveClientImpl.getSparkSQLDataType, please keep them in sync.
+      val replaced = typ.replaceAll("`", "").replaceAll("(?<=struct<|,)([^,<:]+)(?=:)", "`$1`")
+      withTable("t") {
+        checkError(
+          exception = intercept[SparkException] {
+            sql(s"CREATE TABLE t (a $typ) USING hive")
+          },
+          errorClass = "CANNOT_RECOGNIZE_HIVE_TYPE",
+          parameters = Map(
+            "fieldType" -> toSQLType(replaced),
+            "fieldName" -> "`a`")
+        )
+      }
+    }
+    // other special characters
+    Seq(";", "^", "\\", "/", "%").foreach { c =>
+      val typ = s"array<struct<`abc${c}xyz`:int>>"
+      val replaced = typ.replaceAll("`", "")
+      val msg = s"java.lang.IllegalArgumentException: Error: : expected at the position " +
+        s"16 of '$replaced' but '$c' is found."
       withTable("t") {
         checkError(
           exception = intercept[AnalysisException] {
-            spark.range(1)
-              .select(struct(lit(0).as(nestedColumnName)).as("toplevel"))
-              .write
-              .format("hive")
-              .saveAsTable("t")
+            sql(s"CREATE TABLE t (a $typ) USING hive")
           },
-          errorClass = "INVALID_HIVE_COLUMN_NAME",
+          errorClass = "_LEGACY_ERROR_TEMP_3065",
           parameters = Map(
-            "invalidChars" -> "',', ':', ';'",
-            "tableName" -> "`spark_catalog`.`default`.`t`",
-            "columnName" -> s"`$nestedColumnName`")
+            "clazz" -> "org.apache.hadoop.hive.ql.metadata.HiveException",
+            "msg" -> msg)
         )
       }
     }
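For reference, a standalone sketch of what the back-quoting regex in the test above does (the input string is illustrative):

```scala
// HiveClientImpl.getSparkSQLDataType wraps each struct field name (the token
// between "struct<" or "," and the following ":") in backticks. Once the
// user's own backticks are stripped, a comma inside the field name splits it,
// so only the fragment in front of the type separator gets quoted and Hive
// can no longer parse the type string.
val typ = "array<struct<abc,xyz:int>>" // the user wrote `abc,xyz`
val quoted = typ.replaceAll("(?<=struct<|,)([^,<:]+)(?=:)", "`$1`")
// quoted == "array<struct<abc,`xyz`:int>>"
```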
@@ -3385,24 +3402,11 @@
     }
   }
 
-  test("SPARK-44911: Create the table with invalid column") {
+  test("SPARK-47101: comma is allowed in column name") {
     val tbl = "t1"
     withTable(tbl) {
-      val e = intercept[AnalysisException] {
-        sql(
-          s"""
-             |CREATE TABLE t1
-             |STORED AS parquet
-             |SELECT id, DATE'2018-01-01' + MAKE_DT_INTERVAL(0, id) FROM RANGE(0, 10)
-           """.stripMargin)
-      }
-      checkError(e,
-        errorClass = "INVALID_HIVE_COLUMN_NAME",
-        parameters = Map(
-          "invalidChars" -> "','",
-          "tableName" -> "`spark_catalog`.`default`.`t1`",
-          "columnName" -> "`DATE '2018-01-01' + make_dt_interval(0, id, 0, 0`.`000000)`")
-      )
+      sql("CREATE TABLE t1 STORED AS parquet SELECT id as `a,b` FROM range(1)")
+      checkAnswer(sql("SELECT * FROM t1"), Row(0))
     }
   }
 }
sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala

@@ -2141,41 +2141,6 @@ abstract class SQLQuerySuiteBase extends QueryTest with SQLTestUtils with TestHi
     }
   }
 
-  test("Auto alias construction of get_json_object") {
-    val df = Seq(("1", """{"f1": "value1", "f5": 5.23}""")).toDF("key", "jstring")
-
-    withTable("t") {
-      val e = intercept[AnalysisException] {
-        df.select($"key", functions.get_json_object($"jstring", "$.f1"))
-          .write.format("hive").saveAsTable("t")
-      }
-      checkError(e,
-        errorClass = "INVALID_HIVE_COLUMN_NAME",
-        parameters = Map(
-          "invalidChars" -> "','",
-          "tableName" -> "`spark_catalog`.`default`.`t`",
-          "columnName" -> "`get_json_object(jstring, $`.`f1)`")
-      )
-    }
-
-    withTempView("tempView") {
-      withTable("t") {
-        df.createTempView("tempView")
-        val e = intercept[AnalysisException] {
-          sql("CREATE TABLE t USING hive AS " +
-            "SELECT key, get_json_object(jstring, '$.f1') FROM tempView")
-        }
-        checkError(e,
-          errorClass = "INVALID_HIVE_COLUMN_NAME",
-          parameters = Map(
-            "invalidChars" -> "','",
-            "tableName" -> "`spark_catalog`.`default`.`t`",
-            "columnName" -> "`get_json_object(jstring, $`.`f1)`")
-        )
-      }
-    }
-  }
-
   test("SPARK-19912 String literals should be escaped for Hive metastore partition pruning") {
     withTable("spark_19912") {
       Seq(
