[SPARK-47101][SQL] Allow comma to be used in top-level column names and remove the nested type definition check in `HiveExternalCatalog.verifyDataSchema`

### What changes were proposed in this pull request?

> In Hive 0.13 and later, column names can contain any [Unicode](http://en.wikipedia.org/wiki/List_of_Unicode_characters) character (see [HIVE-6013](https://issues.apache.org/jira/browse/HIVE-6013)), however, dot (.) and colon (:) yield errors on querying, so they are disallowed in Hive 1.2.0 (see [HIVE-10120](https://issues.apache.org/jira/browse/HIVE-10120)). Any column name that is specified within backticks (`) is treated literally. Within a backtick string, use double backticks (``) to represent a backtick character. Backtick quotation also enables the use of reserved keywords for table and column identifiers.

According to the Hive documentation, column names may contain any character from the Unicode set.
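For illustration, a minimal sketch of backtick quoting from the Spark side (the table name `demo` is hypothetical; the `a,b` column mirrors the new test added in this patch):

```scala
// A column name containing a special character such as a comma must be
// back-quoted when referenced; within a back-quoted identifier, a literal
// backtick is written as ``.
spark.sql("CREATE TABLE demo STORED AS parquet SELECT id AS `a,b` FROM range(1)")
spark.sql("SELECT `a,b` FROM demo").show()
```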

This PR changes `HiveExternalCatalog.verifyDataSchema` to:

- Allow comma to be used in top-level column names.
- Remove the check for invalid characters in nested type definitions. The hard-coded `",:;"` list turned out to be incomplete (for example, `^`, `%`, and others are not allowed either), so validation is now delegated to the Hive API calls instead; see the sketch below.
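A rough before/after sketch of the new behavior, assuming a live `spark` session (the table names `t1` and `t2` are illustrative; the error classes are taken from the tests added in this patch):

```scala
// 1. A comma in a top-level column name is now accepted:
spark.sql("CREATE TABLE t1 STORED AS parquet SELECT id AS `a,b` FROM range(1)")

// 2. Spark no longer pre-validates nested field names against the hard-coded
//    ",:;" list. Invalid characters now surface from the Hive client calls,
//    e.g. as CANNOT_RECOGNIZE_HIVE_TYPE (for ',' and ':') or
//    _LEGACY_ERROR_TEMP_3065 (for ';', '^', '%', ...), instead of the
//    removed INVALID_HIVE_COLUMN_NAME error class.
spark.sql("CREATE TABLE t2 (a array<struct<`abc^xyz`:int>>) USING hive") // fails in the Hive layer
```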

### Why are the changes needed?

Improvement.

### Does this PR introduce _any_ user-facing change?

Yes. Some special characters are now allowed in column names, and errors for some invalid characters are now raised as Spark errors instead of Hive metastore errors.
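For example, a nested field name containing `;` used to fail Spark's pre-check with `INVALID_HIVE_COLUMN_NAME`; after this patch the same statement fails inside the Hive client call (a sketch based on the new test; the exact message may vary by Hive version):

```scala
spark.sql("CREATE TABLE t (a array<struct<`abc;xyz`:int>>) USING hive")
// before: AnalysisException with error class INVALID_HIVE_COLUMN_NAME
// after:  AnalysisException with error class _LEGACY_ERROR_TEMP_3065,
//         wrapping org.apache.hadoop.hive.ql.metadata.HiveException
```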

### How was this patch tested?

New tests.

### Was this patch authored or co-authored using generative AI tooling?

No.

Closes apache#45180 from yaooqinn/SPARK-47101.

Authored-by: Kent Yao <yao@apache.org>
Signed-off-by: Dongjoon Hyun <dhyun@apple.com>
yaooqinn authored and ericm-db committed Mar 5, 2024
1 parent affa59b commit 7f82682
Showing 5 changed files with 33 additions and 125 deletions.
common/utils/src/main/resources/error/error-classes.json (6 changes: 0 additions & 6 deletions)

@@ -2008,12 +2008,6 @@
     },
     "sqlState" : "HY000"
   },
-  "INVALID_HIVE_COLUMN_NAME" : {
-    "message" : [
-      "Cannot create the table <tableName> having the column <columnName> whose name contains invalid characters <invalidChars> in Hive metastore."
-    ],
-    "sqlState" : "42K05"
-  },
   "INVALID_IDENTIFIER" : {
     "message" : [
       "The identifier <ident> is invalid. Please, consider quoting it with back-quotes as `<ident>`."
docs/sql-error-conditions.md (6 changes: 0 additions & 6 deletions)

@@ -1190,12 +1190,6 @@ The handle `<handle>` is invalid.
 
 For more details see [INVALID_HANDLE](sql-error-conditions-invalid-handle-error-class.html)
 
-### INVALID_HIVE_COLUMN_NAME
-
-[SQLSTATE: 42K05](sql-error-conditions-sqlstates.html#class-42-syntax-error-or-access-rule-violation)
-
-Cannot create the table `<tableName>` having the column `<columnName>` whose name contains invalid characters `<invalidChars>` in Hive metastore.
-
 ### INVALID_IDENTIFIER
 
 [SQLSTATE: 42602](sql-error-conditions-sqlstates.html#class-42-syntax-error-or-access-rule-violation)
sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveExternalCatalog.scala

@@ -42,13 +42,12 @@ import org.apache.spark.sql.catalyst.catalog.ExternalCatalogUtils._
 import org.apache.spark.sql.catalyst.expressions._
 import org.apache.spark.sql.catalyst.types.DataTypeUtils
 import org.apache.spark.sql.catalyst.util.{CaseInsensitiveMap, CharVarcharUtils}
-import org.apache.spark.sql.catalyst.util.TypeUtils.{toSQLId, toSQLValue}
 import org.apache.spark.sql.execution.command.DDLUtils
 import org.apache.spark.sql.execution.datasources.{PartitioningUtils, SourceOptions}
 import org.apache.spark.sql.hive.client.HiveClient
 import org.apache.spark.sql.internal.HiveSerDe
 import org.apache.spark.sql.internal.StaticSQLConf._
-import org.apache.spark.sql.types.{AnsiIntervalType, ArrayType, DataType, MapType, StructType, TimestampNTZType}
+import org.apache.spark.sql.types._
 
 /**
  * A persistent implementation of the system catalog using Hive.
@@ -150,51 +149,6 @@ private[spark] class HiveExternalCatalog(conf: SparkConf, hadoopConf: Configurat
     }
   }
 
-  /**
-   * Checks the validity of data column names. Hive metastore disallows the table to use some
-   * special characters (',', ':', and ';') in data column names, including nested column names.
-   * Partition columns do not have such a restriction. Views do not have such a restriction.
-   */
-  private def verifyDataSchema(
-      tableName: TableIdentifier, tableType: CatalogTableType, dataSchema: StructType): Unit = {
-    if (tableType != VIEW) {
-      val invalidChars = Seq(",", ":", ";")
-      def verifyNestedColumnNames(schema: StructType): Unit = schema.foreach { f =>
-        f.dataType match {
-          case st: StructType => verifyNestedColumnNames(st)
-          case _ if invalidChars.exists(f.name.contains) =>
-            val invalidCharsString = invalidChars.map(c => s"'$c'").mkString(", ")
-            throw new AnalysisException(
-              errorClass = "INVALID_HIVE_COLUMN_NAME",
-              messageParameters = Map(
-                "invalidChars" -> invalidCharsString,
-                "tableName" -> toSQLId(tableName.nameParts),
-                "columnName" -> toSQLId(f.name)
-              ))
-          case _ =>
-        }
-      }
-
-      dataSchema.foreach { f =>
-        f.dataType match {
-          // Checks top-level column names
-          case _ if f.name.contains(",") =>
-            throw new AnalysisException(
-              errorClass = "INVALID_HIVE_COLUMN_NAME",
-              messageParameters = Map(
-                "invalidChars" -> toSQLValue(","),
-                "tableName" -> toSQLId(tableName.nameParts),
-                "columnName" -> toSQLId(f.name)
-              ))
-          // Checks nested column names
-          case st: StructType =>
-            verifyNestedColumnNames(st)
-          case _ =>
-        }
-      }
-    }
-  }
-
   // --------------------------------------------------------------------------
   // Databases
   // --------------------------------------------------------------------------
@@ -260,8 +214,6 @@ private[spark] class HiveExternalCatalog(conf: SparkConf, hadoopConf: Configurat
     val table = tableDefinition.identifier.table
     requireDbExists(db)
     verifyTableProperties(tableDefinition)
-    verifyDataSchema(
-      tableDefinition.identifier, tableDefinition.tableType, tableDefinition.dataSchema)
 
     if (tableExists(db, table) && !ignoreIfExists) {
       throw new TableAlreadyExistsException(db = db, table = table)
@@ -711,7 +663,6 @@ private[spark] class HiveExternalCatalog(conf: SparkConf, hadoopConf: Configurat
       newDataSchema: StructType): Unit = withClient {
     requireTableExists(db, table)
     val oldTable = getTable(db, table)
-    verifyDataSchema(oldTable.identifier, oldTable.tableType, newDataSchema)
     val schemaProps =
       tableMetaToTableProps(oldTable, StructType(newDataSchema ++ oldTable.partitionSchema)).toMap
 
sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveDDLSuite.scala

@@ -34,10 +34,10 @@ import org.apache.spark.sql.catalyst.parser.{CatalystSqlParser, ParseException}
 import org.apache.spark.sql.connector.catalog.CatalogManager
 import org.apache.spark.sql.connector.catalog.CatalogManager.SESSION_CATALOG_NAME
 import org.apache.spark.sql.connector.catalog.SupportsNamespaces.PROP_OWNER
+import org.apache.spark.sql.errors.DataTypeErrors.toSQLType
 import org.apache.spark.sql.execution.command.{DDLSuite, DDLUtils}
 import org.apache.spark.sql.execution.datasources.orc.OrcCompressionCodec
 import org.apache.spark.sql.execution.datasources.parquet.{ParquetCompressionCodec, ParquetFooterReader}
-import org.apache.spark.sql.functions._
 import org.apache.spark.sql.hive.{HiveExternalCatalog, HiveUtils}
 import org.apache.spark.sql.hive.HiveUtils.{CONVERT_METASTORE_ORC, CONVERT_METASTORE_PARQUET}
 import org.apache.spark.sql.hive.orc.OrcFileOperator
@@ -2876,22 +2876,39 @@ class HiveDDLSuite
     }
   }
 
-  test("SPARK-24681 checks if nested column names do not include ',', ':', and ';'") {
-    Seq("nested,column", "nested:column", "nested;column").foreach { nestedColumnName =>
+  test("SPARK-47101 checks if nested column names do not include invalid characters") {
+    // delimiter characters
+    Seq(",", ":").foreach { c =>
+      val typ = s"array<struct<`abc${c}xyz`:int>>"
+      // The regex is from HiveClientImpl.getSparkSQLDataType, please keep them in sync.
+      val replaced = typ.replaceAll("`", "").replaceAll("(?<=struct<|,)([^,<:]+)(?=:)", "`$1`")
+      withTable("t") {
+        checkError(
+          exception = intercept[SparkException] {
+            sql(s"CREATE TABLE t (a $typ) USING hive")
+          },
+          errorClass = "CANNOT_RECOGNIZE_HIVE_TYPE",
+          parameters = Map(
+            "fieldType" -> toSQLType(replaced),
+            "fieldName" -> "`a`")
+        )
+      }
+    }
+    // other special characters
+    Seq(";", "^", "\\", "/", "%").foreach { c =>
+      val typ = s"array<struct<`abc${c}xyz`:int>>"
+      val replaced = typ.replaceAll("`", "")
+      val msg = s"java.lang.IllegalArgumentException: Error: : expected at the position " +
+        s"16 of '$replaced' but '$c' is found."
       withTable("t") {
         checkError(
           exception = intercept[AnalysisException] {
-            spark.range(1)
-              .select(struct(lit(0).as(nestedColumnName)).as("toplevel"))
-              .write
-              .format("hive")
-              .saveAsTable("t")
+            sql(s"CREATE TABLE t (a $typ) USING hive")
           },
-          errorClass = "INVALID_HIVE_COLUMN_NAME",
+          errorClass = "_LEGACY_ERROR_TEMP_3065",
           parameters = Map(
-            "invalidChars" -> "',', ':', ';'",
-            "tableName" -> "`spark_catalog`.`default`.`t`",
-            "columnName" -> s"`$nestedColumnName`")
+            "clazz" -> "org.apache.hadoop.hive.ql.metadata.HiveException",
+            "msg" -> msg)
         )
       }
     }
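For reference, a standalone sketch of what the back-quoting regex in the test above does (the input string is illustrative):

```scala
// HiveClientImpl.getSparkSQLDataType wraps each struct field name (the token
// between "struct<" or "," and the following ":") in backticks. Once the
// user's own backticks are stripped, a comma inside the field name splits it,
// so only the fragment in front of the type separator gets quoted and Hive
// can no longer parse the type string.
val typ = "array<struct<abc,xyz:int>>" // the user wrote `abc,xyz`
val quoted = typ.replaceAll("(?<=struct<|,)([^,<:]+)(?=:)", "`$1`")
// quoted == "array<struct<abc,`xyz`:int>>"
```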
@@ -3385,24 +3402,11 @@
     }
   }
 
-  test("SPARK-44911: Create the table with invalid column") {
+  test("SPARK-47101: comma is allowed in column name") {
     val tbl = "t1"
     withTable(tbl) {
-      val e = intercept[AnalysisException] {
-        sql(
-          s"""
-             |CREATE TABLE t1
-             |STORED AS parquet
-             |SELECT id, DATE'2018-01-01' + MAKE_DT_INTERVAL(0, id) FROM RANGE(0, 10)
-           """.stripMargin)
-      }
-      checkError(e,
-        errorClass = "INVALID_HIVE_COLUMN_NAME",
-        parameters = Map(
-          "invalidChars" -> "','",
-          "tableName" -> "`spark_catalog`.`default`.`t1`",
-          "columnName" -> "`DATE '2018-01-01' + make_dt_interval(0, id, 0, 0`.`000000)`")
-      )
+      sql("CREATE TABLE t1 STORED AS parquet SELECT id as `a,b` FROM range(1)")
+      checkAnswer(sql("SELECT * FROM t1"), Row(0))
     }
   }
 }
sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala

@@ -2141,41 +2141,6 @@ abstract class SQLQuerySuiteBase extends QueryTest with SQLTestUtils with TestHi
     }
   }
 
-  test("Auto alias construction of get_json_object") {
-    val df = Seq(("1", """{"f1": "value1", "f5": 5.23}""")).toDF("key", "jstring")
-
-    withTable("t") {
-      val e = intercept[AnalysisException] {
-        df.select($"key", functions.get_json_object($"jstring", "$.f1"))
-          .write.format("hive").saveAsTable("t")
-      }
-      checkError(e,
-        errorClass = "INVALID_HIVE_COLUMN_NAME",
-        parameters = Map(
-          "invalidChars" -> "','",
-          "tableName" -> "`spark_catalog`.`default`.`t`",
-          "columnName" -> "`get_json_object(jstring, $`.`f1)`")
-      )
-    }
-
-    withTempView("tempView") {
-      withTable("t") {
-        df.createTempView("tempView")
-        val e = intercept[AnalysisException] {
-          sql("CREATE TABLE t USING hive AS " +
-            "SELECT key, get_json_object(jstring, '$.f1') FROM tempView")
-        }
-        checkError(e,
-          errorClass = "INVALID_HIVE_COLUMN_NAME",
-          parameters = Map(
-            "invalidChars" -> "','",
-            "tableName" -> "`spark_catalog`.`default`.`t`",
-            "columnName" -> "`get_json_object(jstring, $`.`f1)`")
-        )
-      }
-    }
-  }
-
   test("SPARK-19912 String literals should be escaped for Hive metastore partition pruning") {
     withTable("spark_19912") {
       Seq(
