-
Notifications
You must be signed in to change notification settings - Fork 28.3k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[SPARK-47210][SQL] Addition of implicit casting without indeterminate support #45383
Changes from 74 commits
b34544a
fdbfa44
ce9b027
e537190
b5a79c1
8321d0c
d178233
a4b9be7
a6e7662
e1d7ad5
b3b1356
7773d13
198a728
255b1ab
9ce417f
50f3aa2
ca0c84d
880a1b1
56d6c7c
94e5259
ccb52ba
9b1387b
c9974e1
fca9a65
660d664
c8edd93
a91490b
49a8d61
4c4cd84
66122a6
50f46e4
cc86a87
c01e80c
cc797a2
5d001ee
c68fc7d
1c926ab
dec39bf
e808446
ca1a23a
116931c
af487a2
4ba7055
e490e42
00e88e7
85b4d16
30f7225
e89a354
788dc06
75c0140
2918413
f6ed55a
98960c0
de623c8
a92b4e1
f7f3011
f67808e
815ce42
7fca38a
b19b0eb
a111f03
55bdd9b
a7228be
27a72c6
18ada04
38670af
01d891e
c5daf86
9ac5678
0f1757d
506c8c0
f743cf8
4f8fe1d
52bf4dc
7cbeafe
3e92e92
b23e106
880ebed
f96ecd9
e1e0cf4
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,41 @@ | ||
--- | ||
layout: global | ||
title: COLLATION_MISMATCH error class | ||
displayTitle: COLLATION_MISMATCH error class | ||
license: | | ||
Licensed to the Apache Software Foundation (ASF) under one or more | ||
contributor license agreements. See the NOTICE file distributed with | ||
this work for additional information regarding copyright ownership. | ||
The ASF licenses this file to You under the Apache License, Version 2.0 | ||
(the "License"); you may not use this file except in compliance with | ||
the License. You may obtain a copy of the License at | ||
|
||
http://www.apache.org/licenses/LICENSE-2.0 | ||
|
||
Unless required by applicable law or agreed to in writing, software | ||
distributed under the License is distributed on an "AS IS" BASIS, | ||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
See the License for the specific language governing permissions and | ||
limitations under the License. | ||
--- | ||
|
||
<!-- | ||
DO NOT EDIT THIS FILE. | ||
It was generated automatically by `org.apache.spark.SparkThrowableSuite`. | ||
--> | ||
|
||
[SQLSTATE: 42P21](sql-error-conditions-sqlstates.html#class-42-syntax-error-or-access-rule-violation) | ||
|
||
Could not determine which collation to use for string functions and operators. | ||
|
||
This error class has the following derived error classes: | ||
|
||
## EXPLICIT | ||
|
||
Error occurred due to the mismatch between explicit collations: `<explicitTypes>`. Decide on a single explicit collation and remove others. | ||
|
||
## IMPLICIT | ||
|
||
Error occurred due to the mismatch between multiple implicit non-default collations. Use COLLATE function to set the collation explicitly. | ||
|
||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,135 @@ | ||
/* | ||
* Licensed to the Apache Software Foundation (ASF) under one or more | ||
* contributor license agreements. See the NOTICE file distributed with | ||
* this work for additional information regarding copyright ownership. | ||
* The ASF licenses this file to You under the Apache License, Version 2.0 | ||
* (the "License"); you may not use this file except in compliance with | ||
* the License. You may obtain a copy of the License at | ||
* | ||
* http://www.apache.org/licenses/LICENSE-2.0 | ||
* | ||
* Unless required by applicable law or agreed to in writing, software | ||
* distributed under the License is distributed on an "AS IS" BASIS, | ||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
* See the License for the specific language governing permissions and | ||
* limitations under the License. | ||
*/ | ||
|
||
package org.apache.spark.sql.catalyst.analysis | ||
|
||
import javax.annotation.Nullable | ||
|
||
import scala.annotation.tailrec | ||
|
||
import org.apache.spark.sql.catalyst.analysis.TypeCoercion.{hasStringType} | ||
import org.apache.spark.sql.catalyst.expressions.{ArrayJoin, BinaryExpression, CaseWhen, Cast, Coalesce, Collate, Concat, ConcatWs, CreateArray, Expression, Greatest, If, In, InSubquery, Least, Substring} | ||
import org.apache.spark.sql.errors.QueryCompilationErrors | ||
import org.apache.spark.sql.internal.SQLConf | ||
import org.apache.spark.sql.types.{AbstractDataType, ArrayType, DataType, StringType} | ||
|
||
object CollationTypeCasts extends TypeCoercionRule { | ||
override val transform: PartialFunction[Expression, Expression] = { | ||
case e if !e.childrenResolved => e | ||
case sc@(_: In | ||
| _: InSubquery | ||
| _: CreateArray | ||
| _: If | ||
| _: ArrayJoin | ||
| _: CaseWhen | ||
| _: Concat | ||
| _: Greatest | ||
| _: Least | ||
| _: Coalesce | ||
| _: BinaryExpression | ||
| _: ConcatWs | ||
| _: Substring) => | ||
val newChildren = collateToSingleType(sc.children) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This is the tricky part of adding a new rule. We must make sure we follow the behavior of existing implicit cast rules and only add implicit cast for certain children of an expression. For example, the true and false branches of Please carefully check all implicit cast rules and revisit this new rule. @mihailom-db There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This rule is only touching children that have StringType's as their DataType. I can reorder execution to first filter out StringType arguments and then work on them, Agreed for There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Let's not assume thar it's safe to only cast string-type inputs for any expression. Please follow the existing implicit cast rules and explicitly match expressions that need some of their inputs to be the same type. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This is the problem. If I understood correctly, we do not want to replicate the code from ImplicitTypeCasts. This rule is concerned with transforming datatypes into their expected DataTypes, but CollationTypeCasts is concerned with transforming StringTypes into their expected collated StringType. There is a difference, as collated StringType is calculated based on expression parameters that have StringTypes not on what someone expects the input types to be. As you mentioned for There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
I don't think this is possible without significant refactoring. Correctness is more important than duplicated code at this stage. |
||
sc.withNewChildren(newChildren) | ||
} | ||
/** | ||
* Extracts StringTypes from filtered hasStringType | ||
*/ | ||
@tailrec | ||
private def extractStringType(dt: DataType): StringType = dt match { | ||
case st: StringType => st | ||
case ArrayType(et, _) => extractStringType(et) | ||
} | ||
|
||
/** | ||
* Casts given expression to collated StringType with id equal to collationId only | ||
* if expression has StringType in the first place. | ||
* @param expr | ||
* @param collationId | ||
* @return | ||
*/ | ||
def castStringType(expr: Expression, st: StringType): Option[Expression] = | ||
castStringType(expr.dataType, st).map { dt => Cast(expr, dt)} | ||
|
||
private def castStringType(inType: AbstractDataType, castType: StringType): Option[DataType] = { | ||
@Nullable val ret: DataType = inType match { | ||
case st: StringType if st.collationId != castType.collationId => castType | ||
case ArrayType(arrType, nullable) => | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I disagree with special-case array type. The code looks broken. It assumes the children of the given expression can have both string type and array of string type, then tries to find a common collation between the string type child and the array element. This makes no sense without knowing the semantic of the given expression. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. A simple example is ConcatWs. It can have ArrayType(StringType, _) for input strings and StringType for separator as parameters. What collations do we want for this then? We need to cast the ArrayType into a proper collation if separator is explicit. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Then please match the ConcatWs expression explicitly to handle this case. What I disagree with is to do this blindly for all expressions. |
||
castStringType(arrType, castType).map(ArrayType(_, nullable)).orNull | ||
case _ => null | ||
} | ||
Option(ret) | ||
} | ||
|
||
/** | ||
* Collates input expressions to a single collation. | ||
*/ | ||
def collateToSingleType(exprs: Seq[Expression]): Seq[Expression] = { | ||
val st = getOutputCollation(exprs) | ||
|
||
exprs.map(e => castStringType(e, st).getOrElse(e)) | ||
} | ||
|
||
/** | ||
* Based on the data types of the input expressions this method determines | ||
* a collation type which the output will have. This function accepts Seq of | ||
* any expressions, but will only be affected by collated StringTypes or | ||
* complex DataTypes with collated StringTypes (e.g. ArrayType) | ||
*/ | ||
def getOutputCollation(expr: Seq[Expression]): StringType = { | ||
val explicitTypes = expr.filter(_.isInstanceOf[Collate]) | ||
.map(_.dataType.asInstanceOf[StringType].collationId) | ||
.distinct | ||
|
||
explicitTypes.size match { | ||
// We have 1 explicit collation | ||
case 1 => StringType(explicitTypes.head) | ||
// Multiple explicit collations occurred | ||
case size if size > 1 => | ||
throw QueryCompilationErrors | ||
.explicitCollationMismatchError( | ||
explicitTypes.map(t => StringType(t).typeName) | ||
) | ||
// Only implicit or default collations present | ||
case 0 => | ||
val implicitTypes = expr.map(_.dataType) | ||
.filter(hasStringType) | ||
.map(extractStringType) | ||
cloud-fan marked this conversation as resolved.
Show resolved
Hide resolved
|
||
.filter(dt => dt.collationId != SQLConf.get.defaultStringType.collationId) | ||
cloud-fan marked this conversation as resolved.
Show resolved
Hide resolved
|
||
.distinctBy(_.collationId) | ||
|
||
if (hasMultipleImplicits(implicitTypes)) { | ||
mihailom-db marked this conversation as resolved.
Show resolved
Hide resolved
|
||
throw QueryCompilationErrors.implicitCollationMismatchError() | ||
} | ||
else { | ||
implicitTypes.headOption.getOrElse(SQLConf.get.defaultStringType) | ||
} | ||
} | ||
} | ||
|
||
/** | ||
* This check is always preformed when we have no explicit collation. It returns true | ||
* if there are more than one implicit collations. Collations are distinguished by their | ||
* collationId. | ||
* @param dataTypes | ||
* @return | ||
*/ | ||
private def hasMultipleImplicits(dataTypes: Seq[StringType]): Boolean = | ||
dataTypes.map(_.collationId) | ||
.filter(dt => !(dt == SQLConf.get.defaultStringType.collationId)).distinct.size > 1 | ||
|
||
} |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
We should add more case matches as some expressions do not require all its inputs to be the same type