Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[SPARK-47296][SQL][COLLATION] Fail unsupported functions for non-binary collations #45422

Closed
wants to merge 29 commits into from
Closed
Show file tree
Hide file tree
Changes from 1 commit
Commits
Show all changes
29 commits
Select commit Hold shift + click to select a range
e0485fc
Fail all 10 regexp expressions for non-binary collations
uros-db Mar 7, 2024
355eee9
Reformat code
uros-db Mar 8, 2024
cde9eff
Refactoring and better test coverage
uros-db Mar 12, 2024
616335b
Lockdown using StringType
uros-db Mar 13, 2024
e6bec45
Merge branch 'apache:master' into regexp-functions
uros-db Mar 13, 2024
77e606c
Lockdown for regexpExpressions
uros-db Mar 13, 2024
ee72ed3
Renaming and small fixes
uros-db Mar 13, 2024
8bc10f5
Code style fixes
uros-db Mar 13, 2024
8ee0f58
Move StringType lockdown condition to case object
uros-db Mar 13, 2024
d3a9e70
Fix checkInputDataTypes
uros-db Mar 14, 2024
c11c458
Implicit cast for new StringTypes
uros-db Mar 14, 2024
4778951
Implicit cast for new StringTypes
uros-db Mar 14, 2024
1582f24
Separate suite for stringExpressions
uros-db Mar 14, 2024
8d75846
Collation test fix
uros-db Mar 14, 2024
63caef6
Reformat
uros-db Mar 15, 2024
38c3906
Remove extra changes
uros-db Mar 15, 2024
1d60cef
Rewrite collation mismatch check
uros-db Mar 15, 2024
2f5aba9
Improve lockdown handling
uros-db Mar 15, 2024
b9b185d
Add ANSI type coercion
uros-db Mar 15, 2024
3fed6cb
Better ANSI type coercion explanation
uros-db Mar 15, 2024
638c6d4
Fix type coercion
uros-db Mar 15, 2024
af7611f
Fix ANSI type coercion
uros-db Mar 17, 2024
ceabb4a
Merge branch 'apache:master' into regexp-functions
uros-db Mar 17, 2024
0f4a6b1
Fix failing tests
mihailom-db Mar 19, 2024
9b24fba
Incorporate requested changes
mihailom-db Mar 19, 2024
831197a
Merge branch 'apache:master' into regexp-functions
mihailom-db Mar 20, 2024
99b36b0
Refactor code and fix comments
mihailom-db Mar 20, 2024
9051923
Fix newlines
mihailom-db Mar 20, 2024
6986b8b
Add back newlines
mihailom-db Mar 20, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -65,6 +65,7 @@ public static class Collation {
* byte for byte equal. All accent or case-insensitive collations are considered non-binary.
*/
public final boolean isBinaryCollation;
public final boolean isLowercaseCollation;
uros-db marked this conversation as resolved.
Show resolved Hide resolved

public Collation(
String collationName,
Expand All @@ -79,6 +80,7 @@ public Collation(
this.version = version;
this.hashFunction = hashFunction;
this.isBinaryCollation = isBinaryCollation;
this.isLowercaseCollation = collationName.equals("UCS_BASIC_LCASE");

if (isBinaryCollation) {
this.equalsFunction = UTF8String::equals;
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,86 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package org.apache.spark.sql.catalyst.expressions

import org.apache.spark.SparkException
import org.apache.spark.sql.catalyst.analysis.TypeCheckResult
import org.apache.spark.sql.catalyst.analysis.TypeCheckResult.DataTypeMismatch
import org.apache.spark.sql.catalyst.util.CollationFactory
import org.apache.spark.sql.types.{DataType, StringType}

object CollationUtils {
uros-db marked this conversation as resolved.
Show resolved Hide resolved
def checkCollationCompatibility(
superCheck: => TypeCheckResult,
uros-db marked this conversation as resolved.
Show resolved Hide resolved
collationId: Int,
rightDataType: DataType
): TypeCheckResult = {
val checkResult = superCheck
if (checkResult.isFailure) return checkResult
// Additional check needed for collation compatibility
val rightCollationId: Int = rightDataType.asInstanceOf[StringType].collationId
if (collationId != rightCollationId) {
return DataTypeMismatch(
errorSubClass = "COLLATION_MISMATCH",
messageParameters = Map(
"collationNameLeft" -> CollationFactory.fetchCollation(collationId).collationName,
"collationNameRight" -> CollationFactory.fetchCollation(rightCollationId).collationName
)
)
}
TypeCheckResult.TypeCheckSuccess
}

final val SUPPORT_BINARY_ONLY: Int = 0
uros-db marked this conversation as resolved.
Show resolved Hide resolved
final val SUPPORT_LOWERCASE: Int = 1
final val SUPPORT_ALL_COLLATIONS: Int = 2

def checkCollationSupport(
uros-db marked this conversation as resolved.
Show resolved Hide resolved
superCheck: => TypeCheckResult,
collationId: Int,
functionName: String,
supportLevel: Int = SUPPORT_BINARY_ONLY
): TypeCheckResult = {
val checkResult = superCheck
if (checkResult.isFailure) return checkResult
// Additional check needed for collation support
val collation = CollationFactory.fetchCollation(collationId)
supportLevel match {
case SUPPORT_BINARY_ONLY =>
if (!collation.isBinaryCollation) {
throwUnsupportedCollation(functionName, collation.collationName)
}
case SUPPORT_LOWERCASE =>
if (!collation.isBinaryCollation && !collation.isLowercaseCollation) {
throwUnsupportedCollation(functionName, collation.collationName)
}
case SUPPORT_ALL_COLLATIONS => // No additional checks needed
case _ => throw new IllegalArgumentException("Invalid collation support level.")
}
TypeCheckResult.TypeCheckSuccess
}

private def throwUnsupportedCollation(functionName: String, collationName: String): Unit = {
throw new SparkException(
errorClass = "UNSUPPORTED_COLLATION.FOR_FUNCTION",
messageParameters = Map(
"functionName" -> functionName,
"collationName" -> collationName),
cause = null
)
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,13 @@ abstract class StringRegexExpression extends BinaryExpression

override def inputTypes: Seq[DataType] = Seq(StringType, StringType)

final lazy val collationId: Int = left.dataType.asInstanceOf[StringType].collationId

override def checkInputDataTypes(): TypeCheckResult = {
CollationUtils.checkCollationCompatibility(
super.checkInputDataTypes(), collationId, right.dataType)
}

// try cache foldable pattern
private lazy val cache: Pattern = right match {
case p: Expression if p.foldable =>
Expand Down Expand Up @@ -130,6 +137,11 @@ abstract class StringRegexExpression extends BinaryExpression
case class Like(left: Expression, right: Expression, escapeChar: Char)
extends StringRegexExpression {

override def checkInputDataTypes(): TypeCheckResult = {
CollationUtils.checkCollationSupport(
super.checkInputDataTypes(), collationId, "like", CollationUtils.SUPPORT_BINARY_ONLY)
}

def this(left: Expression, right: Expression) = this(left, right, '\\')

override def escape(v: String): String = StringUtils.escapeLikeRegex(v, escapeChar)
Expand Down Expand Up @@ -260,6 +272,15 @@ case class ILike(

override def inputTypes: Seq[AbstractDataType] = Seq(StringType, StringType)

final lazy val collationId: Int = left.dataType.asInstanceOf[StringType].collationId

override def checkInputDataTypes(): TypeCheckResult = {
CollationUtils.checkCollationCompatibility(
super.checkInputDataTypes(), collationId, right.dataType)
CollationUtils.checkCollationSupport(
super.checkInputDataTypes(), collationId, "ilike", CollationUtils.SUPPORT_BINARY_ONLY)
}

override protected def withNewChildrenInternal(
newLeft: Expression, newRight: Expression): Expression = {
copy(left = newLeft, right = newRight)
Expand Down Expand Up @@ -461,6 +482,11 @@ case class NotLikeAny(child: Expression, patterns: Seq[UTF8String]) extends Like
// scalastyle:on line.contains.tab line.size.limit
case class RLike(left: Expression, right: Expression) extends StringRegexExpression {

override def checkInputDataTypes(): TypeCheckResult = {
CollationUtils.checkCollationSupport(
super.checkInputDataTypes(), collationId, "rlike", CollationUtils.SUPPORT_BINARY_ONLY)
}

override def escape(v: String): String = v
override def matches(regex: Pattern, str: String): Boolean = regex.matcher(str).find(0)
override def toString: String = s"RLIKE($left, $right)"
Expand Down Expand Up @@ -545,6 +571,16 @@ case class StringSplit(str: Expression, regex: Expression, limit: Expression)

override def dataType: DataType = ArrayType(StringType, containsNull = false)
override def inputTypes: Seq[DataType] = Seq(StringType, StringType, IntegerType)

final lazy val collationId: Int = first.dataType.asInstanceOf[StringType].collationId

override def checkInputDataTypes(): TypeCheckResult = {
CollationUtils.checkCollationCompatibility(
super.checkInputDataTypes(), collationId, second.dataType)
CollationUtils.checkCollationSupport(
super.checkInputDataTypes(), collationId, prettyName, CollationUtils.SUPPORT_BINARY_ONLY)
}

override def first: Expression = str
override def second: Expression = regex
override def third: Expression = limit
Expand Down Expand Up @@ -614,6 +650,8 @@ case class RegExpReplace(subject: Expression, regexp: Expression, rep: Expressio
def this(subject: Expression, regexp: Expression, rep: Expression) =
this(subject, regexp, rep, Literal(1))

final lazy val collationId: Int = first.dataType.asInstanceOf[StringType].collationId

override def checkInputDataTypes(): TypeCheckResult = {
val defaultCheck = super.checkInputDataTypes()
if (defaultCheck.isFailure) {
Expand Down Expand Up @@ -643,6 +681,11 @@ case class RegExpReplace(subject: Expression, regexp: Expression, rep: Expressio
)
)
}

CollationUtils.checkCollationCompatibility(
super.checkInputDataTypes(), collationId, second.dataType)
CollationUtils.checkCollationSupport(
super.checkInputDataTypes(), collationId, prettyName, CollationUtils.SUPPORT_BINARY_ONLY)
}

// last regex in string, we will update the pattern iff regexp value changed.
Expand Down Expand Up @@ -772,6 +815,13 @@ abstract class RegExpExtractBase
final override val nodePatterns: Seq[TreePattern] = Seq(REGEXP_EXTRACT_FAMILY)

override def inputTypes: Seq[AbstractDataType] = Seq(StringType, StringType, IntegerType)

final lazy val collationId: Int = subject.dataType.asInstanceOf[StringType].collationId

override def checkInputDataTypes(): TypeCheckResult = {
CollationUtils.checkCollationCompatibility(
super.checkInputDataTypes(), collationId, regexp.dataType)
}
override def first: Expression = subject
override def second: Expression = regexp
override def third: Expression = idx
Expand Down Expand Up @@ -849,6 +899,12 @@ case class RegExpExtract(subject: Expression, regexp: Expression, idx: Expressio
}

override def dataType: DataType = StringType

override def checkInputDataTypes(): TypeCheckResult = {
CollationUtils.checkCollationSupport(
super.checkInputDataTypes(), collationId, prettyName, CollationUtils.SUPPORT_BINARY_ONLY)
}

override def prettyName: String = "regexp_extract"

override protected def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = {
Expand Down Expand Up @@ -948,6 +1004,11 @@ case class RegExpExtractAll(subject: Expression, regexp: Expression, idx: Expres
}

override def dataType: DataType = ArrayType(StringType)

override def checkInputDataTypes(): TypeCheckResult = {
CollationUtils.checkCollationSupport(
super.checkInputDataTypes(), collationId, prettyName, CollationUtils.SUPPORT_BINARY_ONLY)
}
override def prettyName: String = "regexp_extract_all"

override protected def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = {
Expand Down Expand Up @@ -1022,6 +1083,15 @@ case class RegExpCount(left: Expression, right: Expression)

override def inputTypes: Seq[AbstractDataType] = Seq(StringType, StringType)

final lazy val collationId: Int = left.dataType.asInstanceOf[StringType].collationId

override def checkInputDataTypes(): TypeCheckResult = {
CollationUtils.checkCollationCompatibility(
super.checkInputDataTypes(), collationId, right.dataType)
CollationUtils.checkCollationSupport(
super.checkInputDataTypes(), collationId, prettyName, CollationUtils.SUPPORT_BINARY_ONLY)
}

override protected def withNewChildrenInternal(
newChildren: IndexedSeq[Expression]): RegExpCount =
copy(left = newChildren(0), right = newChildren(1))
Expand Down Expand Up @@ -1061,6 +1131,15 @@ case class RegExpSubStr(left: Expression, right: Expression)

override def inputTypes: Seq[AbstractDataType] = Seq(StringType, StringType)

final lazy val collationId: Int = left.dataType.asInstanceOf[StringType].collationId

override def checkInputDataTypes(): TypeCheckResult = {
CollationUtils.checkCollationCompatibility(
super.checkInputDataTypes(), collationId, right.dataType)
CollationUtils.checkCollationSupport(
super.checkInputDataTypes(), collationId, prettyName, CollationUtils.SUPPORT_BINARY_ONLY)
}

override protected def withNewChildrenInternal(
newChildren: IndexedSeq[Expression]): RegExpSubStr =
copy(left = newChildren(0), right = newChildren(1))
Expand Down Expand Up @@ -1113,6 +1192,11 @@ case class RegExpInStr(subject: Expression, regexp: Expression, idx: Expression)
}

override def dataType: DataType = IntegerType

override def checkInputDataTypes(): TypeCheckResult = {
CollationUtils.checkCollationSupport(
super.checkInputDataTypes(), collationId, prettyName, CollationUtils.SUPPORT_BINARY_ONLY)
}
override def prettyName: String = "regexp_instr"

override protected def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = {
Expand Down
Loading